From 74b2107543da4ed9607ec484f63c42362dc9fca6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Apr 2011 12:02:53 -0400 Subject: Btrfs: make sure to use the delalloc reserve when filling delalloc In the prealloc filling code and compressed code we don't set trans->block_rsv to the delalloc block reserve properly, which is going to make us use metadata from the wrong pool, this patch fixes that. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7cd8ab0ef04d..3b9f1643aa57 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -619,6 +619,7 @@ retry: trans = btrfs_join_transaction(root, 1); BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, @@ -1060,6 +1061,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, trans = btrfs_join_transaction(root, 1); } BUG_ON(IS_ERR(trans)); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; cow_start = (u64)-1; cur_offset = start; -- cgit v1.2.3 From 7a7eaa40a39bde4eefc91aadeb1ce3dc4e6a1252 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Apr 2011 12:54:33 -0400 Subject: Btrfs: take away the num_items argument from btrfs_join_transaction I keep forgetting that btrfs_join_transaction() just ignores the num_items argument, which leads me to sending pointless patches and looking stupid :). So just kill the num_items argument from btrfs_join_transaction and btrfs_start_ioctl_transaction, since neither of them use it. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 6 +++--- fs/btrfs/extent-tree.c | 12 ++++++------ fs/btrfs/inode.c | 34 +++++++++++++++++----------------- fs/btrfs/ioctl.c | 4 ++-- fs/btrfs/relocation.c | 12 ++++++------ fs/btrfs/transaction.c | 13 +++++-------- fs/btrfs/transaction.h | 9 +++------ 7 files changed, 42 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 228cf36ece83..9d6c9e332ca3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1568,7 +1568,7 @@ static int transaction_kthread(void *arg) transid = cur->transid; spin_unlock(&root->fs_info->new_trans_lock); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); if (transid == trans->transid) { ret = btrfs_commit_transaction(trans, root); @@ -2495,13 +2495,13 @@ int btrfs_commit_super(struct btrfs_root *root) down_write(&root->fs_info->cleanup_work_sem); up_write(&root->fs_info->cleanup_work_sem); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); /* run commit again to drop the original snapshot */ - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); btrfs_commit_transaction(trans, root); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9ee6bd55e16c..941b28e78931 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3174,7 +3174,7 @@ again: spin_unlock(&data_sinfo->lock); alloc: alloc_target = btrfs_get_alloc_profile(root, 1); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -3202,7 +3202,7 @@ alloc: commit_trans: if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); @@ -3589,7 +3589,7 @@ again: goto out; ret = -ENOSPC; - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) goto out; ret = btrfs_commit_transaction(trans, root); @@ -3816,7 +3816,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, if (trans) return -EAGAIN; - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); ret = btrfs_commit_transaction(trans, root); return 0; @@ -7649,7 +7649,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) BUG_ON(reloc_root->commit_root != NULL); while (1) { - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); mutex_lock(&root->fs_info->drop_mutex); @@ -8176,7 +8176,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, BUG_ON(cache->ro); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); alloc_flags = update_block_group_flags(root, cache->flags); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3b9f1643aa57..e47bdf0fb75a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -420,7 +420,7 @@ again: } } if (start == 0) { - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -617,7 +617,7 @@ retry: async_extent->start + async_extent->ram_size - 1, GFP_NOFS); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_reserve_extent(trans, root, @@ -779,7 +779,7 @@ static noinline int cow_file_range(struct inode *inode, int ret = 0; BUG_ON(root == root->fs_info->tree_root); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -1056,9 +1056,9 @@ static noinline int run_delalloc_nocow(struct inode *inode, BUG_ON(!path); if (root == root->fs_info->tree_root) { nolock = true; - trans = btrfs_join_transaction_nolock(root, 1); + trans = btrfs_join_transaction_nolock(root); } else { - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); } BUG_ON(IS_ERR(trans)); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -1718,9 +1718,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret) { if (nolock) - trans = btrfs_join_transaction_nolock(root, 1); + trans = btrfs_join_transaction_nolock(root); else - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -1735,9 +1735,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 0, &cached_state, GFP_NOFS); if (nolock) - trans = btrfs_join_transaction_nolock(root, 1); + trans = btrfs_join_transaction_nolock(root); else - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -2415,7 +2415,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) (u64)-1); if (root->orphan_block_rsv || root->orphan_item_inserted) { - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) btrfs_end_transaction(trans, root); } @@ -4378,9 +4378,9 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL) { if (nolock) - trans = btrfs_join_transaction_nolock(root, 1); + trans = btrfs_join_transaction_nolock(root); else - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); btrfs_set_trans_block_group(trans, inode); @@ -4407,7 +4407,7 @@ void btrfs_dirty_inode(struct inode *inode) if (BTRFS_I(inode)->dummy_inode) return; - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); @@ -5226,7 +5226,7 @@ again: free_extent_map(em); em = NULL; btrfs_release_path(root, path); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return ERR_CAST(trans); goto again; @@ -5470,7 +5470,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, btrfs_drop_extent_cache(inode, start, start + len - 1, 0); } - trans = btrfs_join_transaction(root, 0); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return ERR_CAST(trans); @@ -5703,7 +5703,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, * to make sure the current transaction stays open * while we look for nocow cross refs */ - trans = btrfs_join_transaction(root, 0); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) goto must_cow; @@ -5841,7 +5841,7 @@ again: BUG_ON(!ordered); - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { err = -ENOMEM; goto out; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2616f7ed4799..908c3d4b48c6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -242,7 +242,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } - trans = btrfs_join_transaction(root, 1); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); ret = btrfs_update_inode(trans, root, inode); @@ -2182,7 +2182,7 @@ static long btrfs_ioctl_trans_start(struct file *file) mutex_unlock(&root->fs_info->trans_mutex); ret = -ENOMEM; - trans = btrfs_start_ioctl_transaction(root, 0); + trans = btrfs_start_ioctl_transaction(root); if (IS_ERR(trans)) goto out_drop; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 199a80134312..8bb256667f2d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2149,7 +2149,7 @@ again: err = ret; } - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { if (!err) btrfs_block_rsv_release(rc->extent_root, @@ -3233,7 +3233,7 @@ truncate: goto out; } - trans = btrfs_join_transaction(root, 0); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_free_path(path); ret = PTR_ERR(trans); @@ -3642,7 +3642,7 @@ int prepare_to_relocate(struct reloc_control *rc) rc->create_reloc_tree = 1; set_reloc_control(rc); - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); BUG_ON(IS_ERR(trans)); btrfs_commit_transaction(trans, rc->extent_root); return 0; @@ -3831,7 +3831,7 @@ restart: btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); /* get rid of pinned extents */ - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) err = PTR_ERR(trans); else @@ -4156,7 +4156,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) set_reloc_control(rc); - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { unset_reloc_control(rc); err = PTR_ERR(trans); @@ -4190,7 +4190,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) unset_reloc_control(rc); - trans = btrfs_join_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) err = PTR_ERR(trans); else diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c571734d5e5a..70bfb26df967 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -257,22 +257,19 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, { return start_transaction(root, num_items, TRANS_START); } -struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, - int num_blocks) +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN); } -struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root, - int num_blocks) +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN_NOLOCK); } -struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, - int num_blocks) +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) { - return start_transaction(r, 0, TRANS_USERSPACE); + return start_transaction(root, 0, TRANS_USERSPACE); } /* wait for a transaction commit to be fully complete */ @@ -1171,7 +1168,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, INIT_DELAYED_WORK(&ac->work, do_async_commit); ac->root = root; - ac->newtrans = btrfs_join_transaction(root, 0); + ac->newtrans = btrfs_join_transaction(root); if (IS_ERR(ac->newtrans)) { int err = PTR_ERR(ac->newtrans); kfree(ac); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index e441acc6c584..1f573f09dba2 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -92,12 +92,9 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items); -struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, - int num_blocks); -struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root, - int num_blocks); -struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, - int num_blocks); +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); -- cgit v1.2.3 From 2a1eb4614d984d5cd4c928784e9afcf5c07f93be Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 13 Apr 2011 15:15:59 -0400 Subject: Btrfs: if we've already started a trans handle, use that one We currently track trans handles in current->journal_info, but we don't actually use it. This patch fixes it. This will cover the case where we have multiple people starting transactions down the call chain. This keeps us from having to allocate a new handle and all of that, we just increase the use count of the current handle, save the old block_rsv, and return. I tested this with xfstests and it worked out fine. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 17 +++++++++++++++++ fs/btrfs/transaction.h | 2 ++ 2 files changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 70bfb26df967..46f40564c168 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -184,6 +184,15 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) return ERR_PTR(-EROFS); + + if (current->journal_info) { + WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK); + h = current->journal_info; + h->use_count++; + h->orig_rsv = h->block_rsv; + h->block_rsv = NULL; + goto got_it; + } again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) @@ -213,7 +222,9 @@ again: h->block_group = 0; h->bytes_reserved = 0; h->delayed_ref_updates = 0; + h->use_count = 1; h->block_rsv = NULL; + h->orig_rsv = NULL; smp_mb(); if (cur_trans->blocked && may_wait_transaction(root, type)) { @@ -241,6 +252,7 @@ again: } } +got_it: if (type != TRANS_JOIN_NOLOCK) mutex_lock(&root->fs_info->trans_mutex); record_root_in_trans(h, root); @@ -428,6 +440,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_fs_info *info = root->fs_info; int count = 0; + if (--trans->use_count) { + trans->block_rsv = trans->orig_rsv; + return 0; + } + while (count < 4) { unsigned long cur = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 1f573f09dba2..154314f80f8d 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -47,11 +47,13 @@ struct btrfs_trans_handle { u64 transid; u64 block_group; u64 bytes_reserved; + unsigned long use_count; unsigned long blocks_reserved; unsigned long blocks_used; unsigned long delayed_ref_updates; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *orig_rsv; }; struct btrfs_pending_snapshot { -- cgit v1.2.3 From a4abeea41adfa3c143c289045f4625dfaeba2212 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 11 Apr 2011 17:25:13 -0400 Subject: Btrfs: kill trans_mutex We use trans_mutex for lots of things, here's a basic list 1) To serialize trans_handles joining the currently running transaction 2) To make sure that no new trans handles are started while we are committing 3) To protect the dead_roots list and the transaction lists Really the serializing trans_handles joining is not too hard, and can really get bogged down in acquiring a reference to the transaction. So replace the trans_mutex with a trans_lock spinlock and use it to do the following 1) Protect fs_info->running_transaction. All trans handles have to do is check this, and then take a reference of the transaction and keep on going. 2) Protect the fs_info->trans_list. This doesn't get used too much, basically it just holds the current transactions, which will usually just be the currently committing transaction and the currently running transaction at most. 3) Protect the dead roots list. This is only ever processed by splicing the list so this is relatively simple. 4) Protect the fs_info->reloc_ctl stuff. This is very lightweight and was using the trans_mutex before, so this is a pretty straightforward change. 5) Protect fs_info->no_trans_join. Because we don't hold the trans_lock over the entirety of the commit we need to have a way to block new people from creating a new transaction while we're doing our work. So we set no_trans_join and in join_transaction we test to see if that is set, and if it is we do a wait_on_commit. 6) Make the transaction use count atomic so we don't need to take locks to modify it when we're dropping references. 7) Add a commit_lock to the transaction to make sure multiple people trying to commit the same transaction don't race and commit at the same time. 8) Make open_ioctl_trans an atomic so we don't have to take any locks for ioctl trans. I have tested this with xfstests, but obviously it is a pretty hairy change so lots of testing is greatly appreciated. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 6 +- fs/btrfs/disk-io.c | 30 +++--- fs/btrfs/extent-tree.c | 3 +- fs/btrfs/file.c | 4 +- fs/btrfs/ioctl.c | 12 +-- fs/btrfs/relocation.c | 16 +-- fs/btrfs/transaction.c | 271 ++++++++++++++++++++++++++----------------------- fs/btrfs/transaction.h | 4 +- 8 files changed, 177 insertions(+), 169 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8f4b81de3ae2..522a39b0033d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -919,7 +919,6 @@ struct btrfs_fs_info { * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - u64 open_ioctl_trans; unsigned long mount_opt:20; unsigned long compress_type:4; u64 max_inline; @@ -936,7 +935,6 @@ struct btrfs_fs_info { struct super_block *sb; struct inode *btree_inode; struct backing_dev_info bdi; - struct mutex trans_mutex; struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; @@ -957,6 +955,7 @@ struct btrfs_fs_info { struct rw_semaphore subvol_sem; struct srcu_struct subvol_srcu; + spinlock_t trans_lock; struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; @@ -969,6 +968,7 @@ struct btrfs_fs_info { atomic_t async_submit_draining; atomic_t nr_async_bios; atomic_t async_delalloc_pages; + atomic_t open_ioctl_trans; /* * this is used by the balancing code to wait for all the pending @@ -1032,6 +1032,7 @@ struct btrfs_fs_info { int closing; int log_root_recovering; int enospc_unlink; + int trans_no_join; u64 total_pinned; @@ -1053,7 +1054,6 @@ struct btrfs_fs_info { struct reloc_control *reloc_ctl; spinlock_t delalloc_lock; - spinlock_t new_trans_lock; u64 delalloc_bytes; /* data_alloc_cluster is only used in ssd mode */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9d6c9e332ca3..93ef254ec432 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1551,22 +1551,22 @@ static int transaction_kthread(void *arg) vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); - spin_lock(&root->fs_info->new_trans_lock); + spin_lock(&root->fs_info->trans_lock); cur = root->fs_info->running_transaction; if (!cur) { - spin_unlock(&root->fs_info->new_trans_lock); + spin_unlock(&root->fs_info->trans_lock); goto sleep; } now = get_seconds(); if (!cur->blocked && (now < cur->start_time || now - cur->start_time < 30)) { - spin_unlock(&root->fs_info->new_trans_lock); + spin_unlock(&root->fs_info->trans_lock); delay = HZ * 5; goto sleep; } transid = cur->transid; - spin_unlock(&root->fs_info->new_trans_lock); + spin_unlock(&root->fs_info->trans_lock); trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); @@ -1658,7 +1658,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->ordered_operations); INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_lock); - spin_lock_init(&fs_info->new_trans_lock); + spin_lock_init(&fs_info->trans_lock); spin_lock_init(&fs_info->ref_cache_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); @@ -1687,6 +1687,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->sb = sb; fs_info->max_inline = 8192 * 1024; fs_info->metadata_ratio = 0; + fs_info->trans_no_join = 0; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1735,7 +1736,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->do_barriers = 1; - mutex_init(&fs_info->trans_mutex); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->chunk_mutex); @@ -3006,10 +3006,13 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) WARN_ON(1); - mutex_lock(&root->fs_info->trans_mutex); mutex_lock(&root->fs_info->transaction_kthread_mutex); + spin_lock(&root->fs_info->trans_lock); list_splice_init(&root->fs_info->trans_list, &list); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + while (!list_empty(&list)) { t = list_entry(list.next, struct btrfs_transaction, list); if (!t) @@ -3034,23 +3037,18 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) t->blocked = 0; if (waitqueue_active(&root->fs_info->transaction_wait)) wake_up(&root->fs_info->transaction_wait); - mutex_unlock(&root->fs_info->trans_mutex); - mutex_lock(&root->fs_info->trans_mutex); t->commit_done = 1; if (waitqueue_active(&t->commit_wait)) wake_up(&t->commit_wait); - mutex_unlock(&root->fs_info->trans_mutex); - - mutex_lock(&root->fs_info->trans_mutex); btrfs_destroy_pending_snapshots(t); btrfs_destroy_delalloc_inodes(root); - spin_lock(&root->fs_info->new_trans_lock); + spin_lock(&root->fs_info->trans_lock); root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->new_trans_lock); + spin_unlock(&root->fs_info->trans_lock); btrfs_destroy_marked_extents(root, &t->dirty_pages, EXTENT_DIRTY); @@ -3064,8 +3062,10 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) kmem_cache_free(btrfs_transaction_cachep, t); } + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 0; + spin_unlock(&root->fs_info->trans_lock); mutex_unlock(&root->fs_info->transaction_kthread_mutex); - mutex_unlock(&root->fs_info->trans_mutex); return 0; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 941b28e78931..ca599654ce19 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3200,7 +3200,8 @@ alloc: /* commit the current transaction and try again */ commit_trans: - if (!committed && !root->fs_info->open_ioctl_trans) { + if (!committed && + !atomic_read(&root->fs_info->open_ioctl_trans)) { committed = 1; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 75899a01dded..cd5e82e500cf 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1222,14 +1222,12 @@ int btrfs_sync_file(struct file *file, int datasync) * the current transaction, we can bail out now without any * syncing */ - mutex_lock(&root->fs_info->trans_mutex); + smp_mb(); if (BTRFS_I(inode)->last_trans <= root->fs_info->last_trans_committed) { BTRFS_I(inode)->last_trans = 0; - mutex_unlock(&root->fs_info->trans_mutex); goto out; } - mutex_unlock(&root->fs_info->trans_mutex); /* * ok we haven't committed the transaction yet, lets do a commit diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 908c3d4b48c6..a578620e06a8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2177,9 +2177,7 @@ static long btrfs_ioctl_trans_start(struct file *file) if (ret) goto out; - mutex_lock(&root->fs_info->trans_mutex); - root->fs_info->open_ioctl_trans++; - mutex_unlock(&root->fs_info->trans_mutex); + atomic_inc(&root->fs_info->open_ioctl_trans); ret = -ENOMEM; trans = btrfs_start_ioctl_transaction(root); @@ -2190,9 +2188,7 @@ static long btrfs_ioctl_trans_start(struct file *file) return 0; out_drop: - mutex_lock(&root->fs_info->trans_mutex); - root->fs_info->open_ioctl_trans--; - mutex_unlock(&root->fs_info->trans_mutex); + atomic_dec(&root->fs_info->open_ioctl_trans); mnt_drop_write(file->f_path.mnt); out: return ret; @@ -2426,9 +2422,7 @@ long btrfs_ioctl_trans_end(struct file *file) btrfs_end_transaction(trans, root); - mutex_lock(&root->fs_info->trans_mutex); - root->fs_info->open_ioctl_trans--; - mutex_unlock(&root->fs_info->trans_mutex); + atomic_dec(&root->fs_info->open_ioctl_trans); mnt_drop_write(file->f_path.mnt); return 0; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8bb256667f2d..09c30d37d43e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2136,10 +2136,10 @@ int prepare_to_merge(struct reloc_control *rc, int err) u64 num_bytes = 0; int ret; - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&root->fs_info->trans_lock); rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; rc->merging_rsv_size += rc->nodes_relocated * 2; - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->trans_lock); again: if (!err) { num_bytes = rc->merging_rsv_size; @@ -2208,9 +2208,9 @@ int merge_reloc_roots(struct reloc_control *rc) int ret; again: root = rc->extent_root; - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&root->fs_info->trans_lock); list_splice_init(&rc->reloc_roots, &reloc_roots); - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->trans_lock); while (!list_empty(&reloc_roots)) { found = 1; @@ -3583,17 +3583,17 @@ next: static void set_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - mutex_lock(&fs_info->trans_mutex); + spin_lock(&fs_info->trans_lock); fs_info->reloc_ctl = rc; - mutex_unlock(&fs_info->trans_mutex); + spin_unlock(&fs_info->trans_lock); } static void unset_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - mutex_lock(&fs_info->trans_mutex); + spin_lock(&fs_info->trans_lock); fs_info->reloc_ctl = NULL; - mutex_unlock(&fs_info->trans_mutex); + spin_unlock(&fs_info->trans_lock); } static int check_extent_flags(u64 flags) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 46f40564c168..43816f8b23e7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -34,6 +34,7 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) { WARN_ON(atomic_read(&transaction->use_count) == 0); if (atomic_dec_and_test(&transaction->use_count)) { + BUG_ON(!list_empty(&transaction->list)); memset(transaction, 0, sizeof(*transaction)); kmem_cache_free(btrfs_transaction_cachep, transaction); } @@ -48,47 +49,73 @@ static noinline void switch_commit_root(struct btrfs_root *root) /* * either allocate a new transaction or hop into the existing one */ -static noinline int join_transaction(struct btrfs_root *root) +static noinline int join_transaction(struct btrfs_root *root, int nofail) { struct btrfs_transaction *cur_trans; + + spin_lock(&root->fs_info->trans_lock); + if (root->fs_info->trans_no_join) { + if (!nofail) { + spin_unlock(&root->fs_info->trans_lock); + return -EBUSY; + } + } + cur_trans = root->fs_info->running_transaction; - if (!cur_trans) { - cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, - GFP_NOFS); - if (!cur_trans) - return -ENOMEM; - root->fs_info->generation++; - atomic_set(&cur_trans->num_writers, 1); - cur_trans->num_joined = 0; - cur_trans->transid = root->fs_info->generation; - init_waitqueue_head(&cur_trans->writer_wait); - init_waitqueue_head(&cur_trans->commit_wait); - cur_trans->in_commit = 0; - cur_trans->blocked = 0; - atomic_set(&cur_trans->use_count, 1); - cur_trans->commit_done = 0; - cur_trans->start_time = get_seconds(); - - cur_trans->delayed_refs.root = RB_ROOT; - cur_trans->delayed_refs.num_entries = 0; - cur_trans->delayed_refs.num_heads_ready = 0; - cur_trans->delayed_refs.num_heads = 0; - cur_trans->delayed_refs.flushing = 0; - cur_trans->delayed_refs.run_delayed_start = 0; - spin_lock_init(&cur_trans->delayed_refs.lock); - - INIT_LIST_HEAD(&cur_trans->pending_snapshots); - list_add_tail(&cur_trans->list, &root->fs_info->trans_list); - extent_io_tree_init(&cur_trans->dirty_pages, - root->fs_info->btree_inode->i_mapping, - GFP_NOFS); - spin_lock(&root->fs_info->new_trans_lock); - root->fs_info->running_transaction = cur_trans; - spin_unlock(&root->fs_info->new_trans_lock); - } else { + if (cur_trans) { + atomic_inc(&cur_trans->use_count); + atomic_inc(&cur_trans->num_writers); + cur_trans->num_joined++; + spin_unlock(&root->fs_info->trans_lock); + return 0; + } + spin_unlock(&root->fs_info->trans_lock); + + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); + if (!cur_trans) + return -ENOMEM; + spin_lock(&root->fs_info->trans_lock); + if (root->fs_info->running_transaction) { + kmem_cache_free(btrfs_transaction_cachep, cur_trans); + cur_trans = root->fs_info->running_transaction; + atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; + spin_unlock(&root->fs_info->trans_lock); + return 0; } + atomic_set(&cur_trans->num_writers, 1); + cur_trans->num_joined = 0; + init_waitqueue_head(&cur_trans->writer_wait); + init_waitqueue_head(&cur_trans->commit_wait); + cur_trans->in_commit = 0; + cur_trans->blocked = 0; + /* + * One for this trans handle, one so it will live on until we + * commit the transaction. + */ + atomic_set(&cur_trans->use_count, 2); + cur_trans->commit_done = 0; + cur_trans->start_time = get_seconds(); + + cur_trans->delayed_refs.root = RB_ROOT; + cur_trans->delayed_refs.num_entries = 0; + cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.num_heads = 0; + cur_trans->delayed_refs.flushing = 0; + cur_trans->delayed_refs.run_delayed_start = 0; + spin_lock_init(&cur_trans->commit_lock); + spin_lock_init(&cur_trans->delayed_refs.lock); + + INIT_LIST_HEAD(&cur_trans->pending_snapshots); + list_add_tail(&cur_trans->list, &root->fs_info->trans_list); + extent_io_tree_init(&cur_trans->dirty_pages, + root->fs_info->btree_inode->i_mapping, + GFP_NOFS); + root->fs_info->generation++; + cur_trans->transid = root->fs_info->generation; + root->fs_info->running_transaction = cur_trans; + spin_unlock(&root->fs_info->trans_lock); return 0; } @@ -99,39 +126,28 @@ static noinline int join_transaction(struct btrfs_root *root) * to make sure the old root from before we joined the transaction is deleted * when the transaction commits */ -static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { if (root->ref_cows && root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); + spin_lock(&root->fs_info->fs_roots_radix_lock); + if (root->last_trans == trans->transid) { + spin_unlock(&root->fs_info->fs_roots_radix_lock); + return 0; + } + root->last_trans = trans->transid; radix_tree_tag_set(&root->fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); - root->last_trans = trans->transid; + spin_unlock(&root->fs_info->fs_roots_radix_lock); btrfs_init_reloc_root(trans, root); } return 0; } -int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) -{ - if (!root->ref_cows) - return 0; - - mutex_lock(&root->fs_info->trans_mutex); - if (root->last_trans == trans->transid) { - mutex_unlock(&root->fs_info->trans_mutex); - return 0; - } - - record_root_in_trans(trans, root); - mutex_unlock(&root->fs_info->trans_mutex); - return 0; -} - /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. @@ -140,21 +156,23 @@ static void wait_current_trans(struct btrfs_root *root) { struct btrfs_transaction *cur_trans; + spin_lock(&root->fs_info->trans_lock); cur_trans = root->fs_info->running_transaction; if (cur_trans && cur_trans->blocked) { DEFINE_WAIT(wait); atomic_inc(&cur_trans->use_count); + spin_unlock(&root->fs_info->trans_lock); while (1) { prepare_to_wait(&root->fs_info->transaction_wait, &wait, TASK_UNINTERRUPTIBLE); if (!cur_trans->blocked) break; - mutex_unlock(&root->fs_info->trans_mutex); schedule(); - mutex_lock(&root->fs_info->trans_mutex); } finish_wait(&root->fs_info->transaction_wait, &wait); put_transaction(cur_trans); + } else { + spin_unlock(&root->fs_info->trans_lock); } } @@ -167,10 +185,16 @@ enum btrfs_trans_type { static int may_wait_transaction(struct btrfs_root *root, int type) { - if (!root->fs_info->log_root_recovering && - ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || - type == TRANS_USERSPACE)) + if (root->fs_info->log_root_recovering) + return 0; + + if (type == TRANS_USERSPACE) + return 1; + + if (type == TRANS_START && + !atomic_read(&root->fs_info->open_ioctl_trans)) return 1; + return 0; } @@ -198,23 +222,21 @@ again: if (!h) return ERR_PTR(-ENOMEM); - if (type != TRANS_JOIN_NOLOCK) - mutex_lock(&root->fs_info->trans_mutex); if (may_wait_transaction(root, type)) wait_current_trans(root); - ret = join_transaction(root); + do { + ret = join_transaction(root, type == TRANS_JOIN_NOLOCK); + if (ret == -EBUSY) + wait_current_trans(root); + } while (ret == -EBUSY); + if (ret < 0) { kmem_cache_free(btrfs_trans_handle_cachep, h); - if (type != TRANS_JOIN_NOLOCK) - mutex_unlock(&root->fs_info->trans_mutex); return ERR_PTR(ret); } cur_trans = root->fs_info->running_transaction; - atomic_inc(&cur_trans->use_count); - if (type != TRANS_JOIN_NOLOCK) - mutex_unlock(&root->fs_info->trans_mutex); h->transid = cur_trans->transid; h->transaction = cur_trans; @@ -253,11 +275,7 @@ again: } got_it: - if (type != TRANS_JOIN_NOLOCK) - mutex_lock(&root->fs_info->trans_mutex); - record_root_in_trans(h, root); - if (type != TRANS_JOIN_NOLOCK) - mutex_unlock(&root->fs_info->trans_mutex); + btrfs_record_root_in_trans(h, root); if (!current->journal_info && type != TRANS_USERSPACE) current->journal_info = h; @@ -289,17 +307,13 @@ static noinline int wait_for_commit(struct btrfs_root *root, struct btrfs_transaction *commit) { DEFINE_WAIT(wait); - mutex_lock(&root->fs_info->trans_mutex); while (!commit->commit_done) { prepare_to_wait(&commit->commit_wait, &wait, TASK_UNINTERRUPTIBLE); if (commit->commit_done) break; - mutex_unlock(&root->fs_info->trans_mutex); schedule(); - mutex_lock(&root->fs_info->trans_mutex); } - mutex_unlock(&root->fs_info->trans_mutex); finish_wait(&commit->commit_wait, &wait); return 0; } @@ -309,50 +323,49 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) struct btrfs_transaction *cur_trans = NULL, *t; int ret; - mutex_lock(&root->fs_info->trans_mutex); - ret = 0; if (transid) { if (transid <= root->fs_info->last_trans_committed) - goto out_unlock; + goto out; /* find specified transaction */ + spin_lock(&root->fs_info->trans_lock); list_for_each_entry(t, &root->fs_info->trans_list, list) { if (t->transid == transid) { cur_trans = t; + atomic_inc(&cur_trans->use_count); break; } if (t->transid > transid) break; } + spin_unlock(&root->fs_info->trans_lock); ret = -EINVAL; if (!cur_trans) - goto out_unlock; /* bad transid */ + goto out; /* bad transid */ } else { /* find newest transaction that is committing | committed */ + spin_lock(&root->fs_info->trans_lock); list_for_each_entry_reverse(t, &root->fs_info->trans_list, list) { if (t->in_commit) { if (t->commit_done) - goto out_unlock; + goto out; cur_trans = t; + atomic_inc(&cur_trans->use_count); break; } } + spin_unlock(&root->fs_info->trans_lock); if (!cur_trans) - goto out_unlock; /* nothing committing|committed */ + goto out; /* nothing committing|committed */ } - atomic_inc(&cur_trans->use_count); - mutex_unlock(&root->fs_info->trans_mutex); - wait_for_commit(root, cur_trans); - mutex_lock(&root->fs_info->trans_mutex); put_transaction(cur_trans); ret = 0; -out_unlock: - mutex_unlock(&root->fs_info->trans_mutex); +out: return ret; } @@ -401,10 +414,8 @@ harder: void btrfs_throttle(struct btrfs_root *root) { - mutex_lock(&root->fs_info->trans_mutex); - if (!root->fs_info->open_ioctl_trans) + if (!atomic_read(&root->fs_info->open_ioctl_trans)) wait_current_trans(root); - mutex_unlock(&root->fs_info->trans_mutex); } static int should_end_transaction(struct btrfs_trans_handle *trans, @@ -422,6 +433,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans = trans->transaction; int updates; + smp_mb(); if (cur_trans->blocked || cur_trans->delayed_refs.flushing) return 1; @@ -467,9 +479,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); - if (lock && !root->fs_info->open_ioctl_trans && - should_end_transaction(trans, root)) + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && + should_end_transaction(trans, root)) { trans->transaction->blocked = 1; + smp_wmb(); + } if (lock && cur_trans->blocked && !cur_trans->in_commit) { if (throttle) @@ -739,9 +753,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, */ int btrfs_add_dead_root(struct btrfs_root *root) { - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&root->fs_info->trans_lock); list_add(&root->root_list, &root->fs_info->dead_roots); - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->trans_lock); return 0; } @@ -757,6 +771,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, int ret; int err = 0; + spin_lock(&fs_info->fs_roots_radix_lock); while (1) { ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, (void **)gang, 0, @@ -769,6 +784,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); btrfs_update_reloc_root(trans, root); @@ -783,10 +799,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, err = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); + spin_lock(&fs_info->fs_roots_radix_lock); if (err) break; } } + spin_unlock(&fs_info->fs_roots_radix_lock); return err; } @@ -972,7 +990,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent = dget_parent(dentry); parent_inode = parent->d_inode; parent_root = BTRFS_I(parent_inode)->root; - record_root_in_trans(trans, parent_root); + btrfs_record_root_in_trans(trans, parent_root); /* * insert the directory item @@ -990,7 +1008,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); - record_root_in_trans(trans, root); + btrfs_record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); @@ -1080,20 +1098,20 @@ static void update_super_roots(struct btrfs_root *root) int btrfs_transaction_in_commit(struct btrfs_fs_info *info) { int ret = 0; - spin_lock(&info->new_trans_lock); + spin_lock(&info->trans_lock); if (info->running_transaction) ret = info->running_transaction->in_commit; - spin_unlock(&info->new_trans_lock); + spin_unlock(&info->trans_lock); return ret; } int btrfs_transaction_blocked(struct btrfs_fs_info *info) { int ret = 0; - spin_lock(&info->new_trans_lock); + spin_lock(&info->trans_lock); if (info->running_transaction) ret = info->running_transaction->blocked; - spin_unlock(&info->new_trans_lock); + spin_unlock(&info->trans_lock); return ret; } @@ -1117,9 +1135,7 @@ static void wait_current_trans_commit_start(struct btrfs_root *root, &wait); break; } - mutex_unlock(&root->fs_info->trans_mutex); schedule(); - mutex_lock(&root->fs_info->trans_mutex); finish_wait(&root->fs_info->transaction_blocked_wait, &wait); } } @@ -1145,9 +1161,7 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, &wait); break; } - mutex_unlock(&root->fs_info->trans_mutex); schedule(); - mutex_lock(&root->fs_info->trans_mutex); finish_wait(&root->fs_info->transaction_wait, &wait); } @@ -1193,22 +1207,18 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, } /* take transaction reference */ - mutex_lock(&root->fs_info->trans_mutex); cur_trans = trans->transaction; atomic_inc(&cur_trans->use_count); - mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); schedule_delayed_work(&ac->work, 0); /* wait for transaction to start and unblock */ - mutex_lock(&root->fs_info->trans_mutex); if (wait_for_unblock) wait_current_trans_commit_start_and_unblock(root, cur_trans); else wait_current_trans_commit_start(root, cur_trans); put_transaction(cur_trans); - mutex_unlock(&root->fs_info->trans_mutex); return 0; } @@ -1252,38 +1262,41 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&cur_trans->commit_lock); if (cur_trans->in_commit) { + spin_unlock(&cur_trans->commit_lock); atomic_inc(&cur_trans->use_count); - mutex_unlock(&root->fs_info->trans_mutex); btrfs_end_transaction(trans, root); ret = wait_for_commit(root, cur_trans); BUG_ON(ret); - mutex_lock(&root->fs_info->trans_mutex); put_transaction(cur_trans); - mutex_unlock(&root->fs_info->trans_mutex); return 0; } trans->transaction->in_commit = 1; trans->transaction->blocked = 1; + spin_unlock(&cur_trans->commit_lock); wake_up(&root->fs_info->transaction_blocked_wait); + spin_lock(&root->fs_info->trans_lock); if (cur_trans->list.prev != &root->fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); if (!prev_trans->commit_done) { atomic_inc(&prev_trans->use_count); - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->trans_lock); wait_for_commit(root, prev_trans); - mutex_lock(&root->fs_info->trans_mutex); put_transaction(prev_trans); + } else { + spin_unlock(&root->fs_info->trans_lock); } + } else { + spin_unlock(&root->fs_info->trans_lock); } if (now < cur_trans->start_time || now - cur_trans->start_time < 1) @@ -1291,12 +1304,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, do { int snap_pending = 0; + joined = cur_trans->num_joined; if (!list_empty(&trans->transaction->pending_snapshots)) snap_pending = 1; WARN_ON(cur_trans != trans->transaction); - mutex_unlock(&root->fs_info->trans_mutex); if (flush_on_commit || snap_pending) { btrfs_start_delalloc_inodes(root, 1); @@ -1316,14 +1329,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, prepare_to_wait(&cur_trans->writer_wait, &wait, TASK_UNINTERRUPTIBLE); - smp_mb(); if (atomic_read(&cur_trans->num_writers) > 1) schedule_timeout(MAX_SCHEDULE_TIMEOUT); else if (should_grow) schedule_timeout(1); - mutex_lock(&root->fs_info->trans_mutex); finish_wait(&cur_trans->writer_wait, &wait); + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); @@ -1364,9 +1378,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_prepare_extent_commit(trans, root); cur_trans = root->fs_info->running_transaction; - spin_lock(&root->fs_info->new_trans_lock); - root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->new_trans_lock); btrfs_set_root_node(&root->fs_info->tree_root->root_item, root->fs_info->tree_root->node); @@ -1387,10 +1398,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, sizeof(root->fs_info->super_copy)); trans->transaction->blocked = 0; + spin_lock(&root->fs_info->trans_lock); + root->fs_info->running_transaction = NULL; + root->fs_info->trans_no_join = 0; + spin_unlock(&root->fs_info->trans_lock); wake_up(&root->fs_info->transaction_wait); - mutex_unlock(&root->fs_info->trans_mutex); ret = btrfs_write_and_wait_transaction(trans, root); BUG_ON(ret); write_ctree_super(trans, root, 0); @@ -1403,22 +1417,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); - mutex_lock(&root->fs_info->trans_mutex); - cur_trans->commit_done = 1; root->fs_info->last_trans_committed = cur_trans->transid; wake_up(&cur_trans->commit_wait); + spin_lock(&root->fs_info->trans_lock); list_del_init(&cur_trans->list); + spin_unlock(&root->fs_info->trans_lock); + put_transaction(cur_trans); put_transaction(cur_trans); trace_btrfs_transaction_commit(root); - mutex_unlock(&root->fs_info->trans_mutex); - if (current->journal_info == trans) current->journal_info = NULL; @@ -1438,9 +1451,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) LIST_HEAD(list); struct btrfs_fs_info *fs_info = root->fs_info; - mutex_lock(&fs_info->trans_mutex); + spin_lock(&fs_info->trans_lock); list_splice_init(&fs_info->dead_roots, &list); - mutex_unlock(&fs_info->trans_mutex); + spin_unlock(&fs_info->trans_lock); while (!list_empty(&list)) { root = list_entry(list.next, struct btrfs_root, root_list); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 154314f80f8d..11c6efcd4ed2 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -28,10 +28,12 @@ struct btrfs_transaction { * transaction can end */ atomic_t num_writers; + atomic_t use_count; unsigned long num_joined; + + spinlock_t commit_lock; int in_commit; - atomic_t use_count; int commit_done; int blocked; struct list_head list; -- cgit v1.2.3 From fcb80c2affd63237cff5b34cba5756be7c976a5a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 3 May 2011 10:40:22 -0400 Subject: Btrfs: fix how we do space reservation for truncate The ceph guys keep running into problems where we have space reserved in our orphan block rsv when freeing it up. This is because they tend to do snapshots alot, so their truncates tend to use a bunch of space, so when we go to do things like update the inode we have to steal reservation space in order to make the reservation happen. This happens because truncate can use as much space as it freaking feels like, but we still have to hold space for removing the orphan item and updating the inode, which will definitely always happen. So in order to fix this we need to split all of the reservation stuf up. So with this patch we have 1) The orphan block reserve which only holds the space for deleting our orphan item when everything is over. 2) The truncate block reserve which gets allocated and used specifically for the space that the truncate will use on a per truncate basis. 3) The transaction will always have 1 item's worth of data reserved so we can update the inode normally. Hopefully this will make the ceph problem go away. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 ++ fs/btrfs/extent-tree.c | 46 +++++++++++++++----- fs/btrfs/inode.c | 111 +++++++++++++++++++++++++++++++++++++------------ 3 files changed, 123 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 522a39b0033d..f31aed7fedd9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2224,6 +2224,9 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, void btrfs_block_rsv_release(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes); +int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *rsv); int btrfs_set_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); int btrfs_set_block_group_rw(struct btrfs_root *root, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ca599654ce19..a2ca561c70f0 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3980,6 +3980,37 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) 3 * num_items; } +int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *rsv) +{ + struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv; + u64 num_bytes; + int ret; + + /* + * Truncate should be freeing data, but give us 2 items just in case it + * needs to use some space. We may want to be smarter about this in the + * future. + */ + num_bytes = calc_trans_metadata_size(root, 2); + + /* We already have enough bytes, just return */ + if (rsv->reserved >= num_bytes) + return 0; + + num_bytes -= rsv->reserved; + + /* + * You should have reserved enough space before hand to do this, so this + * should not fail. + */ + ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes); + BUG_ON(ret); + + return 0; +} + int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, int num_items) @@ -4020,23 +4051,18 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; /* - * one for deleting orphan item, one for updating inode and - * two for calling btrfs_truncate_inode_items. - * - * btrfs_truncate_inode_items is a delete operation, it frees - * more space than it uses in most cases. So two units of - * metadata space should be enough for calling it many times. - * If all of the metadata space is used, we can commit - * transaction and use space it freed. + * We need to hold space in order to delete our orphan item once we've + * added it, so this takes the reservation so we can release it later + * when we are truly done with the orphan item. */ - u64 num_bytes = calc_trans_metadata_size(root, 4); + u64 num_bytes = calc_trans_metadata_size(root, 1); return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); } void btrfs_orphan_release_metadata(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 num_bytes = calc_trans_metadata_size(root, 4); + u64 num_bytes = calc_trans_metadata_size(root, 1); btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e47bdf0fb75a..bc12ba23db5f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6591,6 +6591,7 @@ out: static int btrfs_truncate(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *rsv; int ret; int err = 0; struct btrfs_trans_handle *trans; @@ -6604,28 +6605,83 @@ static int btrfs_truncate(struct inode *inode) btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); + /* + * Yes ladies and gentelment, this is indeed ugly. The fact is we have + * 3 things going on here + * + * 1) We need to reserve space for our orphan item and the space to + * delete our orphan item. Lord knows we don't want to have a dangling + * orphan item because we didn't reserve space to remove it. + * + * 2) We need to reserve space to update our inode. + * + * 3) We need to have something to cache all the space that is going to + * be free'd up by the truncate operation, but also have some slack + * space reserved in case it uses space during the truncate (thank you + * very much snapshotting). + * + * And we need these to all be seperate. The fact is we can use alot of + * space doing the truncate, and we have no earthly idea how much space + * we will use, so we need the truncate reservation to be seperate so it + * doesn't end up using space reserved for updating the inode or + * removing the orphan item. We also need to be able to stop the + * transaction and start a new one, which means we need to be able to + * update the inode several times, and we have no idea of knowing how + * many times that will be, so we can't just reserve 1 item for the + * entirety of the opration, so that has to be done seperately as well. + * Then there is the orphan item, which does indeed need to be held on + * to for the whole operation, and we need nobody to touch this reserved + * space except the orphan code. + * + * So that leaves us with + * + * 1) root->orphan_block_rsv - for the orphan deletion. + * 2) rsv - for the truncate reservation, which we will steal from the + * transaction reservation. + * 3) fs_info->trans_block_rsv - this will have 1 items worth left for + * updating the inode. + */ + rsv = btrfs_alloc_block_rsv(root); + if (!rsv) + return -ENOMEM; + btrfs_add_durable_block_rsv(root->fs_info, rsv); + + trans = btrfs_start_transaction(root, 4); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } btrfs_set_trans_block_group(trans, inode); + /* + * Reserve space for the truncate process. Truncate should be adding + * space, but if there are snapshots it may end up using space. + */ + ret = btrfs_truncate_reserve_metadata(trans, root, rsv); + BUG_ON(ret); + ret = btrfs_orphan_add(trans, inode); if (ret) { btrfs_end_transaction(trans, root); - return ret; + goto out; } nr = trans->blocks_used; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); - /* Now start a transaction for the truncate */ - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); + /* + * Ok so we've already migrated our bytes over for the truncate, so here + * just reserve the one slot we need for updating the inode. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } btrfs_set_trans_block_group(trans, inode); - trans->block_rsv = root->orphan_block_rsv; + trans->block_rsv = rsv; /* * setattr is responsible for setting the ordered_data_close flag, @@ -6649,24 +6705,18 @@ static int btrfs_truncate(struct inode *inode) while (1) { if (!trans) { - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) - return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, inode); - trans->block_rsv = root->orphan_block_rsv; - } + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } - ret = btrfs_block_rsv_check(trans, root, - root->orphan_block_rsv, 0, 5); - if (ret == -EAGAIN) { - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - trans = NULL; - continue; - } else if (ret) { - err = ret; - break; + ret = btrfs_truncate_reserve_metadata(trans, root, + rsv); + BUG_ON(ret); + + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = rsv; } ret = btrfs_truncate_inode_items(trans, root, inode, @@ -6677,6 +6727,7 @@ static int btrfs_truncate(struct inode *inode) break; } + trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); if (ret) { err = ret; @@ -6690,6 +6741,7 @@ static int btrfs_truncate(struct inode *inode) } if (ret == 0 && inode->i_nlink > 0) { + trans->block_rsv = root->orphan_block_rsv; ret = btrfs_orphan_del(trans, inode); if (ret) err = ret; @@ -6701,15 +6753,20 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_orphan_del(NULL, inode); } + trans->block_rsv = &root->fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); if (ret && !err) err = ret; nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); + +out: + btrfs_free_block_rsv(root, rsv); + if (ret && !err) err = ret; - btrfs_btree_balance_dirty(root, nr); return err; } -- cgit v1.2.3 From af60bed24eb0e3b6d93eaa6bb395a5721e6c09a8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 4 May 2011 11:11:17 -0400 Subject: Btrfs: set range_start to the right start in count_range_bits In count_range_bits we are adjusting total_bytes based on the range we are searching for, but we don't adjust the range start according to the range we are searching for, which makes for weird results. For example, if the range [0-8192] is set DELALLOC, but I search for 4096-8192, I will get back 4096 for the number of bytes found, but the range_start will be 0, which makes it look like the range is [0-4096]. So instead set range_start = max(cur_start, state->start). This makes everything come out right. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ba41da59e31b..b5f6f227a97c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1480,7 +1480,7 @@ u64 count_range_bits(struct extent_io_tree *tree, if (total_bytes >= max_bytes) break; if (!found) { - *start = state->start; + *start = max(cur_start, state->start); found = 1; } last = state->end; -- cgit v1.2.3 From cb25c2ea6a79702ab7895b873c6c43e0d3bc3c72 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 May 2011 12:17:34 -0400 Subject: Btrfs: map the node block when looking for readahead targets If we have particularly full nodes, we could call btrfs_node_blockptr up to 32 times, which is 32 pairs of kmap/kunmap, which _sucks_. So go ahead and map the extent buffer while we look for readahead targets. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 84d7ca1fe0ba..009bcf7f1e4b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1229,6 +1229,7 @@ static void reada_for_search(struct btrfs_root *root, u64 search; u64 target; u64 nread = 0; + u64 gen; int direction = path->reada; struct extent_buffer *eb; u32 nr; @@ -1256,6 +1257,15 @@ static void reada_for_search(struct btrfs_root *root, nritems = btrfs_header_nritems(node); nr = slot; while (1) { + if (!node->map_token) { + unsigned long offset = btrfs_node_key_ptr_offset(nr); + map_private_extent_buffer(node, offset, + sizeof(struct btrfs_key_ptr), + &node->map_token, + &node->kaddr, + &node->map_start, + &node->map_len, KM_USER1); + } if (direction < 0) { if (nr == 0) break; @@ -1273,14 +1283,23 @@ static void reada_for_search(struct btrfs_root *root, search = btrfs_node_blockptr(node, nr); if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { - readahead_tree_block(root, search, blocksize, - btrfs_node_ptr_generation(node, nr)); + gen = btrfs_node_ptr_generation(node, nr); + if (node->map_token) { + unmap_extent_buffer(node, node->map_token, + KM_USER1); + node->map_token = NULL; + } + readahead_tree_block(root, search, blocksize, gen); nread += blocksize; } nscan++; if ((nread > 65536 || nscan > 32)) break; } + if (node->map_token) { + unmap_extent_buffer(node, node->map_token, KM_USER1); + node->map_token = NULL; + } } /* -- cgit v1.2.3 From 7e2355ba1a11649f0b212a29fdb9f47476f1248e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 May 2011 12:25:37 -0400 Subject: Btrfs: don't look at the extent buffer level 3 times in a row We have a bit of debugging in btrfs_search_slot to make sure the level of the cow block is the same as the original block we were cow'ing. I don't think I've ever seen this tripped, so kill it. This saves us 2 kmap's per level in our search. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 009bcf7f1e4b..f7a0a64b868f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1672,9 +1672,6 @@ again: } cow_done: BUG_ON(!cow && ins_len); - if (level != btrfs_header_level(b)) - WARN_ON(1); - level = btrfs_header_level(b); p->nodes[level] = b; if (!p->skip_locking) -- cgit v1.2.3 From d82a6f1d7e8b61ed5996334d0db66651bb43641d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 May 2011 15:26:06 -0400 Subject: Btrfs: kill BTRFS_I(inode)->block_group Originally this was going to be used as a way to give hints to the allocator, but frankly we can get much better hints elsewhere and it's not even used at all for anything usefull. In addition to be completely useless, when we initialize an inode we try and find a freeish block group to set as the inodes block group, and with a completely full 40gb fs this takes _forever_, so I imagine with say 1tb fs this is just unbearable. So just axe the thing altoghether, we don't need it and it saves us 8 bytes in the inode and saves us 500 microseconds per inode lookup in my testcase. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 3 -- fs/btrfs/ctree.h | 3 +- fs/btrfs/extent-tree.c | 10 ++---- fs/btrfs/inode.c | 87 ++++++-------------------------------------------- fs/btrfs/ioctl.c | 3 +- fs/btrfs/transaction.c | 1 - fs/btrfs/transaction.h | 14 -------- fs/btrfs/xattr.c | 2 -- 8 files changed, 13 insertions(+), 110 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 57c3bb2884ce..4bc852d3b83d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -120,9 +120,6 @@ struct btrfs_inode { */ u64 index_cnt; - /* the start of block group preferred for allocations. */ - u64 block_group; - /* the fsync log has some corner cases that mean we have to check * directories to see if any unlinks have been done before * the directory was logged. See tree-log.c for all the diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f31aed7fedd9..0f8c489bcc02 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2512,8 +2512,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - u64 new_dirid, u64 alloc_hint); + struct btrfs_root *new_root, u64 new_dirid); int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a2ca561c70f0..9f0a4e3bd8a9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5319,6 +5319,7 @@ checks: btrfs_add_free_space(block_group, offset, search_start - offset); BUG_ON(offset > search_start); + btrfs_put_block_group(block_group); break; loop: failed_cluster_refill = false; @@ -5411,14 +5412,7 @@ loop: ret = -ENOSPC; } else if (!ins->objectid) { ret = -ENOSPC; - } - - /* we found what we needed */ - if (ins->objectid) { - if (!(data & BTRFS_BLOCK_GROUP_DATA)) - trans->block_group = block_group->key.objectid; - - btrfs_put_block_group(block_group); + } else if (ins->objectid) { ret = 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bc12ba23db5f..dd5938a7de21 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -136,7 +136,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; - btrfs_set_trans_block_group(trans, inode); key.objectid = inode->i_ino; key.offset = start; @@ -422,7 +421,6 @@ again: if (start == 0) { trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; /* lets try to make an inline extent */ @@ -781,7 +779,6 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(root == root->fs_info->tree_root); trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; num_bytes = (end - start + blocksize) & ~(blocksize - 1); @@ -1502,8 +1499,6 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, { struct btrfs_ordered_sum *sum; - btrfs_set_trans_block_group(trans, inode); - list_for_each_entry(sum, list, list) { btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); @@ -1722,7 +1717,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) else trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); @@ -1739,7 +1733,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) else trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) @@ -2495,7 +2488,6 @@ static void btrfs_read_locked_inode(struct inode *inode) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; int maybe_acls; - u64 alloc_group_block; u32 rdev; int ret; @@ -2539,8 +2531,6 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->index_cnt = (u64)-1; BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); - alloc_group_block = btrfs_inode_block_group(leaf, inode_item); - /* * try to precache a NULL acl entry for files that don't have * any xattrs or acls @@ -2549,8 +2539,6 @@ static void btrfs_read_locked_inode(struct inode *inode) if (!maybe_acls) cache_no_acl(inode); - BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, - alloc_group_block, 0); btrfs_free_path(path); inode_item = NULL; @@ -2630,7 +2618,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_inode_transid(leaf, item, trans->transid); btrfs_set_inode_rdev(leaf, item, inode->i_rdev); btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); - btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); + btrfs_set_inode_block_group(leaf, item, 0); if (leaf->map_token) { unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); @@ -2971,8 +2959,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, @@ -3068,8 +3054,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { err = btrfs_unlink_subvol(trans, root, dir, BTRFS_I(inode)->location.objectid, @@ -3649,7 +3633,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) err = PTR_ERR(trans); break; } - btrfs_set_trans_block_group(trans, inode); err = btrfs_drop_extents(trans, inode, cur_offset, cur_offset + hole_size, @@ -3785,7 +3768,6 @@ void btrfs_evict_inode(struct inode *inode) while (1) { trans = btrfs_start_transaction(root, 0); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = root->orphan_block_rsv; ret = btrfs_block_rsv_check(trans, root, @@ -4383,7 +4365,6 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, inode); if (nolock) ret = btrfs_end_transaction_nolock(trans, root); else @@ -4409,7 +4390,6 @@ void btrfs_dirty_inode(struct inode *inode) trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); - btrfs_set_trans_block_group(trans, inode); ret = btrfs_update_inode(trans, root, inode); if (ret && ret == -ENOSPC) { @@ -4424,7 +4404,6 @@ void btrfs_dirty_inode(struct inode *inode) } return; } - btrfs_set_trans_block_group(trans, inode); ret = btrfs_update_inode(trans, root, inode); if (ret) { @@ -4519,8 +4498,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, const char *name, int name_len, - u64 ref_objectid, u64 objectid, - u64 alloc_hint, int mode, u64 *index) + u64 ref_objectid, u64 objectid, int mode, + u64 *index) { struct inode *inode; struct btrfs_inode_item *inode_item; @@ -4567,8 +4546,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, owner = 0; else owner = 1; - BTRFS_I(inode)->block_group = - btrfs_find_block_group(root, 0, alloc_hint, owner); key[0].objectid = objectid; btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); @@ -4729,11 +4706,9 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dir->i_ino, objectid, - BTRFS_I(dir)->block_group, mode, &index); + mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_unlock; @@ -4745,7 +4720,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } - btrfs_set_trans_block_group(trans, inode); err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; @@ -4754,8 +4728,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); } - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); @@ -4791,11 +4763,9 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dir->i_ino, objectid, - BTRFS_I(dir)->block_group, mode, &index); + mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_unlock; @@ -4807,7 +4777,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock; } - btrfs_set_trans_block_group(trans, inode); err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; @@ -4818,8 +4787,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); @@ -4866,8 +4833,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_inc_nlink(inode); inode->i_ctime = CURRENT_TIME; - - btrfs_set_trans_block_group(trans, dir); ihold(inode); err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); @@ -4876,7 +4841,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; } else { struct dentry *parent = dget_parent(dentry); - btrfs_update_inode_block_group(trans, dir); err = btrfs_update_inode(trans, root, inode); BUG_ON(err); btrfs_log_new_name(trans, inode, NULL, parent); @@ -4917,12 +4881,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dir->i_ino, objectid, - BTRFS_I(dir)->block_group, S_IFDIR | mode, - &index); + S_IFDIR | mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_fail; @@ -4936,7 +4898,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - btrfs_set_trans_block_group(trans, inode); btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, root, inode); @@ -4950,8 +4911,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) d_instantiate(dentry, inode); drop_on_err = 0; - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); out_fail: nr = trans->blocks_used; @@ -6652,8 +6611,6 @@ static int btrfs_truncate(struct inode *inode) goto out; } - btrfs_set_trans_block_group(trans, inode); - /* * Reserve space for the truncate process. Truncate should be adding * space, but if there are snapshots it may end up using space. @@ -6680,7 +6637,6 @@ static int btrfs_truncate(struct inode *inode) err = PTR_ERR(trans); goto out; } - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = rsv; /* @@ -6715,7 +6671,6 @@ static int btrfs_truncate(struct inode *inode) rsv); BUG_ON(ret); - btrfs_set_trans_block_group(trans, inode); trans->block_rsv = rsv; } @@ -6775,15 +6730,14 @@ out: * create a new subvolume directory/inode (helper for the ioctl). */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - u64 new_dirid, u64 alloc_hint) + struct btrfs_root *new_root, u64 new_dirid) { struct inode *inode; int err; u64 index = 0; inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, - new_dirid, alloc_hint, S_IFDIR | 0700, &index); + new_dirid, S_IFDIR | 0700, &index); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = &btrfs_dir_inode_operations; @@ -6893,21 +6847,6 @@ void btrfs_destroy_inode(struct inode *inode) spin_unlock(&root->fs_info->ordered_extent_lock); } - if (root == root->fs_info->tree_root) { - struct btrfs_block_group_cache *block_group; - - block_group = btrfs_lookup_block_group(root->fs_info, - BTRFS_I(inode)->block_group); - if (block_group && block_group->inode == inode) { - spin_lock(&block_group->lock); - block_group->inode = NULL; - spin_unlock(&block_group->lock); - btrfs_put_block_group(block_group); - } else if (block_group) { - btrfs_put_block_group(block_group); - } - } - spin_lock(&root->orphan_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", @@ -7091,8 +7030,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_notrans; } - btrfs_set_trans_block_group(trans, new_dir); - if (dest != root) btrfs_record_root_in_trans(trans, dest); @@ -7331,12 +7268,9 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, dir); - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dir->i_ino, objectid, - BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, - &index); + S_IFLNK|S_IRWXUGO, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto out_unlock; @@ -7348,7 +7282,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock; } - btrfs_set_trans_block_group(trans, inode); err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; @@ -7359,8 +7292,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } - btrfs_update_inode_block_group(trans, inode); - btrfs_update_inode_block_group(trans, dir); if (drop_inode) goto out_unlock; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a578620e06a8..8e90ccf4b76a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -413,8 +413,7 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_record_root_in_trans(trans, new_root); - ret = btrfs_create_subvol_root(trans, new_root, new_dirid, - BTRFS_I(dir)->block_group); + ret = btrfs_create_subvol_root(trans, new_root, new_dirid); /* * insert the directory item */ diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 43816f8b23e7..f4ea695325b2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -241,7 +241,6 @@ again: h->transid = cur_trans->transid; h->transaction = cur_trans; h->blocks_used = 0; - h->block_group = 0; h->bytes_reserved = 0; h->delayed_ref_updates = 0; h->use_count = 1; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 11c6efcd4ed2..da7289e06a82 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -47,7 +47,6 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; - u64 block_group; u64 bytes_reserved; unsigned long use_count; unsigned long blocks_reserved; @@ -70,19 +69,6 @@ struct btrfs_pending_snapshot { struct list_head list; }; -static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, - struct inode *inode) -{ - trans->block_group = BTRFS_I(inode)->block_group; -} - -static inline void btrfs_update_inode_block_group( - struct btrfs_trans_handle *trans, - struct inode *inode) -{ - BTRFS_I(inode)->block_group = trans->block_group; -} - static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index cfd660550ded..72ab0295ca74 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -158,8 +158,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_set_trans_block_group(trans, inode); - ret = do_setxattr(trans, inode, name, value, size, flags); if (ret) goto out; -- cgit v1.2.3 From 589d8ade83f07c0f11c8191c0ca309f34d7a2c14 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 11 May 2011 17:30:53 -0400 Subject: Btrfs: try not to sleep as much when doing slow caching When the fs is super full and we unmount the fs, we could get stuck in this thing where unmount is waiting for the caching kthread to make progress and the caching kthread keeps scheduling because we're in the middle of a commit. So instead just let the caching kthread keep going and only yeild if need_resched(). This makes my horrible umount case go from taking up to 10 minutes to taking less than 20 seconds. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9f0a4e3bd8a9..96be62450318 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -378,15 +378,18 @@ again: if (ret) break; - caching_ctl->progress = last; - btrfs_release_path(extent_root, path); - up_read(&fs_info->extent_commit_sem); - mutex_unlock(&caching_ctl->mutex); - if (btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); - else + if (need_resched() || + btrfs_next_leaf(extent_root, path)) { + caching_ctl->progress = last; + btrfs_release_path(extent_root, path); + up_read(&fs_info->extent_commit_sem); + mutex_unlock(&caching_ctl->mutex); cond_resched(); - goto again; + goto again; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + continue; } if (key.objectid < block_group->key.objectid) { -- cgit v1.2.3 From 026fd317828500524cdc7e5ff9e8e7923abb2868 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 May 2011 10:32:11 -0400 Subject: Btrfs: don't always do readahead Our readahead is sort of sloppy, and really isn't always needed. For example if ls is doing a stating ls (which is the default) it's going to stat in non-disk order, so if say you have a directory with a stupid amount of files, readahead is going to do nothing but waste time in the case of doing the stat. Taking the unconditional readahead out made my test go from 57 minutes to 36 minutes. This means that everywhere we do loop through the tree we want to make sure we do set path->reada properly, so I went through and found all of the places where we loop through the path and set reada to 1. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.c | 2 -- fs/btrfs/extent-tree.c | 3 ++- fs/btrfs/inode.c | 14 ++++++++++++-- fs/btrfs/relocation.c | 6 ++++++ 4 files changed, 20 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f7a0a64b868f..f61c16c1481a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -48,8 +48,6 @@ struct btrfs_path *btrfs_alloc_path(void) { struct btrfs_path *path; path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); - if (path) - path->reada = 1; return path; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 96be62450318..1ba2cc58eab5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -347,7 +347,7 @@ static int caching_kthread(void *data) */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 2; + path->reada = 1; key.objectid = last; key.offset = 0; @@ -8556,6 +8556,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); if (cache_gen != 0 && diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dd5938a7de21..6228a304b547 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4242,7 +4242,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, filp->f_pos = 2; } path = btrfs_alloc_path(); - path->reada = 2; + if (!path) + return -ENOMEM; + path->reada = 1; btrfs_set_key_type(&key, key_type); key.offset = filp->f_pos; @@ -5043,7 +5045,15 @@ again: if (!path) { path = btrfs_alloc_path(); - BUG_ON(!path); + if (!path) { + err = -ENOMEM; + goto out; + } + /* + * Chances are we'll be called again, so go ahead and do + * readahead + */ + path->reada = 1; } ret = btrfs_lookup_file_extent(trans, root, path, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 09c30d37d43e..5872b41581f4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -676,6 +676,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, err = -ENOMEM; goto out; } + path1->reada = 1; + path2->reada = 2; node = alloc_backref_node(cache); if (!node) { @@ -1996,6 +1998,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -3297,6 +3300,7 @@ static int find_data_references(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; root = read_fs_root(rc->extent_root->fs_info, ref_root); if (IS_ERR(root)) { @@ -3665,6 +3669,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = 1; ret = prepare_to_relocate(rc); if (ret) { @@ -4090,6 +4095,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; + path->reada = -1; key.objectid = BTRFS_TREE_RELOC_OBJECTID; key.type = BTRFS_ROOT_ITEM_KEY; -- cgit v1.2.3 From cca1c81f43e26ab60c0d1090fb90992358d69bdf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 May 2011 11:07:12 -0400 Subject: Btrfs: don't try to allocate from a block group that doesn't have enough space If we have a very large filesystem, we can spend a lot of time in find_free_extent just trying to allocate from empty block groups. So instead check to see if the block group even has enough space for the allocation, and if not go on to the next block group. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1ba2cc58eab5..c8c318494dee 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5159,6 +5159,14 @@ have_block_group: if (unlikely(block_group->ro)) goto loop; + spin_lock(&block_group->tree_lock); + if (cached && + block_group->free_space < num_bytes + empty_size) { + spin_unlock(&block_group->tree_lock); + goto loop; + } + spin_unlock(&block_group->tree_lock); + /* * Ok we want to try and use the cluster allocator, so lets look * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will -- cgit v1.2.3 From 207dde8289d9b005b665cb9d8d2bb9464256101d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 13 May 2011 14:49:23 -0400 Subject: Btrfs: check for duplicate entries in the free space cache If there are duplicate entries in the free space cache, discard the entire cache and load it the old fashioned way. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 63731a1fb0a1..d634a7e42207 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -420,7 +420,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, spin_lock(&block_group->tree_lock); ret = link_free_space(block_group, e); spin_unlock(&block_group->tree_lock); - BUG_ON(ret); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } } else { e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); if (!e->bitmap) { @@ -437,6 +444,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, recalculate_thresholds(block_group); spin_unlock(&block_group->tree_lock); list_add_tail(&e->list, &bitmaps); + if (ret) { + printk(KERN_ERR "Duplicate entries in " + "free space cache, dumping\n"); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } } num_entries--; @@ -909,10 +924,16 @@ static int tree_insert_offset(struct rb_root *root, u64 offset, * logically. */ if (bitmap) { - WARN_ON(info->bitmap); + if (info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } p = &(*p)->rb_right; } else { - WARN_ON(!info->bitmap); + if (!info->bitmap) { + WARN_ON_ONCE(1); + return -EEXIST; + } p = &(*p)->rb_left; } } -- cgit v1.2.3 From d90c732122a1f6d0efe388a8a204f67f144b2eb3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 17 May 2011 09:50:54 -0400 Subject: Btrfs: leave spinning on lookup and map the leaf On lookup we only want to read the inode item, so leave the path spinning. Also we're just wholesale reading the leaf off, so map the leaf so we don't do a bunch of kmap/kunmaps. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6228a304b547..dc8fb2b3a145 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2493,6 +2493,7 @@ static void btrfs_read_locked_inode(struct inode *inode) path = btrfs_alloc_path(); BUG_ON(!path); + path->leave_spinning = 1; memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); @@ -2502,6 +2503,12 @@ static void btrfs_read_locked_inode(struct inode *inode) leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + if (!leaf->map_token) + map_private_extent_buffer(leaf, (unsigned long)inode_item, + sizeof(struct btrfs_inode_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); inode->i_mode = btrfs_inode_mode(leaf, inode_item); inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); @@ -2539,6 +2546,11 @@ static void btrfs_read_locked_inode(struct inode *inode) if (!maybe_acls) cache_no_acl(inode); + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + btrfs_free_path(path); inode_item = NULL; -- cgit v1.2.3 From f90e5b5b136ede1f0fd15999e95f13124d6b0dbd Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 24 May 2011 10:44:42 -0400 Subject: GFS2: Processes waiting on inode glock that no processes are holding This patch fixes a race in the GFS2 glock state machine that may result in lockups. The symptom is that all nodes but one will hang, waiting for a particular glock. All the holder records will have the "W" (Waiting) bit set. The other node will typically have the glock stuck in Exclusive mode (EX) with no holder records, but the dinode will be cached. In other words, an entry with "I:" will appear in the glock dump for that glock, but nothing else. The race has to do with the glock "Pending Demote" bit, which can be set, then immediately reset, thus losing the fact that another node needs the glock. The sequence of events is: 1. Something schedules the glock workqueue (e.g. glock request from fs) 2. The glock workqueue gets to the point between the test of the reply pending bit and the spin lock: if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { finish_xmote(gl, gl->gl_reply); drop_ref = 1; } down_read(&gfs2_umount_flush_sem); <---- i.e. here spin_lock(&gl->gl_spin); 3. In comes (a) the reply to our EX lock request setting GLF_REPLY_PENDING and (b) the demote request which sets GLF_PENDING_DEMOTE 4. The following test is executed: if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { This resets the pending demote flag, and gl->gl_demote_state is not equal to exclusive, however because the reply from the dlm arrived after we checked for the GLF_REPLY_PENDING flag, gl->gl_state is still equal to unlocked, so although we reset the GLF_PENDING_DEMOTE flag, we didn't then set the GLF_DEMOTE flag or reinstate the GLF_PENDING_DEMOTE_FLAG. The patch closes the timing window by only transitioning the "Pending demote" bit to the "demote" flag once we know the other conditions (not unlocked and not exclusive) are met. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a2a6abbccc07..7137750f17f0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -663,14 +663,19 @@ static void glock_work_func(struct work_struct *work) drop_ref = 1; } spin_lock(&gl->gl_spin); - if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { unsigned long holdtime, now = jiffies; + holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; if (time_before(now, holdtime)) delay = holdtime - now; - set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags); + + if (!delay) { + clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); + set_bit(GLF_DEMOTE, &gl->gl_flags); + } } run_queue(gl, 0); spin_unlock(&gl->gl_spin); -- cgit v1.2.3 From 1adffbae22332bb558c2a29de19d9aca391869f6 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 31 May 2011 19:38:07 +0900 Subject: fat: Fix corrupt inode flags when remove ATTR_SYS flag We are clearly missing '~' in fat_ioctl_set_attributes(). Cc: Reported-by: Dmitry Dmitriev Signed-off-by: OGAWA Hirofumi --- fs/fat/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fat/file.c b/fs/fat/file.c index 7257752b6d5d..7018e1d8902d 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -102,7 +102,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) if (attr & ATTR_SYS) inode->i_flags |= S_IMMUTABLE; else - inode->i_flags &= S_IMMUTABLE; + inode->i_flags &= ~S_IMMUTABLE; } fat_save_attrs(inode, attr); -- cgit v1.2.3 From fe47ae7f53e179d2ef6771024feb000cbb86640f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 31 May 2011 12:35:41 +0200 Subject: oprofile, dcookies: Fix possible circular locking dependency The lockdep warning below detects a possible A->B/B->A locking dependency of mm->mmap_sem and dcookie_mutex. The order in sync_buffer() is mm->mmap_sem/dcookie_mutex, while in sys_lookup_dcookie() it is vice versa. Fixing it in sys_lookup_dcookie() by unlocking dcookie_mutex before copy_to_user(). oprofiled/4432 is trying to acquire lock: (&mm->mmap_sem){++++++}, at: [] might_fault+0x53/0xa3 but task is already holding lock: (dcookie_mutex){+.+.+.}, at: [] sys_lookup_dcookie+0x45/0x149 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (dcookie_mutex){+.+.+.}: [] lock_acquire+0xf8/0x11e [] mutex_lock_nested+0x63/0x309 [] get_dcookie+0x30/0x144 [] sync_buffer+0x196/0x3ec [oprofile] [] task_exit_notify+0x16/0x1a [oprofile] [] notifier_call_chain+0x37/0x63 [] __blocking_notifier_call_chain+0x50/0x67 [] blocking_notifier_call_chain+0x14/0x16 [] profile_task_exit+0x1a/0x1c [] do_exit+0x2a/0x6fc [] do_group_exit+0x83/0xae [] sys_exit_group+0x17/0x1b [] system_call_fastpath+0x16/0x1b -> #0 (&mm->mmap_sem){++++++}: [] __lock_acquire+0x1085/0x1711 [] lock_acquire+0xf8/0x11e [] might_fault+0x80/0xa3 [] sys_lookup_dcookie+0x104/0x149 [] system_call_fastpath+0x16/0x1b other info that might help us debug this: 1 lock held by oprofiled/4432: #0: (dcookie_mutex){+.+.+.}, at: [] sys_lookup_dcookie+0x45/0x149 stack backtrace: Pid: 4432, comm: oprofiled Not tainted 2.6.39-00008-ge5a450d #9 Call Trace: [] print_circular_bug+0xae/0xbc [] __lock_acquire+0x1085/0x1711 [] ? get_parent_ip+0x11/0x42 [] ? might_fault+0x53/0xa3 [] lock_acquire+0xf8/0x11e [] ? might_fault+0x53/0xa3 [] ? path_put+0x22/0x27 [] might_fault+0x80/0xa3 [] ? might_fault+0x53/0xa3 [] sys_lookup_dcookie+0x104/0x149 [] system_call_fastpath+0x16/0x1b References: https://bugzilla.kernel.org/show_bug.cgi?id=13809 Cc: # .27+ Signed-off-by: Robert Richter --- fs/dcookies.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/dcookies.c b/fs/dcookies.c index a21cabdbd87b..dda0dc702d1b 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -178,6 +178,8 @@ SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len) /* FIXME: (deleted) ? */ path = d_path(&dcs->path, kbuf, PAGE_SIZE); + mutex_unlock(&dcookie_mutex); + if (IS_ERR(path)) { err = PTR_ERR(path); goto out_free; @@ -194,6 +196,7 @@ SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len) out_free: kfree(kbuf); + return err; out: mutex_unlock(&dcookie_mutex); return err; -- cgit v1.2.3 From c592a7073796de1cd221f957ab693c19d383423f Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Fri, 3 Jun 2011 12:06:19 +0530 Subject: cifs: fix the kernel release version in the default security warning message When ntlm security mechanim is used, the message that warns about the upgrade to ntlmv2 got the kernel release version wrong (Blame it on Linus :). Fix it. Signed-off-by: Suresh Jayaraman Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6d88b82537c3..84c730701c09 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1976,7 +1976,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) warned_on_ntlm = true; cERROR(1, "default security mechanism requested. The default " "security mechanism will be upgraded from ntlm to " - "ntlmv2 in kernel release 2.6.41"); + "ntlmv2 in kernel release 3.1"); } ses->overrideSecFlg = volume_info->secFlg; -- cgit v1.2.3 From 5f0b23eeba2d9105944148e5a85b0bfb34a8ecf5 Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Fri, 3 Jun 2011 14:19:01 +0530 Subject: cifs: make CIFS depend on CRYPTO_ECB When CONFIG_CRYPTO_ECB is not set, trying to mount a CIFS share with NTLM security resulted in mount failure with the following error: "CIFS VFS: could not allocate des crypto API" Seems like a leftover from commit 43988d7. Signed-off-by: Suresh Jayaraman CC: Shirish Pargaonkar Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 1cd4c3a1862d..66c68ab9e7df 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -7,6 +7,7 @@ config CIFS select CRYPTO_MD5 select CRYPTO_HMAC select CRYPTO_ARC4 + select CRYPTO_ECB select CRYPTO_DES help This is the client VFS module for the Common Internet File System -- cgit v1.2.3 From 9e1f1de02c2275d7172e18dc4e7c2065777611bf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 3 Jun 2011 18:24:58 -0400 Subject: more conservative S_NOSEC handling Caching "we have already removed suid/caps" was overenthusiastic as merged. On network filesystems we might have had suid/caps set on another client, silently picked by this client on revalidate, all of that *without* clearing the S_NOSEC flag. AFAICS, the only reasonably sane way to deal with that is * new superblock flag; unless set, S_NOSEC is not going to be set. * local block filesystems set it in their ->mount() (more accurately, mount_bdev() does, so does btrfs ->mount(), users of mount_bdev() other than local block ones clear it) * if any network filesystem (or a cluster one) wants to use S_NOSEC, it'll need to set MS_NOSEC in sb->s_flags *AND* take care to clear S_NOSEC when inode attribute changes are picked from other clients. It's not an earth-shattering hole (anybody that can set suid on another client will almost certainly be able to write to the file before doing that anyway), but it's a bug that needs fixing. Signed-off-by: Al Viro --- fs/btrfs/super.c | 2 +- fs/fuse/inode.c | 2 ++ fs/ocfs2/super.c | 2 +- fs/super.c | 2 +- include/linux/fs.h | 3 ++- mm/filemap.c | 2 +- 6 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9b2e7e5bc3ef..d158b672a2d2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -819,7 +819,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags; + s->s_flags = flags | MS_NOSEC; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cc6ec4b2f0ff..38f84cd48b67 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -921,6 +921,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_flags & MS_MANDLOCK) goto err; + sb->s_flags &= ~MS_NOSEC; + if (!parse_fuse_opt((char *) data, &d, is_bdev)) goto err; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index cdbaf5e97308..56f61027236b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1072,7 +1072,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = OCFS2_SUPER_MAGIC; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, diff --git a/fs/super.c b/fs/super.c index c75593953c52..ab3d672db0de 100644 --- a/fs/super.c +++ b/fs/super.c @@ -822,7 +822,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags; + s->s_flags = flags | MS_NOSEC; s->s_mode = mode; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(bdev)); diff --git a/include/linux/fs.h b/include/linux/fs.h index c55d6b7cd5d6..646a1836152a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -208,6 +208,7 @@ struct inodes_stat_t { #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_I_VERSION (1<<23) /* Update inode I_version field */ #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ +#define MS_NOSEC (1<<28) #define MS_BORN (1<<29) #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) @@ -2591,7 +2592,7 @@ static inline int is_sxid(mode_t mode) static inline void inode_has_no_xattr(struct inode *inode) { - if (!is_sxid(inode->i_mode)) + if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC)) inode->i_flags |= S_NOSEC; } diff --git a/mm/filemap.c b/mm/filemap.c index d7b10578a64b..a8251a8d3457 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2000,7 +2000,7 @@ int file_remove_suid(struct file *file) error = security_inode_killpriv(dentry); if (!error && killsuid) error = __remove_suid(dentry, killsuid); - if (!error) + if (!error && (inode->i_sb->s_flags & MS_NOSEC)) inode->i_flags |= S_NOSEC; return error; -- cgit v1.2.3 From 1bc8779349d6278e2713a1ff94418c2a6746a791 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Sat, 28 May 2011 21:57:55 +0200 Subject: btrfs: scrub: don't reuse bios and pages The current scrub implementation reuses bios and pages as often as possible, allocating them only on start and releasing them when finished. This leads to more problems with the block layer than it's worth. The elevator gets confused when there are more pages added to the bio than bi_size suggests. This patch completely rips out the reuse of bios and pages and allocates them freshly for each submit. Signed-off-by: Arne Jansen Signed-off-by: Chris Maosn --- fs/btrfs/scrub.c | 114 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 65 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6dfed0c27ac3..2d1f8909a8e1 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -117,33 +117,37 @@ static void scrub_free_csums(struct scrub_dev *sdev) } } +static void scrub_free_bio(struct bio *bio) +{ + int i; + struct page *last_page = NULL; + + if (!bio) + return; + + for (i = 0; i < bio->bi_vcnt; ++i) { + if (bio->bi_io_vec[i].bv_page == last_page) + continue; + last_page = bio->bi_io_vec[i].bv_page; + __free_page(last_page); + } + bio_put(bio); +} + static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) { int i; - int j; - struct page *last_page; if (!sdev) return; for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { struct scrub_bio *sbio = sdev->bios[i]; - struct bio *bio; if (!sbio) break; - bio = sbio->bio; - if (bio) { - last_page = NULL; - for (j = 0; j < bio->bi_vcnt; ++j) { - if (bio->bi_io_vec[j].bv_page == last_page) - continue; - last_page = bio->bi_io_vec[j].bv_page; - __free_page(last_page); - } - bio_put(bio); - } + scrub_free_bio(sbio->bio); kfree(sbio); } @@ -156,8 +160,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) { struct scrub_dev *sdev; int i; - int j; - int ret; struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; sdev = kzalloc(sizeof(*sdev), GFP_NOFS); @@ -165,7 +167,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) goto nomem; sdev->dev = dev; for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { - struct bio *bio; struct scrub_bio *sbio; sbio = kzalloc(sizeof(*sbio), GFP_NOFS); @@ -173,32 +174,10 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) goto nomem; sdev->bios[i] = sbio; - bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); - if (!bio) - goto nomem; - sbio->index = i; sbio->sdev = sdev; - sbio->bio = bio; sbio->count = 0; sbio->work.func = scrub_checksum; - bio->bi_private = sdev->bios[i]; - bio->bi_end_io = scrub_bio_end_io; - bio->bi_sector = 0; - bio->bi_bdev = dev->bdev; - bio->bi_size = 0; - - for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) { - struct page *page; - page = alloc_page(GFP_NOFS); - if (!page) - goto nomem; - - ret = bio_add_page(bio, page, PAGE_SIZE, 0); - if (!ret) - goto nomem; - } - WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO); if (i != SCRUB_BIOS_PER_DEV-1) sdev->bios[i]->next_free = i + 1; @@ -394,6 +373,7 @@ static void scrub_bio_end_io(struct bio *bio, int err) struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; sbio->err = err; + sbio->bio = bio; btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); } @@ -453,6 +433,8 @@ static void scrub_checksum(struct btrfs_work *work) } out: + scrub_free_bio(sbio->bio); + sbio->bio = NULL; spin_lock(&sdev->list_lock); sbio->next_free = sdev->first_free; sdev->first_free = sbio->index; @@ -583,25 +565,50 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) static int scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; + struct bio *bio; + int i; if (sdev->curr == -1) return 0; sbio = sdev->bios[sdev->curr]; - sbio->bio->bi_sector = sbio->physical >> 9; - sbio->bio->bi_size = sbio->count * PAGE_SIZE; - sbio->bio->bi_next = NULL; - sbio->bio->bi_flags |= 1 << BIO_UPTODATE; - sbio->bio->bi_comp_cpu = -1; - sbio->bio->bi_bdev = sdev->dev->bdev; + bio = bio_alloc(GFP_NOFS, sbio->count); + if (!bio) + goto nomem; + + bio->bi_private = sbio; + bio->bi_end_io = scrub_bio_end_io; + bio->bi_bdev = sdev->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + + for (i = 0; i < sbio->count; ++i) { + struct page *page; + int ret; + + page = alloc_page(GFP_NOFS); + if (!page) + goto nomem; + + ret = bio_add_page(bio, page, PAGE_SIZE, 0); + if (!ret) { + __free_page(page); + goto nomem; + } + } + sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(0, sbio->bio); + submit_bio(READ, bio); return 0; + +nomem: + scrub_free_bio(bio); + + return -ENOMEM; } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, @@ -633,7 +640,11 @@ again: sbio->logical = logical; } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || sbio->logical + sbio->count * PAGE_SIZE != logical) { - scrub_submit(sdev); + int ret; + + ret = scrub_submit(sdev); + if (ret) + return ret; goto again; } sbio->spag[sbio->count].flags = flags; @@ -645,8 +656,13 @@ again: memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); } ++sbio->count; - if (sbio->count == SCRUB_PAGES_PER_BIO || force) - scrub_submit(sdev); + if (sbio->count == SCRUB_PAGES_PER_BIO || force) { + int ret; + + ret = scrub_submit(sdev); + if (ret) + return ret; + } return 0; } -- cgit v1.2.3 From 17aca1c987cff89dc4279371857035da902c8854 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 3 Jun 2011 01:13:45 -0400 Subject: Btrfs: fix uninit variable in the delayed inode code The nitems counter needs to start at zero Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index b46d94d1dea8..c61c32cf0f71 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -678,6 +678,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&head); next = item; + nitems = 0; /* * count the number of the continuous items that we can insert in batch -- cgit v1.2.3 From 211f96c24f117fcc6e9e2431e40d92f4de22625e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 3 Jun 2011 01:26:53 -0400 Subject: Btrfs: make sure we don't overflow the free space cache crc page The free space cache uses only one page for crcs right now, which means we can't have a cache file bigger than the crcs we can fit in the first page. This adds a check to enforce that restriction. Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index dd38d4c3a599..1cb72394498c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -590,10 +590,25 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + /* Since the first page has all of our checksums and our generation we + * need to calculate the offset into the page that we can start writing + * our entries. + */ + first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); + filemap_write_and_wait(inode->i_mapping); btrfs_wait_ordered_range(inode, inode->i_size & ~(root->sectorsize - 1), (u64)-1); + /* make sure we don't overflow that first page */ + if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) { + /* this is really the same as running out of space, where we also return 0 */ + printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n"); + ret = 0; + goto out_update; + } + /* We need a checksum per page. */ crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS); if (!crc) @@ -605,12 +620,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, return -1; } - /* Since the first page has all of our checksums and our generation we - * need to calculate the offset into the page that we can start writing - * our entries. - */ - first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64); - /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) cluster = list_entry(block_group->cluster_list.next, @@ -872,12 +881,14 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, ret = 1; out_free: + kfree(checksums); + kfree(pages); + +out_update: if (ret != 1) { invalidate_inode_pages2_range(inode->i_mapping, 0, index); BTRFS_I(inode)->generation = 0; } - kfree(checksums); - kfree(pages); btrfs_update_inode(trans, root, inode); return ret; } -- cgit v1.2.3 From ca456ae280c0646e1e571c3b9a3834c55e90adfe Mon Sep 17 00:00:00 2001 From: liubo Date: Wed, 1 Jun 2011 09:42:49 +0000 Subject: Btrfs: don't save the inode cache in non-FS roots This adds extra checks to make sure the inode map we are caching really belongs to a FS root instead of a special relocation tree. It prevents crashes during balancing operations. Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/inode-map.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 3262cd17a12f..04f7199facb4 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -388,6 +388,12 @@ int btrfs_save_ino_cache(struct btrfs_root *root, int prealloc; bool retry = false; + /* only fs tree and subvol/snap needs ino cache */ + if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID && + (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID || + root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID)) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; -- cgit v1.2.3 From 5f3f302a6f4cb74906c05fad1d03fc5e95c7e5af Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Mon, 30 May 2011 08:36:16 +0000 Subject: btrfs: false BUG_ON when degraded In degraded mode the struct btrfs_device of missing devs don't have device->name set. A kstrdup of NULL correctly returns NULL. Don't BUG in this case. Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c48214ef5c09..da541dfca2e3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -504,7 +504,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) BUG_ON(!new_device); memcpy(new_device, device, sizeof(*new_device)); new_device->name = kstrdup(device->name, GFP_NOFS); - BUG_ON(!new_device->name); + BUG_ON(device->name && !new_device->name); new_device->bdev = NULL; new_device->writeable = 0; new_device->in_fs_metadata = 0; -- cgit v1.2.3 From d132a538d258f8f52fd0cd8b5017755f4e915386 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 31 May 2011 19:33:33 +0000 Subject: Btrfs: don't save the inode cache if we are deleting this root With xfstest 254 I can panic the box every time with the inode number caching stuff on. This is because we clean the inodes out when we delete the subvolume, but then we write out the inode cache which adds an inode to the subvolume inode tree, and then when it gets evicted again the root gets added back on the dead roots list and is deleted again, so we have a double free. To stop this from happening just return 0 if refs is 0 (and we're not the tree root since tree root always has refs of 0). With this fix 254 no longer panics. Thanks, Signed-off-by: Josef Bacik Tested-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode-map.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 04f7199facb4..2d0d50067a7b 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -394,6 +394,11 @@ int btrfs_save_ino_cache(struct btrfs_root *root, root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID)) return 0; + /* Don't save inode cache if we are deleting this root */ + if (btrfs_root_refs(&root->root_item) == 0 && + root != root->fs_info->tree_root) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; -- cgit v1.2.3 From a4689d2bd3b00dcf5c4320f06e0ab88810fbff9c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 31 May 2011 17:08:14 +0000 Subject: btrfs: use btrfs_ino to access inode number commit 4cb5300bc ("Btrfs: add mount -o auto_defrag") accesses inode number directly while it should use the helper with the new inode number allocator. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/file.c | 2 +- fs/btrfs/ioctl.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e3a1b0c2394c..982b5ea9762f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -144,7 +144,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (!defrag) return -ENOMEM; - defrag->ino = inode->i_ino; + defrag->ino = btrfs_ino(inode); defrag->transid = transid; defrag->root = root->root_key.objectid; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 74c80595d707..ac37040e426a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -706,16 +706,17 @@ static int find_new_extents(struct btrfs_root *root, struct btrfs_file_extent_item *extent; int type; int ret; + u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - min_key.objectid = inode->i_ino; + min_key.objectid = ino; min_key.type = BTRFS_EXTENT_DATA_KEY; min_key.offset = *off; - max_key.objectid = inode->i_ino; + max_key.objectid = ino; max_key.type = (u8)-1; max_key.offset = (u64)-1; @@ -726,7 +727,7 @@ static int find_new_extents(struct btrfs_root *root, path, 0, newer_than); if (ret != 0) goto none; - if (min_key.objectid != inode->i_ino) + if (min_key.objectid != ino) goto none; if (min_key.type != BTRFS_EXTENT_DATA_KEY) goto none; -- cgit v1.2.3 From e7786c3ae517b2c433edc91714e86be770e9f1ce Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Sat, 28 May 2011 20:58:38 +0000 Subject: btrfs: scrub: add explicit plugging With the removal of the implicit plugging scrub ends up doing more and smaller I/O than necessary. This patch adds explicit plugging per chunk. Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 2d1f8909a8e1..1204eab94028 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -348,9 +348,6 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, int ret; DECLARE_COMPLETION_ONSTACK(complete); - /* we are going to wait on this IO */ - rw |= REQ_SYNC; - bio = bio_alloc(GFP_NOFS, 1); bio->bi_bdev = bdev; bio->bi_sector = sector; @@ -359,6 +356,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, bio->bi_private = &complete; submit_bio(rw, bio); + /* this will also unplug the queue */ wait_for_completion(&complete); ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -743,6 +741,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; + struct blk_plug plug; u64 flags; int ret; int slot; @@ -847,6 +846,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, * the scrub. This might currently (crc32) end up to be about 1MB */ start_stripe = 0; + blk_start_plug(&plug); again: logical = base + offset + start_stripe * increment; for (i = start_stripe; i < nstripes; ++i) { @@ -988,6 +988,7 @@ next: scrub_submit(sdev); out: + blk_finish_plug(&plug); btrfs_free_path(path); return ret < 0 ? ret : 0; } -- cgit v1.2.3 From 4b9465cb9e3859186eefa1ca3b990a5849386320 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 3 Jun 2011 09:36:29 -0400 Subject: Btrfs: add mount -o inode_cache This makes the inode map cache default to off until we fix the overflow problem when the free space crcs don't fit inside a single page. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/free-space-cache.c | 6 ++++++ fs/btrfs/inode-map.c | 20 ++++++++++++++++++++ fs/btrfs/super.c | 8 +++++++- 4 files changed, 34 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8f98c2005715..4958ef5417d6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1340,6 +1340,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) #define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) +#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 1cb72394498c..bffa5c4a633b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2536,6 +2536,9 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) int ret = 0; u64 root_gen = btrfs_root_generation(&root->root_item); + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + /* * If we're unmounting then just return, since this does a search on the * normal root and not the commit root and we could deadlock. @@ -2575,6 +2578,9 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, struct inode *inode; int ret; + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode)) return 0; diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 2d0d50067a7b..cb79b8975c9f 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -38,6 +38,9 @@ static int caching_kthread(void *data) int slot; int ret; + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -141,6 +144,9 @@ static void start_caching(struct btrfs_root *root) int ret; u64 objectid; + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + spin_lock(&root->cache_lock); if (root->cached != BTRFS_CACHE_NO) { spin_unlock(&root->cache_lock); @@ -178,6 +184,9 @@ static void start_caching(struct btrfs_root *root) int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) { + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return btrfs_find_free_objectid(root, objectid); + again: *objectid = btrfs_find_ino_for_alloc(root); @@ -201,6 +210,10 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid) { struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; + + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + again: if (root->cached == BTRFS_CACHE_FINISHED) { __btrfs_add_free_space(ctl, objectid, 1); @@ -250,6 +263,9 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) struct rb_node *n; u64 count; + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return; + while (1) { n = rb_first(rbroot); if (!n) @@ -399,9 +415,13 @@ int btrfs_save_ino_cache(struct btrfs_root *root, root != root->fs_info->tree_root) return 0; + if (!btrfs_test_opt(root, INODE_MAP_CACHE)) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; + again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 28e3cb2607ff..3559d0b3518a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -160,7 +160,8 @@ enum { Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_err, + Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, + Opt_inode_cache, Opt_err, }; static match_table_t tokens = { @@ -192,6 +193,7 @@ static match_table_t tokens = { {Opt_enospc_debug, "enospc_debug"}, {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, + {Opt_inode_cache, "inode_cache"}, {Opt_err, NULL}, }; @@ -360,6 +362,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: enabling disk space caching\n"); btrfs_set_opt(info->mount_opt, SPACE_CACHE); break; + case Opt_inode_cache: + printk(KERN_INFO "btrfs: enabling inode map caching\n"); + btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); + break; case Opt_clear_cache: printk(KERN_INFO "btrfs: force clearing of disk cache\n"); btrfs_set_opt(info->mount_opt, CLEAR_CACHE); -- cgit v1.2.3 From 7841cb2898f66a73062c64d0ef5733dde7279e46 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 31 May 2011 18:07:27 +0200 Subject: btrfs: add helper for fs_info->closing wrap checking of filesystem 'closing' flag and fix a few missing memory barriers. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 +++++++++ fs/btrfs/extent-tree.c | 3 +-- fs/btrfs/file.c | 4 ++-- fs/btrfs/free-space-cache.c | 10 ++++------ fs/btrfs/inode-map.c | 3 +-- fs/btrfs/inode.c | 3 +-- fs/btrfs/scrub.c | 2 +- fs/btrfs/transaction.c | 2 +- 8 files changed, 20 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4958ef5417d6..8490ee063709 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2354,6 +2354,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent); +static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) +{ + /* + * Get synced with close_ctree() + */ + smp_mb(); + return fs_info->closing; +} + /* root-item.c */ int btrfs_find_root_ref(struct btrfs_root *tree_root, struct btrfs_path *path, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c9173a7827b0..5b9b6b6df242 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -366,8 +366,7 @@ again: nritems = btrfs_header_nritems(leaf); while (1) { - smp_mb(); - if (fs_info->closing > 1) { + if (btrfs_fs_closing(fs_info) > 1) { last = (u64)-1; break; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 982b5ea9762f..fa4ef18b66b1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -129,7 +129,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, if (!btrfs_test_opt(root, AUTO_DEFRAG)) return 0; - if (root->fs_info->closing) + if (btrfs_fs_closing(root->fs_info)) return 0; if (BTRFS_I(inode)->in_defrag) @@ -229,7 +229,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) first_ino = defrag->ino + 1; rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); - if (fs_info->closing) + if (btrfs_fs_closing(fs_info)) goto next_free; spin_unlock(&fs_info->defrag_inodes_lock); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bffa5c4a633b..ad144736a5fd 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -98,7 +98,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, return inode; spin_lock(&block_group->lock); - if (!root->fs_info->closing) { + if (!btrfs_fs_closing(root->fs_info)) { block_group->inode = igrab(inode); block_group->iref = 1; } @@ -493,8 +493,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, * If we're unmounting then just return, since this does a search on the * normal root and not the commit root and we could deadlock. */ - smp_mb(); - if (fs_info->closing) + if (btrfs_fs_closing(fs_info)) return 0; /* @@ -2513,7 +2512,7 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root, return inode; spin_lock(&root->cache_lock); - if (!root->fs_info->closing) + if (!btrfs_fs_closing(root->fs_info)) root->cache_inode = igrab(inode); spin_unlock(&root->cache_lock); @@ -2543,8 +2542,7 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) * If we're unmounting then just return, since this does a search on the * normal root and not the commit root and we could deadlock. */ - smp_mb(); - if (fs_info->closing) + if (btrfs_fs_closing(fs_info)) return 0; path = btrfs_alloc_path(); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index cb79b8975c9f..b4087e0fa871 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -62,8 +62,7 @@ again: goto out; while (1) { - smp_mb(); - if (fs_info->closing) + if (btrfs_fs_closing(fs_info)) goto out; leaf = path->nodes[0]; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a83e44bf3206..02ff4a1b968b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4266,8 +4266,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) if (BTRFS_I(inode)->dummy_inode) return 0; - smp_mb(); - if (root->fs_info->closing && is_free_space_inode(root, inode)) + if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode)) nolock = true; if (wbc->sync_mode == WB_SYNC_ALL) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 1204eab94028..df50fd1eca8f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1183,7 +1183,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, int ret; struct btrfs_device *dev; - if (root->fs_info->closing) + if (btrfs_fs_closing(root->fs_info)) return -EINVAL; /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2d5c6d2aa4e4..dd719662340e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -817,7 +817,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) btrfs_btree_balance_dirty(info->tree_root, nr); cond_resched(); - if (root->fs_info->closing || ret != -EAGAIN) + if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) break; } root->defrag_running = 0; -- cgit v1.2.3 From aa0467d8d2a00e75b2bb6a56a4ee6d70c5d1928f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 3 Jun 2011 16:29:08 +0200 Subject: btrfs: fix uninitialized variable warning With Linus' tree, today's linux-next build (powercp ppc64_defconfig) produced this warning: fs/btrfs/delayed-inode.c: In function 'btrfs_delayed_update_inode': fs/btrfs/delayed-inode.c:1598:6: warning: 'ret' may be used uninitialized in this function Introduced by commit 16cdcec736cd ("btrfs: implement delayed inode items operation"). This fixes a bug in btrfs_update_inode(): if the returned value from btrfs_delayed_update_inode is a nonzero garbage, inode stat data are not updated and several call paths may hit a BUG_ON or fail with strange code. Reported-by: Stephen Rothwell Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c61c32cf0f71..6462c29d2d37 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1595,7 +1595,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { struct btrfs_delayed_node *delayed_node; - int ret; + int ret = 0; delayed_node = btrfs_get_or_create_delayed_node(inode); if (IS_ERR(delayed_node)) -- cgit v1.2.3 From 957df4535d06a8e009101239937ca5e50a6218c6 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Mon, 6 Jun 2011 11:33:12 +0400 Subject: possible memory corruption in cifs_parse_mount_options() error path after mountdata check frees uninitialized mountdata_copy Signed-off-by: Vasily Averin Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 84c730701c09..fb31c2c1bb17 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -784,7 +784,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, struct smb_vol *vol) { char *value, *data, *end; - char *mountdata_copy, *options; + char *mountdata_copy = NULL, *options; unsigned int temp_len, i, j; char separator[2]; short int override_uid = -1; -- cgit v1.2.3 From 243e2dd38e1b322b29a1714034cc60b84d3d5e07 Mon Sep 17 00:00:00 2001 From: Darren Salt Date: Mon, 6 Jun 2011 16:58:16 +0000 Subject: CIFS ACL support needs CONFIG_KEYS, so depend on it Build fails if CONFIG_KEYS is not selected. Signed-off-by: Darren Salt Reviewed-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 66c68ab9e7df..53ed1ad2c112 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -149,7 +149,7 @@ config CIFS_FSCACHE config CIFS_ACL bool "Provide CIFS ACL support (EXPERIMENTAL)" - depends on EXPERIMENTAL && CIFS_XATTR + depends on EXPERIMENTAL && CIFS_XATTR && KEYS help Allows to fetch CIFS/NTFS ACL from the server. The DACL blob is handed over to the application/caller. -- cgit v1.2.3 From b084f598df36b62dfae83c10ed17f0b66b50f442 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 31 May 2011 12:24:58 -0400 Subject: nfsd: fix dependency of nfsd on auth_rpcgss Commit b0b0c0a26e84 "nfsd: add proc file listing kernel's gss_krb5 enctypes" added an nunnecessary dependency of nfsd on the auth_rpcgss module. It's a little ad hoc, but since the only piece of information nfsd needs from rpcsec_gss_krb5 is a single static string, one solution is just to share it with an include file. Cc: stable@kernel.org Reported-by: Michael Guntsche Cc: Kevin Coffman Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 19 ++++++------------- include/linux/sunrpc/gss_krb5_enctypes.h | 4 ++++ net/sunrpc/auth_gss/gss_krb5_mech.c | 3 ++- 3 files changed, 12 insertions(+), 14 deletions(-) create mode 100644 include/linux/sunrpc/gss_krb5_enctypes.h (limited to 'fs') diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1f5eae40f34e..2b1449dd2f49 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "idmap.h" #include "nfsd.h" @@ -189,18 +190,10 @@ static struct file_operations export_features_operations = { .release = single_release, }; -#ifdef CONFIG_SUNRPC_GSS +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) static int supported_enctypes_show(struct seq_file *m, void *v) { - struct gss_api_mech *k5mech; - - k5mech = gss_mech_get_by_name("krb5"); - if (k5mech == NULL) - goto out; - if (k5mech->gm_upcall_enctypes != NULL) - seq_printf(m, k5mech->gm_upcall_enctypes); - gss_mech_put(k5mech); -out: + seq_printf(m, KRB5_SUPPORTED_ENCTYPES); return 0; } @@ -215,7 +208,7 @@ static struct file_operations supported_enctypes_ops = { .llseek = seq_lseek, .release = single_release, }; -#endif /* CONFIG_SUNRPC_GSS */ +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); @@ -1427,9 +1420,9 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, -#ifdef CONFIG_SUNRPC_GSS +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, -#endif /* CONFIG_SUNRPC_GSS */ +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, diff --git a/include/linux/sunrpc/gss_krb5_enctypes.h b/include/linux/sunrpc/gss_krb5_enctypes.h new file mode 100644 index 000000000000..ec6234eee89c --- /dev/null +++ b/include/linux/sunrpc/gss_krb5_enctypes.h @@ -0,0 +1,4 @@ +/* + * Dumb way to share this static piece of information with nfsd + */ +#define KRB5_SUPPORTED_ENCTYPES "18,17,16,23,3,1,2" diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 0a9a2ec2e469..c3b75333b821 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -43,6 +43,7 @@ #include #include #include +#include #ifdef RPC_DEBUG # define RPCDBG_FACILITY RPCDBG_AUTH @@ -750,7 +751,7 @@ static struct gss_api_mech gss_kerberos_mech = { .gm_ops = &gss_kerberos_ops, .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), .gm_pfs = gss_kerberos_pfs, - .gm_upcall_enctypes = "18,17,16,23,3,1,2", + .gm_upcall_enctypes = KRB5_SUPPORTED_ENCTYPES, }; static int __init init_kerberos_module(void) -- cgit v1.2.3 From be1f4084b4824301e640e81d63b6275cd99ee6a1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 6 Jun 2011 11:22:17 -0700 Subject: nfsd: v4 support requires CRYPTO nfsd V4 support uses crypto interfaces, so select CRYPTO to fix build errors in 2.6.39: ERROR: "crypto_destroy_tfm" [fs/nfsd/nfsd.ko] undefined! ERROR: "crypto_alloc_base" [fs/nfsd/nfsd.ko] undefined! Reported-by: Wakko Warner Signed-off-by: Randy Dunlap Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 18b3e8975fe0..fbb2a5ef5817 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -82,6 +82,7 @@ config NFSD_V4 select NFSD_V3 select FS_POSIX_ACL select SUNRPC_GSS + select CRYPTO help This option enables support in your system's NFS server for version 4 of the NFS protocol (RFC 3530). -- cgit v1.2.3 From 7d751f6f8c679f51b73d01a1b5269347a929004c Mon Sep 17 00:00:00 2001 From: Casey Bodley Date: Fri, 3 Jun 2011 12:21:23 -0400 Subject: nfsd: link returns nfserr_delay when breaking lease fix for commit 4795bb37effb7b8fe77e2d2034545d062d3788a8, nfsd: break lease on unlink, link, and rename if the LINK operation breaks a delegation, it returns NFS4ERR_NOENT (which is not a valid error in rfc 5661) instead of NFS4ERR_DELAY. the return value of nfsd_break_lease() in nfsd_link() must be converted from host_err to err Signed-off-by: Casey Bodley Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d5718273bb32..f3fb61b69a1e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1660,8 +1660,10 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (!dold->d_inode) goto out_drop_write; host_err = nfsd_break_lease(dold->d_inode); - if (host_err) + if (host_err) { + err = nfserrno(host_err); goto out_drop_write; + } host_err = vfs_link(dold, dirp, dnew); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); -- cgit v1.2.3 From 9c4843ea576107a3c1fb94f2f758f198e9fe9e54 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 6 Jun 2011 15:40:23 -0400 Subject: cifs: silence printk when establishing first session on socket When signing is enabled, the first session that's established on a socket will cause a printk like this to pop: CIFS VFS: Unexpected SMB signature This is because the key exchange hasn't happened yet, so the signature field is bogus. Don't try to check the signature on the socket until the first session has been established. Also, eliminate the specific check for SMB_COM_NEGOTIATE since this check covers that case too. Cc: stable@kernel.org Cc: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index dfbd9f1f373d..5a0ee7f2af06 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -184,7 +184,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, if (cifs_pdu == NULL || server == NULL) return -EINVAL; - if (cifs_pdu->Command == SMB_COM_NEGOTIATE) + if (!server->session_estab) return 0; if (cifs_pdu->Command == SMB_COM_LOCKING_ANDX) { -- cgit v1.2.3 From 9054760ff585a7fa436599990b63a585ae89ff4d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 5 Jun 2011 19:22:56 +0100 Subject: lmLogOpen() broken failure exit Callers of lmLogOpen() expect it to return -E... on failure exits, which is what it returns, except for the case of blkdev_get_by_dev() failure. It that case lmLogOpen() return the error with the wrong sign... Signed-off-by: Al Viro Acked-by: Dave Kleikamp --- fs/jfs/jfs_logmgr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 278e3fb40b71..583636f745e5 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1123,7 +1123,7 @@ int lmLogOpen(struct super_block *sb) bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, log); if (IS_ERR(bdev)) { - rc = -PTR_ERR(bdev); + rc = PTR_ERR(bdev); goto free; } -- cgit v1.2.3 From e6bc45d65df8599fdbae73be9cec4ceed274db53 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 6 Jun 2011 19:19:40 -0400 Subject: vfs: make unlink() and rmdir() return ENOENT in preference to EROFS If user space attempts to remove a non-existent file or directory, and the file system is mounted read-only, return ENOENT instead of EROFS. Either error code is arguably valid/correct, but ENOENT is a more specific error message. Reported-by: Michael Tokarev Signed-off-by: "Theodore Ts'o" Signed-off-by: Al Viro --- fs/namei.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index e2e4e8d032ee..9802345df5e7 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2624,6 +2624,10 @@ static long do_rmdir(int dfd, const char __user *pathname) error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit2; + if (!dentry->d_inode) { + error = -ENOENT; + goto exit3; + } error = mnt_want_write(nd.path.mnt); if (error) goto exit3; @@ -2709,11 +2713,10 @@ static long do_unlinkat(int dfd, const char __user *pathname) error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ - if (nd.last.name[nd.last.len]) - goto slashes; inode = dentry->d_inode; - if (inode) - ihold(inode); + if (nd.last.name[nd.last.len] || !inode) + goto slashes; + ihold(inode); error = mnt_want_write(nd.path.mnt); if (error) goto exit2; -- cgit v1.2.3 From 70b666c3b4cb2b96098d80e6f515e4bc6d37db5a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 27 May 2011 09:24:26 -0700 Subject: ceph: use ihold when we already have an inode ref We should use ihold whenever we already have a stable inode ref, even when we aren't holding i_lock. This avoids adding new and unnecessary locking dependencies. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 2 +- fs/ceph/caps.c | 10 ++++------ fs/ceph/dir.c | 11 +++++++---- fs/ceph/export.c | 4 ++-- fs/ceph/file.c | 3 ++- fs/ceph/inode.c | 18 ++++++++++-------- fs/ceph/ioctl.c | 6 ++++-- fs/ceph/locks.c | 3 ++- fs/ceph/snap.c | 2 +- fs/ceph/xattr.c | 6 ++++-- 10 files changed, 37 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 33da49dc3cc6..5a3953db8118 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -453,7 +453,7 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) int err; struct inode *inode = page->mapping->host; BUG_ON(!inode); - igrab(inode); + ihold(inode); err = writepage_nounlock(page, wbc); unlock_page(page); iput(inode); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 1f72b00447c4..f605753c8fe9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2940,14 +2940,12 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) while (!list_empty(&mdsc->cap_dirty)) { ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info, i_dirty_item); - inode = igrab(&ci->vfs_inode); + inode = &ci->vfs_inode; + ihold(inode); dout("flush_dirty_caps %p\n", inode); spin_unlock(&mdsc->cap_dirty_lock); - if (inode) { - ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, - NULL); - iput(inode); - } + ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH, NULL); + iput(inode); spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 33729e822bb9..ef8f08c343e8 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -308,7 +308,8 @@ more: req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_dentry = dget(filp->f_dentry); /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; @@ -787,10 +788,12 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; err = ceph_mdsc_do_request(mdsc, dir, req); - if (err) + if (err) { d_drop(dentry); - else if (!req->r_reply_info.head->is_dentry) - d_instantiate(dentry, igrab(old_dentry->d_inode)); + } else if (!req->r_reply_info.head->is_dentry) { + ihold(old_dentry->d_inode); + d_instantiate(dentry, old_dentry->d_inode); + } ceph_mdsc_put_request(req); return err; } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index a610d3d67488..f67b687550de 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -109,7 +109,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, err = ceph_mdsc_do_request(mdsc, NULL, req); inode = req->r_target_inode; if (inode) - igrab(inode); + ihold(inode); ceph_mdsc_put_request(req); if (!inode) return ERR_PTR(-ESTALE); @@ -167,7 +167,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb, err = ceph_mdsc_do_request(mdsc, NULL, req); inode = req->r_target_inode; if (inode) - igrab(inode); + ihold(inode); ceph_mdsc_put_request(req); if (!inode) return ERR_PTR(err ? err : -ESTALE); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 203252d88d9f..8c5ac4e72832 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -191,7 +191,8 @@ int ceph_open(struct inode *inode, struct file *file) err = PTR_ERR(req); goto out; } - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_num_caps = 1; err = ceph_mdsc_do_request(mdsc, parent_inode, req); if (!err) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 70b6a4839c38..d8858e96ab18 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1101,10 +1101,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, goto done; } req->r_dentry = dn; /* may have spliced */ - igrab(in); + ihold(in); } else if (ceph_ino(in) == vino.ino && ceph_snap(in) == vino.snap) { - igrab(in); + ihold(in); } else { dout(" %p links to %p %llx.%llx, not %llx.%llx\n", dn, in, ceph_ino(in), ceph_snap(in), @@ -1144,7 +1144,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, goto done; } req->r_dentry = dn; /* may have spliced */ - igrab(in); + ihold(in); rinfo->head->is_dentry = 1; /* fool notrace handlers */ } @@ -1328,7 +1328,7 @@ void ceph_queue_writeback(struct inode *inode) if (queue_work(ceph_inode_to_client(inode)->wb_wq, &ceph_inode(inode)->i_wb_work)) { dout("ceph_queue_writeback %p\n", inode); - igrab(inode); + ihold(inode); } else { dout("ceph_queue_writeback %p failed\n", inode); } @@ -1353,7 +1353,7 @@ void ceph_queue_invalidate(struct inode *inode) if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, &ceph_inode(inode)->i_pg_inv_work)) { dout("ceph_queue_invalidate %p\n", inode); - igrab(inode); + ihold(inode); } else { dout("ceph_queue_invalidate %p failed\n", inode); } @@ -1477,7 +1477,7 @@ void ceph_queue_vmtruncate(struct inode *inode) if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); - igrab(inode); + ihold(inode); } else { dout("ceph_queue_vmtruncate %p failed, pending=%d\n", inode, ci->i_truncate_pending); @@ -1738,7 +1738,8 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) __mark_inode_dirty(inode, inode_dirty_flags); if (mask) { - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; @@ -1779,7 +1780,8 @@ int ceph_do_getattr(struct inode *inode, int mask) req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_num_caps = 1; req->r_args.getattr.mask = cpu_to_le32(mask); err = ceph_mdsc_do_request(mdsc, NULL, req); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 8888c9ba68db..ef0b5f48e13a 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -73,7 +73,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = @@ -135,7 +136,8 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 476b329867d4..7f0f72c53f7e 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -23,7 +23,8 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); /* mds requires start and length rather than start and end */ if (LLONG_MAX == fl->fl_end) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 24067d68a554..54b14de2e729 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -722,7 +722,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc) ci = list_first_entry(&mdsc->snap_flush_list, struct ceph_inode_info, i_snap_flush_item); inode = &ci->vfs_inode; - igrab(inode); + ihold(inode); spin_unlock(&mdsc->snap_flush_lock); spin_lock(&inode->i_lock); __ceph_flush_snaps(ci, &session, 0); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index f2b628696180..f42d730f1b66 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -665,7 +665,8 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, err = PTR_ERR(req); goto out; } - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; req->r_args.setxattr.flags = cpu_to_le32(flags); @@ -795,7 +796,8 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); - req->r_inode = igrab(inode); + req->r_inode = inode; + ihold(inode); req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; req->r_path2 = kstrdup(name, GFP_NOFS); -- cgit v1.2.3 From c3cd62839aaa2cdb2b99687c9e44f1b300a4aece Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 1 Jun 2011 16:08:44 -0700 Subject: ceph: fix short sync reads from the OSD If we get a short read from the OSD because the object is small, we need to zero the remainder of the buffer. For O_DIRECT reads, the attempted range is not trimmed to i_size by the VFS, so we were actually looping indefinitely. Fix by trimming by i_size, and the unconditionally zeroing the trailing range. Reported-by: Jeff Wu Signed-off-by: Sage Weil --- fs/ceph/file.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 8c5ac4e72832..b654f403139e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -283,7 +283,7 @@ int ceph_release(struct inode *inode, struct file *file) static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof, bool align_to_pages, + int *checkeof, bool o_direct, unsigned long buf_align) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); @@ -308,7 +308,7 @@ static int striped_read(struct inode *inode, io_align = off & ~PAGE_MASK; more: - if (align_to_pages) + if (o_direct) page_align = (pos - io_align + buf_align) & ~PAGE_MASK; else page_align = pos & ~PAGE_MASK; @@ -346,20 +346,22 @@ more: } if (was_short) { - /* was original extent fully inside i_size? */ - if (pos + left <= inode->i_size) { - dout("zero tail\n"); - ceph_zero_page_vector_range(page_off + read, len - read, + /* did we bounce off eof? */ + if (pos + left > inode->i_size) + *checkeof = 1; + + /* zero trailing bytes (inside i_size) */ + if (left > 0 && pos < inode->i_size) { + if (pos + left > inode->i_size) + left = inode->i_size - pos; + + dout("zero tail %d\n", left); + ceph_zero_page_vector_range(page_off + read, left, pages); - read = len; - goto out; + read += left; } - - /* check i_size */ - *checkeof = 1; } -out: if (ret >= 0) ret = read; dout("striped_read returns %d\n", ret); @@ -659,7 +661,7 @@ out: /* hit EOF or hole? */ if (statret == 0 && *ppos < inode->i_size) { - dout("aio_read sync_read hit hole, reading more\n"); + dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); read += ret; base += ret; len -= ret; -- cgit v1.2.3 From 0e98728fa32d338907631349a8cc2afa07c0cb9a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 7 Jun 2011 20:40:35 -0700 Subject: ceph: fix ENOENT logic in striped_read Getting ENOENT is equivalent to reading 0 bytes. Make that correction before setting up the hit_stripe and was_short flags. Fixes the following case: dd if=/dev/zero of=/mnt/fs_depot/dd3 bs=1 seek=1048576 count=0 dd if=/mnt/fs_depot/dd3 of=/root/ddout1 skip=8 bs=500 count=2 iflag=direct Reported-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index b654f403139e..9542f07d0b93 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -318,10 +318,10 @@ more: ci->i_truncate_seq, ci->i_truncate_size, page_pos, pages_left, page_align); - hit_stripe = this_len < left; - was_short = ret >= 0 && ret < this_len; if (ret == -ENOENT) ret = 0; + hit_stripe = this_len < left; + was_short = ret >= 0 && ret < this_len; dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); -- cgit v1.2.3 From 0c1f91f27140cf3b6e38dc4e892adac241c73a20 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 25 May 2011 14:56:12 -0700 Subject: ceph: unwind canceled flock state If we request a lock and then abort (e.g., ^C), we need to send a matching unlock request to the MDS to unwind our lock attempt to avoid indefinitely blocking other clients. Reported-by: Brian Chrisman Signed-off-by: Sage Weil --- fs/ceph/locks.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 7f0f72c53f7e..80576d05d687 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -33,11 +33,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, length = fl->fl_end - fl->fl_start + 1; dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type`: %d", (int)lock_type, + "length: %llu, wait: %d, type: %d", (int)lock_type, (int)operation, (u64)fl->fl_pid, fl->fl_start, length, wait, fl->fl_type); - req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); @@ -71,7 +70,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, } ceph_mdsc_put_request(req); dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type, + "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type, (int)operation, (u64)fl->fl_pid, fl->fl_start, length, wait, fl->fl_type, err); return err; @@ -110,16 +109,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) dout("mds locked, locking locally"); err = posix_lock_file(file, fl, NULL); if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { - /* undo! This should only happen if the kernel detects - * local deadlock. */ + /* undo! This should only happen if + * the kernel detects local + * deadlock. */ ceph_lock_message(CEPH_LOCK_FCNTL, op, file, CEPH_LOCK_UNLOCK, 0, fl); - dout("got %d on posix_lock_file, undid lock", err); + dout("got %d on posix_lock_file, undid lock", + err); } } - } else { - dout("mds returned error code %d", err); + } else if (err == -ERESTARTSYS) { + dout("undoing lock\n"); + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + CEPH_LOCK_UNLOCK, 0, fl); } return err; } @@ -156,8 +159,11 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) file, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on flock_lock_file_wait, undid lock", err); } - } else { - dout("mds error code %d", err); + } else if (err == -ERESTARTSYS) { + dout("undoing lock\n"); + ceph_lock_message(CEPH_LOCK_FLOCK, + CEPH_MDS_OP_SETFILELOCK, + file, CEPH_LOCK_UNLOCK, 0, fl); } return err; } -- cgit v1.2.3 From 83fb086e0ecd879a4676cf12fc7afc1f9ecd1784 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 8 Jun 2011 07:35:24 -0400 Subject: cifs: trivial: add space in fsc error message Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index fb31c2c1bb17..bb659eb73810 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1391,7 +1391,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, "/proc/fs/cifs/LookupCacheEnabled to 0\n"); } else if (strnicmp(data, "fsc", 3) == 0) { #ifndef CONFIG_CIFS_FSCACHE - cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE" + cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE " "kernel config option set"); goto cifs_parse_mount_err; #endif -- cgit v1.2.3 From 86d4a77ba3dc4ace238a0556541a41df2bd71d49 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 25 May 2011 13:03:16 -0400 Subject: Btrfs: cache bitmaps when searching for a cluster If we are looking for a cluster in a particularly sparse or fragmented block group, we will do a lot of looping through the free space tree looking for various things, and if we need to look at bitmaps we will endup doing the whole dance twice. So instead add the bitmap entries to a temporary list so if we have to do the bitmap search we can just look through the list of entries we've found quickly instead of having to loop through the entire tree again. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 54 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ad144736a5fd..930c07f79b3d 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2144,6 +2144,7 @@ again: */ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -2166,6 +2167,8 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * extent entry. */ while (entry->bitmap) { + if (list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); node = rb_next(&entry->offset_index); if (!node) return -ENOSPC; @@ -2185,8 +2188,12 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, return -ENOSPC; entry = rb_entry(node, struct btrfs_free_space, offset_index); - if (entry->bitmap) + if (entry->bitmap) { + if (list_empty(&entry->list)) + list_add_tail(&entry->list, bitmaps); continue; + } + /* * we haven't filled the empty size and the window is * very large. reset and try again @@ -2240,6 +2247,7 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, */ static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; @@ -2250,10 +2258,39 @@ static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, if (ctl->total_bitmaps == 0) return -ENOSPC; + /* + * First check our cached list of bitmaps and see if there is an entry + * here that will work. + */ + list_for_each_entry(entry, bitmaps, list) { + if (entry->bytes < min_bytes) + continue; + ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, + bytes, min_bytes); + if (!ret) + return 0; + } + + /* + * If we do have entries on our list and we are here then we didn't find + * anything, so go ahead and get the next entry after the last entry in + * this list and start the search from there. + */ + if (!list_empty(bitmaps)) { + entry = list_entry(bitmaps->prev, struct btrfs_free_space, + list); + node = rb_next(&entry->offset_index); + if (!node) + return -ENOSPC; + entry = rb_entry(node, struct btrfs_free_space, offset_index); + goto search; + } + entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); if (!entry) return -ENOSPC; +search: node = &entry->offset_index; do { entry = rb_entry(node, struct btrfs_free_space, offset_index); @@ -2284,6 +2321,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct list_head bitmaps; + struct btrfs_free_space *entry, *tmp; u64 min_bytes; int ret; @@ -2322,11 +2361,16 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } - ret = setup_cluster_no_bitmap(block_group, cluster, offset, bytes, - min_bytes); + INIT_LIST_HEAD(&bitmaps); + ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, + bytes, min_bytes); if (ret) - ret = setup_cluster_bitmap(block_group, cluster, offset, - bytes, min_bytes); + ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, + offset, bytes, min_bytes); + + /* Clear our temporary list */ + list_for_each_entry_safe(entry, tmp, &bitmaps, list) + list_del_init(&entry->list); if (!ret) { atomic_inc(&block_group->count); -- cgit v1.2.3 From 3de85bb95cc50d0977cbb7a0c605e894be4c790d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 25 May 2011 13:07:37 -0400 Subject: Btrfs: noinline the cluster searching functions When profiling the find cluster code it's hard to tell where we are spending our time because the bitmap and non-bitmap functions get inlined by the compiler, so make that not happen. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 930c07f79b3d..f56caacfd8ad 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2142,10 +2142,11 @@ again: /* * This searches the block group for just extents to fill the cluster with. */ -static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, - struct list_head *bitmaps, - u64 offset, u64 bytes, u64 min_bytes) +static noinline int +setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *first = NULL; @@ -2245,10 +2246,11 @@ static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, * This specifically looks for bitmaps that may work in the cluster, we assume * that we have already failed to find extents that will work. */ -static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, - struct btrfs_free_cluster *cluster, - struct list_head *bitmaps, - u64 offset, u64 bytes, u64 min_bytes) +static noinline int +setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + struct list_head *bitmaps, u64 offset, u64 bytes, + u64 min_bytes) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; -- cgit v1.2.3 From f2bb8f5cfb3bce595b2de251ed7638047fc4e530 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 25 May 2011 13:10:16 -0400 Subject: Btrfs: don't commit the transaction if we dont have enough pinned bytes I noticed when running an enospc test that we would get stuck committing the transaction in check_data_space even though we truly didn't have enough space. So check to see if bytes_pinned is bigger than num_bytes, if it's not don't commit the transaction. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5b9b6b6df242..0d0a3fe77bb7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3089,6 +3089,13 @@ alloc: } goto again; } + + /* + * If we have less pinned bytes than we want to allocate then + * don't bother committing the transaction, it won't help us. + */ + if (data_sinfo->bytes_pinned < bytes) + committed = 1; spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ -- cgit v1.2.3 From 2cdc342c204dba69ca3b2ec43d8e6ff41ed920b8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 27 May 2011 14:07:49 -0400 Subject: Btrfs: fix bitmap regression In cleaning up the clustering code I accidently introduced a regression by adding bitmap entries to the cluster rb tree. The problem is if we've maxed out the number of bitmaps we can have for the block group we can only add free space to the bitmaps, but since the bitmap is on the cluster we can't find it and we try to create another one. This would result in a panic because the total bitmaps was bigger than the max bitmaps that were allowed. This patch fixes this by checking to see if we have a cluster, and then looking at the cluster rb tree to see if it has a bitmap entry and if it does and that space belongs to that bitmap, go ahead and add it to that bitmap. I could hit this panic every time with an fs_mark test within a couple of minutes. With this patch I no longer hit the panic and fs_mark goes to completion. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 88 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 69 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f56caacfd8ad..8258ccf85dbd 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1417,6 +1417,23 @@ again: return 0; } +static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + u64 bytes_to_set = 0; + u64 end; + + end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); + + bytes_to_set = min(end - offset, bytes); + + bitmap_set_bits(ctl, info, offset, bytes_to_set); + + return bytes_to_set; + +} + static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { @@ -1453,12 +1470,18 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, return true; } +static struct btrfs_free_space_op free_space_op = { + .recalc_thresholds = recalculate_thresholds, + .use_bitmap = use_bitmap, +}; + static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_free_space *bitmap_info; + struct btrfs_block_group_cache *block_group = NULL; int added = 0; - u64 bytes, offset, end; + u64 bytes, offset, bytes_added; int ret; bytes = info->bytes; @@ -1467,6 +1490,47 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, if (!ctl->op->use_bitmap(ctl, info)) return 0; + if (ctl->op == &free_space_op) + block_group = ctl->private; + + /* + * Since we link bitmaps right into the cluster we need to see if we + * have a cluster here, and if so and it has our bitmap we need to add + * the free space to that bitmap. + */ + if (block_group && !list_empty(&block_group->cluster_list)) { + struct btrfs_free_cluster *cluster; + struct rb_node *node; + struct btrfs_free_space *entry; + + cluster = list_entry(block_group->cluster_list.next, + struct btrfs_free_cluster, + block_group_list); + spin_lock(&cluster->lock); + node = rb_first(&cluster->root); + if (!node) { + spin_unlock(&cluster->lock); + goto again; + } + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (!entry->bitmap) { + spin_unlock(&cluster->lock); + goto again; + } + + if (entry->offset == offset_to_bitmap(ctl, offset)) { + bytes_added = add_bytes_to_bitmap(ctl, entry, + offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + } + spin_unlock(&cluster->lock); + if (!bytes) { + ret = 1; + goto out; + } + } again: bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); @@ -1475,19 +1539,10 @@ again: goto new_bitmap; } - end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); - - if (offset >= bitmap_info->offset && offset + bytes > end) { - bitmap_set_bits(ctl, bitmap_info, offset, end - offset); - bytes -= end - offset; - offset = end; - added = 0; - } else if (offset >= bitmap_info->offset && offset + bytes <= end) { - bitmap_set_bits(ctl, bitmap_info, offset, bytes); - bytes = 0; - } else { - BUG(); - } + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; + offset += bytes_added; + added = 0; if (!bytes) { ret = 1; @@ -1766,11 +1821,6 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, "\n", count); } -static struct btrfs_free_space_op free_space_op = { - .recalc_thresholds = recalculate_thresholds, - .use_bitmap = use_bitmap, -}; - void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; -- cgit v1.2.3 From 723bda2083d44edbd6be0f0b09f902120dc07442 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 27 May 2011 16:11:38 -0400 Subject: Btrfs: fix the allocator loop logic I was testing with empty_cluster = 0 to try and reproduce a problem and kept hitting early enospc panics. This was because our loop logic was a little confused. So this is what I did 1) Make the loop variable the ultimate decider on wether we should loop again isntead of checking to see if we had an uncached bg, empty size or empty cluster. 2) Increment loop before checking to see what we are on to make the loop definitions make more sense. 3) If we are on the chunk alloc loop don't set empty_size/empty_cluster to 0 unless we didn't actually allocate a chunk. If we did allocate a chunk we should be able to easily setup a new cluster so clearing empty_size/empty_cluster makes us less efficient. This kept me from hitting panics while trying to reproduce the other problem. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 48 +++++++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0d0a3fe77bb7..b42efc2ded51 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5218,9 +5218,7 @@ loop: * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try * again */ - if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && - (found_uncached_bg || empty_size || empty_cluster || - allowed_chunk_alloc)) { + if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { found_uncached_bg = false; @@ -5260,32 +5258,36 @@ loop: goto search; } - if (loop < LOOP_CACHING_WAIT) { - loop++; - goto search; - } + loop++; if (loop == LOOP_ALLOC_CHUNK) { - empty_size = 0; - empty_cluster = 0; - } + if (allowed_chunk_alloc) { + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, + CHUNK_ALLOC_LIMITED); + allowed_chunk_alloc = 0; + if (ret == 1) + done_chunk_alloc = 1; + } else if (!done_chunk_alloc && + space_info->force_alloc == + CHUNK_ALLOC_NO_FORCE) { + space_info->force_alloc = CHUNK_ALLOC_LIMITED; + } - if (allowed_chunk_alloc) { - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, - CHUNK_ALLOC_LIMITED); - allowed_chunk_alloc = 0; - done_chunk_alloc = 1; - } else if (!done_chunk_alloc && - space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) { - space_info->force_alloc = CHUNK_ALLOC_LIMITED; + /* + * We didn't allocate a chunk, go ahead and drop the + * empty size and loop again. + */ + if (!done_chunk_alloc) + loop = LOOP_NO_EMPTY_SIZE; } - if (loop < LOOP_NO_EMPTY_SIZE) { - loop++; - goto search; + if (loop == LOOP_NO_EMPTY_SIZE) { + empty_size = 0; + empty_cluster = 0; } - ret = -ENOSPC; + + goto search; } else if (!ins->objectid) { ret = -ENOSPC; } else if (ins->objectid) { -- cgit v1.2.3 From f6a398298d34af66ec3a2d82a44a4dbc5277357d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 6 Jun 2011 10:50:35 -0400 Subject: Btrfs: fix duplicate checking logic When merging my code into the integration test the second check for duplicate entries got screwed up. This patch fixes it by dropping ret2 and just using ret for the return value, and checking if we got an error before adding the bitmap to the local list. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8258ccf85dbd..38f3fd923043 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -250,7 +250,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, pgoff_t index = 0; unsigned long first_page_offset; int num_checksums; - int ret = 0, ret2; + int ret = 0; INIT_LIST_HEAD(&bitmaps); @@ -421,11 +421,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, goto free_cache; } spin_lock(&ctl->tree_lock); - ret2 = link_free_space(ctl, e); + ret = link_free_space(ctl, e); ctl->total_bitmaps++; ctl->op->recalc_thresholds(ctl); spin_unlock(&ctl->tree_lock); - list_add_tail(&e->list, &bitmaps); if (ret) { printk(KERN_ERR "Duplicate entries in " "free space cache, dumping\n"); @@ -434,6 +433,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, page_cache_release(page); goto free_cache; } + list_add_tail(&e->list, &bitmaps); } num_entries--; -- cgit v1.2.3 From 25b8b936ed44814a5ce6fc3b2a21401f33cd56f6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 8 Jun 2011 14:36:54 -0400 Subject: Btrfs: don't map extent buffer if path->skip_locking is set Arne's scrub stuff exposed a problem with mapping the extent buffer in reada_for_search. He searches the commit root with multiple threads and with skip_locking set, so we can race and overwrite node->map_token since node isn't locked. So fix this so that we only map the extent buffer if we don't already have a map_token and skip_locking isn't set. Without this patch scrub would panic almost immediately, with the patch it doesn't panic anymore. Thanks, Reported-by: Arne Jansen Signed-off-by: Josef Bacik --- fs/btrfs/ctree.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d84089349c82..2e667868e0d2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1228,6 +1228,7 @@ static void reada_for_search(struct btrfs_root *root, u32 nr; u32 blocksize; u32 nscan = 0; + bool map = true; if (level != 1) return; @@ -1249,8 +1250,11 @@ static void reada_for_search(struct btrfs_root *root, nritems = btrfs_header_nritems(node); nr = slot; + if (node->map_token || path->skip_locking) + map = false; + while (1) { - if (!node->map_token) { + if (map && !node->map_token) { unsigned long offset = btrfs_node_key_ptr_offset(nr); map_private_extent_buffer(node, offset, sizeof(struct btrfs_key_ptr), @@ -1277,7 +1281,7 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - if (node->map_token) { + if (map && node->map_token) { unmap_extent_buffer(node, node->map_token, KM_USER1); node->map_token = NULL; @@ -1289,7 +1293,7 @@ static void reada_for_search(struct btrfs_root *root, if ((nread > 65536 || nscan > 32)) break; } - if (node->map_token) { + if (map && node->map_token) { unmap_extent_buffer(node, node->map_token, KM_USER1); node->map_token = NULL; } -- cgit v1.2.3 From 3473f3c06a36865ae05993041fff35ee928342a7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 9 Jun 2011 10:15:17 -0400 Subject: Btrfs: unlock the trans lock properly In btrfs_wait_for_commit if we came upon a transaction that had committed we just exited, but that's bad since we are holding the trans_lock. So break instead so that the lock is dropped. Thanks, Reported-by: David Sterba Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dd719662340e..6b2e4786d189 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -349,7 +349,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) list) { if (t->in_commit) { if (t->commit_done) - goto out; + break; cur_trans = t; atomic_inc(&cur_trans->use_count); break; -- cgit v1.2.3 From dac853ae89043f1b7752875300faf614de43c74b Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 9 Jun 2011 20:05:18 +0200 Subject: exec: delay address limit change until point of no return Unconditionally changing the address limit to USER_DS and not restoring it to its old value in the error path is wrong because it prevents us using kernel memory on repeated calls to this function. This, in fact, breaks the fallback of hard coded paths to the init program from being ever successful if the first candidate fails to load. With this patch applied switching to USER_DS is delayed until the point of no return is reached which makes it possible to have a multi-arch rootfs with one arch specific init binary for each of the (hard coded) probed paths. Since the address limit is already set to USER_DS when start_thread() will be invoked, this redundancy can be safely removed. Signed-off-by: Mathias Krause Cc: Al Viro Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/x86/kernel/process_32.c | 1 - arch/x86/kernel/process_64.c | 1 - fs/exec.c | 5 +---- 3 files changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8d128783af47..a3d0dc59067b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -245,7 +245,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { set_user_gs(regs, 0); regs->fs = 0; - set_fs(USER_DS); regs->ds = __USER_DS; regs->es = __USER_DS; regs->ss = __USER_DS; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6c9dd922ac0d..ca6f7ab8df33 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -338,7 +338,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; - set_fs(USER_DS); /* * Free the old FP and other extended state */ diff --git a/fs/exec.c b/fs/exec.c index ea5f748906a8..97e0d52d72fd 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1093,6 +1093,7 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ + set_fs(USER_DS); current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD); flush_thread(); current->personality &= ~bprm->per_clear; @@ -1357,10 +1358,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) if (retval) return retval; - /* kernel module loader fixup */ - /* so we don't try to load run modprobe in kernel space. */ - set_fs(USER_DS); - retval = audit_bprm(bprm); if (retval) return retval; -- cgit v1.2.3 From ad3e34bba4b64ab8e1f5ea1a17768e1a0d9648ea Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 8 Jun 2011 14:45:50 -0400 Subject: Btrfs: don't map extent buffer if path->skip_locking is set Arne's scrub stuff exposed a problem with mapping the extent buffer in reada_for_search. He searches the commit root with multiple threads and with skip_locking set, so we can race and overwrite node->map_token since node isn't locked. So fix this so that we only map the extent buffer if we don't already have a map_token and skip_locking isn't set. Without this patch scrub would panic almost immediately, with the patch it doesn't panic anymore. Thanks, Reported-by: Arne Jansen Signed-off-by: Josef Bacik --- fs/btrfs/ctree.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d84089349c82..2e667868e0d2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1228,6 +1228,7 @@ static void reada_for_search(struct btrfs_root *root, u32 nr; u32 blocksize; u32 nscan = 0; + bool map = true; if (level != 1) return; @@ -1249,8 +1250,11 @@ static void reada_for_search(struct btrfs_root *root, nritems = btrfs_header_nritems(node); nr = slot; + if (node->map_token || path->skip_locking) + map = false; + while (1) { - if (!node->map_token) { + if (map && !node->map_token) { unsigned long offset = btrfs_node_key_ptr_offset(nr); map_private_extent_buffer(node, offset, sizeof(struct btrfs_key_ptr), @@ -1277,7 +1281,7 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - if (node->map_token) { + if (map && node->map_token) { unmap_extent_buffer(node, node->map_token, KM_USER1); node->map_token = NULL; @@ -1289,7 +1293,7 @@ static void reada_for_search(struct btrfs_root *root, if ((nread > 65536 || nscan > 32)) break; } - if (node->map_token) { + if (map && node->map_token) { unmap_extent_buffer(node, node->map_token, KM_USER1); node->map_token = NULL; } -- cgit v1.2.3 From 8c51032f978bac5bec5dae0c5de4f85db97c1cc9 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 3 Jun 2011 10:09:26 +0200 Subject: btrfs: scrub: errors in tree enumeration due to the semantics of btrfs_search_slot the path can point to an invalid slot when ret > 0. This condition went unnoticed, which in turn could have led to an incomplete scrubbing. Signed-off-by: Arne Jansen --- fs/btrfs/scrub.c | 57 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index df50fd1eca8f..d5a4108cedaf 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -804,18 +804,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; - - l = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(l, &key, slot); - if (key.objectid != logical) { - ret = btrfs_previous_item(root, path, 0, - BTRFS_EXTENT_ITEM_KEY); - if (ret < 0) - goto out; - } + goto out_noplug; + /* + * we might miss half an extent here, but that doesn't matter, + * as it's only the prefetch + */ while (1) { l = path->nodes[0]; slot = path->slots[0]; @@ -824,7 +818,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, if (ret == 0) continue; if (ret < 0) - goto out; + goto out_noplug; break; } @@ -906,15 +900,20 @@ again: ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; - - l = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(l, &key, slot); - if (key.objectid != logical) { + if (ret > 0) { ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY); if (ret < 0) goto out; + if (ret > 0) { + /* there's no smaller item, so stick with the + * larger one */ + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, &key, + path, 0, 0); + if (ret < 0) + goto out; + } } while (1) { @@ -989,6 +988,7 @@ next: out: blk_finish_plug(&plug); +out_noplug: btrfs_free_path(path); return ret < 0 ? ret : 0; } @@ -1064,8 +1064,15 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; - ret = 0; + break; + if (ret > 0) { + if (path->slots[0] >= + btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + } l = path->nodes[0]; slot = path->slots[0]; @@ -1075,7 +1082,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) if (found_key.objectid != sdev->dev->devid) break; - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) break; if (found_key.offset >= end) @@ -1104,7 +1111,7 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) cache = btrfs_lookup_block_group(fs_info, chunk_offset); if (!cache) { ret = -ENOENT; - goto out; + break; } ret = scrub_chunk(sdev, chunk_tree, chunk_objectid, chunk_offset, length); @@ -1116,9 +1123,13 @@ int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end) btrfs_release_path(path); } -out: btrfs_free_path(path); - return ret; + + /* + * ret can still be 1 from search_slot or next_leaf, + * that's not an error + */ + return ret < 0 ? ret : 0; } static noinline_for_stack int scrub_supers(struct scrub_dev *sdev) -- cgit v1.2.3 From 632dd772fcbde2ba37c0e8983bd38ef4a1eac906 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 10 Jun 2011 12:07:07 +0200 Subject: btrfs: reinitialize scrub workers Scrub starts the workers each time a scrub starts and stops them after it finished. This patch adds an initialization for the workers before each start, otherwise the workers behave strangely. Signed-off-by: Arne Jansen --- fs/btrfs/disk-io.c | 2 -- fs/btrfs/scrub.c | 6 +++++- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a203d363184d..7bbbfebe47e4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1668,8 +1668,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->scrub_pause_wait); init_rwsem(&fs_info->scrub_super_lock); fs_info->scrub_workers_refcnt = 0; - btrfs_init_workers(&fs_info->scrub_workers, "scrub", - fs_info->thread_pool_size, &fs_info->generic_worker); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index d5a4108cedaf..92cac19388ed 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1166,8 +1166,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_root *root) struct btrfs_fs_info *fs_info = root->fs_info; mutex_lock(&fs_info->scrub_lock); - if (fs_info->scrub_workers_refcnt == 0) + if (fs_info->scrub_workers_refcnt == 0) { + btrfs_init_workers(&fs_info->scrub_workers, "scrub", + fs_info->thread_pool_size, &fs_info->generic_worker); + fs_info->scrub_workers.idle_thresh = 4; btrfs_start_workers(&fs_info->scrub_workers, 1); + } ++fs_info->scrub_workers_refcnt; mutex_unlock(&fs_info->scrub_lock); -- cgit v1.2.3 From 6eef3125886df260ca0e8758d141308152226f6a Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 10 Jun 2011 13:04:58 +0200 Subject: btrfs: remove unneeded includes from scrub.c Signed-off-by: Arne Jansen --- fs/btrfs/scrub.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 92cac19388ed..a8d03d5efb5d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -16,13 +16,7 @@ * Boston, MA 021110-1307, USA. */ -#include -#include -#include #include -#include -#include -#include #include "ctree.h" #include "volumes.h" #include "disk-io.h" -- cgit v1.2.3 From 38e87880666091fe9c572a7a2ed2e771d97ca5aa Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 10 Jun 2011 16:36:57 -0400 Subject: Btrfs: make sure to recheck for bitmaps in clusters Josef recently changed the free extent cache to look in the block group cluster for any bitmaps before trying to add a new bitmap for the same offset. This avoids BUG_ON()s due covering duplicate ranges. But it didn't go quite far enough. A given free range might span between one or more bitmaps or free space entries. The code has looping to cover this, but it doesn't check for clustered bitmaps every time. This shuffles our gotos to check for a bitmap in the cluster for every new bitmap entry we try to add. Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 38f3fd923043..9f985a429877 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1492,7 +1492,7 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, if (ctl->op == &free_space_op) block_group = ctl->private; - +again: /* * Since we link bitmaps right into the cluster we need to see if we * have a cluster here, and if so and it has our bitmap we need to add @@ -1510,13 +1510,13 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, node = rb_first(&cluster->root); if (!node) { spin_unlock(&cluster->lock); - goto again; + goto no_cluster_bitmap; } entry = rb_entry(node, struct btrfs_free_space, offset_index); if (!entry->bitmap) { spin_unlock(&cluster->lock); - goto again; + goto no_cluster_bitmap; } if (entry->offset == offset_to_bitmap(ctl, offset)) { @@ -1531,7 +1531,8 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, goto out; } } -again: + +no_cluster_bitmap: bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!bitmap_info) { -- cgit v1.2.3 From 38e880540f983045da7a00fbc50daad238207fc5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 10 Jun 2011 18:43:13 +0000 Subject: Btrfs: clear current->journal_info on async transaction commit Normally current->jouranl_info is cleared by commit_transaction. For an async snap or subvol creation, though, it runs in a work queue. Clear it in btrfs_commit_transaction_async() to avoid leaking a non-NULL journal_info when we return to userspace. When the actual commit runs in the other thread it won't care that it's current->journal_info is already NULL. Signed-off-by: Sage Weil Tested-by: Jim Schutt Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6b2e4786d189..2b3590b9fe98 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1118,8 +1118,11 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, wait_current_trans_commit_start_and_unblock(root, cur_trans); else wait_current_trans_commit_start(root, cur_trans); - put_transaction(cur_trans); + if (current->journal_info == trans) + current->journal_info = NULL; + + put_transaction(cur_trans); return 0; } -- cgit v1.2.3 From 9eb9104c665aae2401a1723c044669eb10240072 Mon Sep 17 00:00:00 2001 From: richard kennedy Date: Tue, 7 Jun 2011 10:46:32 +0000 Subject: btrfs: remove 64bit alignment padding to allow extent_buffer to fit into one fewer cacheline Reorder extent_buffer to remove 8 bytes of alignment padding on 64 bit builds. This shrinks its size to 128 bytes allowing it to fit into one fewer cache lines and allows more objects per slab in its kmem_cache. slabinfo extent_buffer reports :- before:- Sizes (bytes) Slabs ---------------------------------- Object : 136 Total : 123 SlabObj: 136 Full : 121 SlabSiz: 4096 Partial: 0 Loss : 0 CpuSlab: 2 Align : 8 Objects: 30 after :- Object : 128 Total : 4 SlabObj: 128 Full : 2 SlabSiz: 4096 Partial: 0 Loss : 0 CpuSlab: 2 Align : 8 Objects: 32 Signed-off-by: Richard Kennedy Signed-off-by: Chris Mason --- fs/btrfs/extent_io.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4e8445a4757c..a11a92ee2d30 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -126,9 +126,9 @@ struct extent_buffer { unsigned long map_len; struct page *first_page; unsigned long bflags; - atomic_t refs; struct list_head leak_list; struct rcu_head rcu_head; + atomic_t refs; /* the spinlock is used to protect most operations */ spinlock_t lock; -- cgit v1.2.3 From 027ed2f0044e95a97ed34db2d55a9ca95ba84385 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 8 Jun 2011 08:27:56 +0000 Subject: Btrfs: avoid stack bloat in btrfs_ioctl_fs_info() The size of struct btrfs_ioctl_fs_info_args is as big as 1KB, so don't declare the variable on stack. Signed-off-by: Li Zefan Reviewed-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ac37040e426a..b793d112d1f6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2054,29 +2054,34 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) { - struct btrfs_ioctl_fs_info_args fi_args; + struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; struct btrfs_device *next; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - fi_args.num_devices = fs_devices->num_devices; - fi_args.max_id = 0; - memcpy(&fi_args.fsid, root->fs_info->fsid, sizeof(fi_args.fsid)); + fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); + if (!fi_args) + return -ENOMEM; + + fi_args->num_devices = fs_devices->num_devices; + memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (device->devid > fi_args.max_id) - fi_args.max_id = device->devid; + if (device->devid > fi_args->max_id) + fi_args->max_id = device->devid; } mutex_unlock(&fs_devices->device_list_mutex); - if (copy_to_user(arg, &fi_args, sizeof(fi_args))) - return -EFAULT; + if (copy_to_user(arg, fi_args, sizeof(*fi_args))) + ret = -EFAULT; - return 0; + kfree(fi_args); + return ret; } static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) -- cgit v1.2.3 From 5be76758f35ec6578e5b9b150aa513ac26bd9c54 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 9 Jun 2011 10:02:51 +0000 Subject: btrfs: fix unlocked access of delalloc_inodes list_splice_init will make delalloc_inodes empty, but without a spinlock around, this may produce corrupted list head, accessed in many placess, The race window is very tight and nobody seems to have hit it so far. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a203d363184d..33b744a5ac03 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2911,9 +2911,8 @@ static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root) INIT_LIST_HEAD(&splice); - list_splice_init(&root->fs_info->delalloc_inodes, &splice); - spin_lock(&root->fs_info->delalloc_lock); + list_splice_init(&root->fs_info->delalloc_inodes, &splice); while (!list_empty(&splice)) { btrfs_inode = list_entry(splice.next, struct btrfs_inode, -- cgit v1.2.3 From 08d2f347e877e489ca098c87a6fd2e872fef9767 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Wed, 4 May 2011 16:18:50 +0200 Subject: Btrfs: fix extent state leak on failed nodatasum reads When encountering an EIO while reading from a nodatasum extent, we insert an error record into the inode's failure tree. btrfs_readpage_end_io_hook returns early for nodatasum inodes. We'd better clear the failure tree in that case, otherwise the kernel complains about BUG extent_state: Objects remaining on kmem_cache_close() on rmmod. Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 02ff4a1b968b..113913ae36e0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1986,7 +1986,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, } if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) - return 0; + goto good; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { -- cgit v1.2.3 From 22b63a2971c5657dfc1bf4514f9410fc90c8b2c2 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 9 Feb 2011 16:05:31 +0200 Subject: Btrfs - use %pU to print fsid Get rid of FIXME comment. Uuids from dmesg are now the same as uuids given by btrfs-progs. Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index da541dfca2e3..1efa56e18f9b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -689,12 +689,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, transid = btrfs_super_generation(disk_super); if (disk_super->label[0]) printk(KERN_INFO "device label %s ", disk_super->label); - else { - /* FIXME, make a readl uuid parser */ - printk(KERN_INFO "device fsid %llx-%llx ", - *(unsigned long long *)disk_super->fsid, - *(unsigned long long *)(disk_super->fsid + 8)); - } + else + printk(KERN_INFO "device fsid %pU ", disk_super->fsid); printk(KERN_CONT "devid %llu transid %llu %s\n", (unsigned long long)devid, (unsigned long long)transid, path); ret = device_list_add(path, disk_super, devid, fs_devices_ret); -- cgit v1.2.3 From fe744fdb74f2417d8571faefa45f72b0ead25f89 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 25 May 2011 23:00:27 +0900 Subject: nilfs2: fix incorrect block address termination in node concatenation nilfs_btree_delete function wrongly terminates virtual block address of the btree node held by its parent at index 0. When concatenating the index-0 node with its right sibling node, nilfs_btree_delete terminates the block address of index-0 node instead of the right sibling node which should be deleted. This bug not only wears disk space in the long run, but also causes file system corruption. This will fix it. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/btree.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 7eafe468a29c..fd4f05b91942 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1356,20 +1356,19 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, struct buffer_head *bh; struct nilfs_btree_node *node, *parent, *sib; __u64 sibptr; - int pindex, level, ncmin, ncmax, ncblk, ret; + int pindex, dindex, level, ncmin, ncmax, ncblk, ret; ret = 0; stats->bs_nblocks = 0; ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); ncblk = nilfs_btree_nchildren_per_block(btree); - for (level = NILFS_BTREE_LEVEL_NODE_MIN; + for (level = NILFS_BTREE_LEVEL_NODE_MIN, dindex = path[level].bp_index; level < nilfs_btree_height(btree) - 1; level++) { node = nilfs_btree_get_nonroot_node(path, level); path[level].bp_oldreq.bpr_ptr = - nilfs_btree_node_get_ptr(node, path[level].bp_index, - ncblk); + nilfs_btree_node_get_ptr(node, dindex, ncblk); ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); if (ret < 0) @@ -1383,6 +1382,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); pindex = path[level + 1].bp_index; + dindex = pindex; if (pindex > 0) { /* left sibling */ @@ -1421,6 +1421,14 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_concat_right; stats->bs_nblocks++; + /* + * When merging right sibling node + * into the current node, pointer to + * the right sibling node must be + * terminated instead. The adjustment + * below is required for that. + */ + dindex = pindex + 1; /* continue; */ } } else { @@ -1443,7 +1451,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, node = nilfs_btree_get_root(btree); path[level].bp_oldreq.bpr_ptr = - nilfs_btree_node_get_ptr(node, path[level].bp_index, + nilfs_btree_node_get_ptr(node, dindex, NILFS_BTREE_ROOT_NCHILDREN_MAX); ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); -- cgit v1.2.3 From d40990537c9ea85dfe75dbe0ffba5e1002dfdf3f Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 25 May 2011 23:00:27 +0900 Subject: nilfs2: fix missing block address termination in btree node shrinking nilfs_btree_delete function does not terminate part of virtual block addresses when shrinking the last remaining child node into the root node. The missing address termination causes that dead btree node blocks persist and chip away free disk space. This fixes the leak bug on the btree node deletion. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/btree.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index fd4f05b91942..b2e3ff347620 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1346,6 +1346,11 @@ static void nilfs_btree_shrink(struct nilfs_bmap *btree, path[level].bp_bh = NULL; } +static void nilfs_btree_nop(struct nilfs_bmap *btree, + struct nilfs_btree_path *path, + int level, __u64 *keyp, __u64 *ptrp) +{ +} static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, struct nilfs_btree_path *path, @@ -1439,16 +1444,22 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, NILFS_BTREE_ROOT_NCHILDREN_MAX) { path[level].bp_op = nilfs_btree_shrink; stats->bs_nblocks += 2; + level++; + path[level].bp_op = nilfs_btree_nop; + goto shrink_root_child; } else { path[level].bp_op = nilfs_btree_do_delete; stats->bs_nblocks++; + goto out; } - - goto out; - } } + /* child of the root node is deleted */ + path[level].bp_op = nilfs_btree_do_delete; + stats->bs_nblocks++; + +shrink_root_child: node = nilfs_btree_get_root(btree); path[level].bp_oldreq.bpr_ptr = nilfs_btree_node_get_ptr(node, dindex, @@ -1458,10 +1469,6 @@ static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, if (ret < 0) goto err_out_child_node; - /* child of the root node is deleted */ - path[level].bp_op = nilfs_btree_do_delete; - stats->bs_nblocks++; - /* success */ out: *levelp = level; -- cgit v1.2.3 From 071d73cfe5c38cf62338b952bd350ff3de541b75 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 10 Jun 2011 00:33:06 +0900 Subject: nilfs2: fix problem in setting checkpoint interval Checkpoint generation interval of nilfs goes wrong after user has changed the interval parameter with nilfs-tune tool. segctord starting. Construction interval = 5 seconds, CP frequency < 30 seconds segctord starting. Construction interval = 0 seconds, CP frequency < 30 seconds This turned out to be caused by a trivial bug in initialization code of log writer. This will fix it. Reported-by: Andrea Gelmini Signed-off-by: Ryusuke Konishi --- fs/nilfs2/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 141646e88fb5..bb24ab6c282f 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2573,7 +2573,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; if (nilfs->ns_interval) - sci->sc_interval = nilfs->ns_interval; + sci->sc_interval = HZ * nilfs->ns_interval; if (nilfs->ns_watermark) sci->sc_watermark = nilfs->ns_watermark; return sci; -- cgit v1.2.3 From 30b4caf5d73af5c99cf1b2b46496d8bc35330992 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 8 Jun 2011 03:56:44 +0000 Subject: Btrfs: use join_transaction in btrfs_evict_inode() The WARN_ON() in start_transaction() was triggered while balancing. The cause is btrfs_relocate_chunk() started a transaction and then called iput() on the inode that stores free space cache, and iput() called btrfs_start_transaction() again. Reported-by: Tsutomu Itoh Signed-off-by: Li Zefan Reviewed-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 113913ae36e0..c15636b17874 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3646,7 +3646,7 @@ void btrfs_evict_inode(struct inode *inode) btrfs_i_size_write(inode, 0); while (1) { - trans = btrfs_start_transaction(root, 0); + trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = root->orphan_block_rsv; -- cgit v1.2.3 From ff78fca2a03c08436535d3f7152a30752d8131d1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jun 2011 09:42:17 -0400 Subject: fix leak in proc_set_super() set_anon_super() can fail... Signed-off-by: Al Viro --- fs/proc/root.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/root.c b/fs/proc/root.c index a9000e9cfee5..d6c3b416529b 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -28,11 +28,12 @@ static int proc_test_super(struct super_block *sb, void *data) static int proc_set_super(struct super_block *sb, void *data) { - struct pid_namespace *ns; - - ns = (struct pid_namespace *)data; - sb->s_fs_info = get_pid_ns(ns); - return set_anon_super(sb, NULL); + int err = set_anon_super(sb, NULL); + if (!err) { + struct pid_namespace *ns = (struct pid_namespace *)data; + sb->s_fs_info = get_pid_ns(ns); + } + return err; } static struct dentry *proc_mount(struct file_system_type *fs_type, -- cgit v1.2.3 From b1c27ab3f93daede979f804afc38b189c2f17c60 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jun 2011 10:07:03 -0400 Subject: ubifs: split allocation of ubifs_info into a separate function preparation to ubifs sget() race fixes Signed-off-by: Al Viro --- fs/ubifs/super.c | 87 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b5aeb5a8ebed..ddc3b02e8cf0 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1971,6 +1971,53 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode) return ERR_PTR(-EINVAL); } +static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) +{ + struct ubifs_info *c; + + c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); + if (c) { + spin_lock_init(&c->cnt_lock); + spin_lock_init(&c->cs_lock); + spin_lock_init(&c->buds_lock); + spin_lock_init(&c->space_lock); + spin_lock_init(&c->orphan_lock); + init_rwsem(&c->commit_sem); + mutex_init(&c->lp_mutex); + mutex_init(&c->tnc_mutex); + mutex_init(&c->log_mutex); + mutex_init(&c->mst_mutex); + mutex_init(&c->umount_mutex); + mutex_init(&c->bu_mutex); + mutex_init(&c->write_reserve_mutex); + init_waitqueue_head(&c->cmt_wq); + c->buds = RB_ROOT; + c->old_idx = RB_ROOT; + c->size_tree = RB_ROOT; + c->orph_tree = RB_ROOT; + INIT_LIST_HEAD(&c->infos_list); + INIT_LIST_HEAD(&c->idx_gc); + INIT_LIST_HEAD(&c->replay_list); + INIT_LIST_HEAD(&c->replay_buds); + INIT_LIST_HEAD(&c->uncat_list); + INIT_LIST_HEAD(&c->empty_list); + INIT_LIST_HEAD(&c->freeable_list); + INIT_LIST_HEAD(&c->frdi_idx_list); + INIT_LIST_HEAD(&c->unclean_leb_list); + INIT_LIST_HEAD(&c->old_buds); + INIT_LIST_HEAD(&c->orph_list); + INIT_LIST_HEAD(&c->orph_new); + c->no_chk_data_crc = 1; + + c->highest_inum = UBIFS_FIRST_INO; + c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; + + ubi_get_volume_info(ubi, &c->vi); + ubi_get_device_info(c->vi.ubi_num, &c->di); + } + return c; +} + static int ubifs_fill_super(struct super_block *sb, void *data, int silent) { struct ubi_volume_desc *ubi = sb->s_fs_info; @@ -1978,49 +2025,11 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) struct inode *root; int err; - c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); + c = alloc_ubifs_info(ubi); if (!c) return -ENOMEM; - spin_lock_init(&c->cnt_lock); - spin_lock_init(&c->cs_lock); - spin_lock_init(&c->buds_lock); - spin_lock_init(&c->space_lock); - spin_lock_init(&c->orphan_lock); - init_rwsem(&c->commit_sem); - mutex_init(&c->lp_mutex); - mutex_init(&c->tnc_mutex); - mutex_init(&c->log_mutex); - mutex_init(&c->mst_mutex); - mutex_init(&c->umount_mutex); - mutex_init(&c->bu_mutex); - mutex_init(&c->write_reserve_mutex); - init_waitqueue_head(&c->cmt_wq); - c->buds = RB_ROOT; - c->old_idx = RB_ROOT; - c->size_tree = RB_ROOT; - c->orph_tree = RB_ROOT; - INIT_LIST_HEAD(&c->infos_list); - INIT_LIST_HEAD(&c->idx_gc); - INIT_LIST_HEAD(&c->replay_list); - INIT_LIST_HEAD(&c->replay_buds); - INIT_LIST_HEAD(&c->uncat_list); - INIT_LIST_HEAD(&c->empty_list); - INIT_LIST_HEAD(&c->freeable_list); - INIT_LIST_HEAD(&c->frdi_idx_list); - INIT_LIST_HEAD(&c->unclean_leb_list); - INIT_LIST_HEAD(&c->old_buds); - INIT_LIST_HEAD(&c->orph_list); - INIT_LIST_HEAD(&c->orph_new); - c->no_chk_data_crc = 1; - c->vfs_sb = sb; - c->highest_inum = UBIFS_FIRST_INO; - c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; - - ubi_get_volume_info(ubi, &c->vi); - ubi_get_device_info(c->vi.ubi_num, &c->di); - /* Re-open the UBI device in read-write mode */ c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); if (IS_ERR(c->ubi)) { -- cgit v1.2.3 From d251ed271d528afb407cc2ede30923e34cb209a5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jun 2011 10:24:33 -0400 Subject: ubifs: fix sget races * allocate ubifs_info in ->mount(), fill it enough for sb_test() and set ->s_fs_info to it in set() callback passed to sget(). * do *not* free it in ->put_super(); do that in ->kill_sb() after we'd done kill_anon_super(). * don't free it in ubifs_fill_super() either - deactivate_locked_super() done by caller when ubifs_fill_super() returns an error will take care of that sucker. * get rid of kludge with passing ubi to ubifs_fill_super() in ->s_fs_info; we only need it in alloc_ubifs_info(), so ubifs_fill_super() will need only ubifs_info. Which it will find in ->s_fs_info just fine, no need to reassign anything... As the result, sb_test() becomes safe to apply to all superblocks that can be found by sget() (and a kludge with temporary use of ->s_fs_info to store a pointer to very different structure goes away). Signed-off-by: Al Viro --- fs/ubifs/super.c | 54 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ddc3b02e8cf0..8c892c2d5300 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1848,7 +1848,6 @@ static void ubifs_put_super(struct super_block *sb) bdi_destroy(&c->bdi); ubi_close_volume(c->ubi); mutex_unlock(&c->umount_mutex); - kfree(c); } static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) @@ -2020,21 +2019,16 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) static int ubifs_fill_super(struct super_block *sb, void *data, int silent) { - struct ubi_volume_desc *ubi = sb->s_fs_info; - struct ubifs_info *c; + struct ubifs_info *c = sb->s_fs_info; struct inode *root; int err; - c = alloc_ubifs_info(ubi); - if (!c) - return -ENOMEM; - c->vfs_sb = sb; /* Re-open the UBI device in read-write mode */ c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); if (IS_ERR(c->ubi)) { err = PTR_ERR(c->ubi); - goto out_free; + goto out; } /* @@ -2100,24 +2094,29 @@ out_bdi: bdi_destroy(&c->bdi); out_close: ubi_close_volume(c->ubi); -out_free: - kfree(c); +out: return err; } static int sb_test(struct super_block *sb, void *data) { - dev_t *dev = data; + struct ubifs_info *c1 = data; struct ubifs_info *c = sb->s_fs_info; - return c->vi.cdev == *dev; + return c->vi.cdev == c1->vi.cdev; +} + +static int sb_set(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); } static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, const char *name, void *data) { struct ubi_volume_desc *ubi; - struct ubi_volume_info vi; + struct ubifs_info *c; struct super_block *sb; int err; @@ -2134,19 +2133,24 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); } - ubi_get_volume_info(ubi, &vi); - dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id); + c = alloc_ubifs_info(ubi); + if (!c) { + err = -ENOMEM; + goto out_close; + } + + dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); - sb = sget(fs_type, &sb_test, &set_anon_super, &vi.cdev); + sb = sget(fs_type, sb_test, sb_set, c); if (IS_ERR(sb)) { err = PTR_ERR(sb); - goto out_close; + kfree(c); } if (sb->s_root) { struct ubifs_info *c1 = sb->s_fs_info; - + kfree(c); /* A new mount point for already mounted UBIFS */ dbg_gen("this ubi volume is already mounted"); if (!!(flags & MS_RDONLY) != c1->ro_mount) { @@ -2155,11 +2159,6 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, } } else { sb->s_flags = flags; - /* - * Pass 'ubi' to 'fill_super()' in sb->s_fs_info where it is - * replaced by 'c'. - */ - sb->s_fs_info = ubi; err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (err) goto out_deact; @@ -2179,11 +2178,18 @@ out_close: return ERR_PTR(err); } +static void kill_ubifs_super(struct super_block *s) +{ + struct ubifs_info *c = s->s_fs_info; + kill_anon_super(s); + kfree(c); +} + static struct file_system_type ubifs_fs_type = { .name = "ubifs", .owner = THIS_MODULE, .mount = ubifs_mount, - .kill_sb = kill_anon_super, + .kill_sb = kill_ubifs_super, }; /* -- cgit v1.2.3 From dde194a64bb5c3fd05d965775dc92e8a4920a53a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Jun 2011 16:01:21 -0400 Subject: afs: fix sget() races, close leak on umount * set ->s_fs_info in set() callback passed to sget() * allocate the thing and set it up enough for afs_test_super() before making it visible * have it freed in ->kill_sb() (current tree simply leaks it) * have ->put_super() leave ->s_fs_info->volume alone; it's too early for dropping it; do that from ->kill_sb() after having called kill_anon_super(). Signed-off-by: Al Viro --- fs/afs/super.c | 73 +++++++++++++++++++++++++--------------------------------- 1 file changed, 32 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/afs/super.c b/fs/afs/super.c index fb240e8766d6..b7d48d7eaa1e 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -31,8 +31,8 @@ static void afs_i_init_once(void *foo); static struct dentry *afs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); +static void afs_kill_super(struct super_block *sb); static struct inode *afs_alloc_inode(struct super_block *sb); -static void afs_put_super(struct super_block *sb); static void afs_destroy_inode(struct inode *inode); static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); @@ -40,7 +40,7 @@ struct file_system_type afs_fs_type = { .owner = THIS_MODULE, .name = "afs", .mount = afs_mount, - .kill_sb = kill_anon_super, + .kill_sb = afs_kill_super, .fs_flags = 0, }; @@ -50,7 +50,6 @@ static const struct super_operations afs_super_ops = { .drop_inode = afs_drop_inode, .destroy_inode = afs_destroy_inode, .evict_inode = afs_evict_inode, - .put_super = afs_put_super, .show_options = generic_show_options, }; @@ -282,19 +281,25 @@ static int afs_parse_device_name(struct afs_mount_params *params, */ static int afs_test_super(struct super_block *sb, void *data) { - struct afs_mount_params *params = data; + struct afs_super_info *as1 = data; struct afs_super_info *as = sb->s_fs_info; - return as->volume == params->volume; + return as->volume == as1->volume; +} + +static int afs_set_super(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); } /* * fill in the superblock */ -static int afs_fill_super(struct super_block *sb, void *data) +static int afs_fill_super(struct super_block *sb, + struct afs_mount_params *params) { - struct afs_mount_params *params = data; - struct afs_super_info *as = NULL; + struct afs_super_info *as = sb->s_fs_info; struct afs_fid fid; struct dentry *root = NULL; struct inode *inode = NULL; @@ -302,22 +307,11 @@ static int afs_fill_super(struct super_block *sb, void *data) _enter(""); - /* allocate a superblock info record */ - as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); - if (!as) { - _leave(" = -ENOMEM"); - return -ENOMEM; - } - - afs_get_volume(params->volume); - as->volume = params->volume; - /* fill in the superblock */ sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; - sb->s_fs_info = as; sb->s_bdi = &as->volume->bdi; /* allocate the root inode and dentry */ @@ -326,7 +320,7 @@ static int afs_fill_super(struct super_block *sb, void *data) fid.unique = 1; inode = afs_iget(sb, params->key, &fid, NULL, NULL); if (IS_ERR(inode)) - goto error_inode; + return PTR_ERR(inode); if (params->autocell) set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); @@ -342,16 +336,8 @@ static int afs_fill_super(struct super_block *sb, void *data) _leave(" = 0"); return 0; -error_inode: - ret = PTR_ERR(inode); - inode = NULL; error: iput(inode); - afs_put_volume(as->volume); - kfree(as); - - sb->s_fs_info = NULL; - _leave(" = %d", ret); return ret; } @@ -367,6 +353,7 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, struct afs_volume *vol; struct key *key; char *new_opts = kstrdup(options, GFP_KERNEL); + struct afs_super_info *as; int ret; _enter(",,%s,%p", dev_name, options); @@ -399,12 +386,22 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, ret = PTR_ERR(vol); goto error; } - params.volume = vol; + + /* allocate a superblock info record */ + as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); + if (!as) { + ret = -ENOMEM; + afs_put_volume(vol); + goto error; + } + as->volume = vol; /* allocate a deviceless superblock */ - sb = sget(fs_type, afs_test_super, set_anon_super, ¶ms); + sb = sget(fs_type, afs_test_super, afs_set_super, as); if (IS_ERR(sb)) { ret = PTR_ERR(sb); + afs_put_volume(vol); + kfree(as); goto error; } @@ -422,16 +419,16 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, } else { _debug("reuse"); ASSERTCMP(sb->s_flags, &, MS_ACTIVE); + afs_put_volume(vol); + kfree(as); } - afs_put_volume(params.volume); afs_put_cell(params.cell); kfree(new_opts); _leave(" = 0 [%p]", sb); return dget(sb->s_root); error: - afs_put_volume(params.volume); afs_put_cell(params.cell); key_put(params.key); kfree(new_opts); @@ -439,18 +436,12 @@ error: return ERR_PTR(ret); } -/* - * finish the unmounting process on the superblock - */ -static void afs_put_super(struct super_block *sb) +static void afs_kill_super(struct super_block *sb) { struct afs_super_info *as = sb->s_fs_info; - - _enter(""); - + kill_anon_super(sb); afs_put_volume(as->volume); - - _leave(""); + kfree(as); } /* -- cgit v1.2.3 From a685e08987d1edf1995b76511d4c98ea0e905377 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 8 Jun 2011 21:13:01 -0400 Subject: Delay struct net freeing while there's a sysfs instance refering to it * new refcount in struct net, controlling actual freeing of the memory * new method in kobj_ns_type_operations (->drop_ns()) * ->current_ns() semantics change - it's supposed to be followed by corresponding ->drop_ns(). For struct net in case of CONFIG_NET_NS it bumps the new refcount; net_drop_ns() decrements it and calls net_free() if the last reference has been dropped. Method renamed to ->grab_current_ns(). * old net_free() callers call net_drop_ns() instead. * sysfs_exit_ns() is gone, along with a large part of callchain leading to it; now that the references stored in ->ns[...] stay valid we do not need to hunt them down and replace them with NULL. That fixes problems in sysfs_lookup() and sysfs_readdir(), along with getting rid of sb->s_instances abuse. Note that struct net *shutdown* logics has not changed - net_cleanup() is called exactly when it used to be called. The only thing postponed by having a sysfs instance refering to that struct net is actual freeing of memory occupied by struct net. Signed-off-by: Al Viro --- fs/sysfs/mount.c | 37 +++++++++++-------------------------- fs/sysfs/sysfs.h | 2 +- include/linux/kobject_ns.h | 10 ++++++---- include/linux/sysfs.h | 7 ------- include/net/net_namespace.h | 10 +++++++++- lib/kobject.c | 26 +++++++++----------------- net/core/net-sysfs.c | 23 +++++++++-------------- net/core/net_namespace.c | 12 ++++++++++-- 8 files changed, 55 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 266895783b47..e34f0d99ea4e 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -95,6 +95,14 @@ static int sysfs_set_super(struct super_block *sb, void *data) return error; } +static void free_sysfs_super_info(struct sysfs_super_info *info) +{ + int type; + for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) + kobj_ns_drop(type, info->ns[type]); + kfree(info); +} + static struct dentry *sysfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -108,11 +116,11 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, return ERR_PTR(-ENOMEM); for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) - info->ns[type] = kobj_ns_current(type); + info->ns[type] = kobj_ns_grab_current(type); sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); if (IS_ERR(sb) || sb->s_fs_info != info) - kfree(info); + free_sysfs_super_info(info); if (IS_ERR(sb)) return ERR_CAST(sb); if (!sb->s_root) { @@ -131,12 +139,11 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, static void sysfs_kill_sb(struct super_block *sb) { struct sysfs_super_info *info = sysfs_info(sb); - /* Remove the superblock from fs_supers/s_instances * so we can't find it, before freeing sysfs_super_info. */ kill_anon_super(sb); - kfree(info); + free_sysfs_super_info(info); } static struct file_system_type sysfs_fs_type = { @@ -145,28 +152,6 @@ static struct file_system_type sysfs_fs_type = { .kill_sb = sysfs_kill_sb, }; -void sysfs_exit_ns(enum kobj_ns_type type, const void *ns) -{ - struct super_block *sb; - - mutex_lock(&sysfs_mutex); - spin_lock(&sb_lock); - list_for_each_entry(sb, &sysfs_fs_type.fs_supers, s_instances) { - struct sysfs_super_info *info = sysfs_info(sb); - /* - * If we see a superblock on the fs_supers/s_instances - * list the unmount has not completed and sb->s_fs_info - * points to a valid struct sysfs_super_info. - */ - /* Ignore superblocks with the wrong ns */ - if (info->ns[type] != ns) - continue; - info->ns[type] = NULL; - } - spin_unlock(&sb_lock); - mutex_unlock(&sysfs_mutex); -} - int __init sysfs_init(void) { int err = -ENOMEM; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3d28af31d863..2ed2404f3113 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -136,7 +136,7 @@ struct sysfs_addrm_cxt { * instance). */ struct sysfs_super_info { - const void *ns[KOBJ_NS_TYPES]; + void *ns[KOBJ_NS_TYPES]; }; #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info)) extern struct sysfs_dirent sysfs_root; diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h index 82cb5bf461fb..f66b065a8b5f 100644 --- a/include/linux/kobject_ns.h +++ b/include/linux/kobject_ns.h @@ -32,15 +32,17 @@ enum kobj_ns_type { /* * Callbacks so sysfs can determine namespaces - * @current_ns: return calling task's namespace + * @grab_current_ns: return a new reference to calling task's namespace * @netlink_ns: return namespace to which a sock belongs (right?) * @initial_ns: return the initial namespace (i.e. init_net_ns) + * @drop_ns: drops a reference to namespace */ struct kobj_ns_type_operations { enum kobj_ns_type type; - const void *(*current_ns)(void); + void *(*grab_current_ns)(void); const void *(*netlink_ns)(struct sock *sk); const void *(*initial_ns)(void); + void (*drop_ns)(void *); }; int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); @@ -48,9 +50,9 @@ int kobj_ns_type_registered(enum kobj_ns_type type); const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent); const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj); -const void *kobj_ns_current(enum kobj_ns_type type); +void *kobj_ns_grab_current(enum kobj_ns_type type); const void *kobj_ns_netlink(enum kobj_ns_type type, struct sock *sk); const void *kobj_ns_initial(enum kobj_ns_type type); -void kobj_ns_exit(enum kobj_ns_type type, const void *ns); +void kobj_ns_drop(enum kobj_ns_type type, void *ns); #endif /* _LINUX_KOBJECT_NS_H */ diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index c3acda60eee0..e2696d76a599 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -177,9 +177,6 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd); void sysfs_put(struct sysfs_dirent *sd); -/* Called to clear a ns tag when it is no longer valid */ -void sysfs_exit_ns(enum kobj_ns_type type, const void *tag); - int __must_check sysfs_init(void); #else /* CONFIG_SYSFS */ @@ -338,10 +335,6 @@ static inline void sysfs_put(struct sysfs_dirent *sd) { } -static inline void sysfs_exit_ns(int type, const void *tag) -{ -} - static inline int __must_check sysfs_init(void) { return 0; diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 2bf9ed9ef26b..aef430d779bd 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -35,8 +35,11 @@ struct netns_ipvs; #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS) struct net { + atomic_t passive; /* To decided when the network + * namespace should be freed. + */ atomic_t count; /* To decided when the network - * namespace should be freed. + * namespace should be shut down. */ #ifdef NETNS_REFCNT_DEBUG atomic_t use_count; /* To track references we @@ -154,6 +157,9 @@ int net_eq(const struct net *net1, const struct net *net2) { return net1 == net2; } + +extern void net_drop_ns(void *); + #else static inline struct net *get_net(struct net *net) @@ -175,6 +181,8 @@ int net_eq(const struct net *net1, const struct net *net2) { return 1; } + +#define net_drop_ns NULL #endif diff --git a/lib/kobject.c b/lib/kobject.c index 82dc34c095c2..640bd98a4c8a 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -948,14 +948,14 @@ const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj) } -const void *kobj_ns_current(enum kobj_ns_type type) +void *kobj_ns_grab_current(enum kobj_ns_type type) { - const void *ns = NULL; + void *ns = NULL; spin_lock(&kobj_ns_type_lock); if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) && kobj_ns_ops_tbl[type]) - ns = kobj_ns_ops_tbl[type]->current_ns(); + ns = kobj_ns_ops_tbl[type]->grab_current_ns(); spin_unlock(&kobj_ns_type_lock); return ns; @@ -987,23 +987,15 @@ const void *kobj_ns_initial(enum kobj_ns_type type) return ns; } -/* - * kobj_ns_exit - invalidate a namespace tag - * - * @type: the namespace type (i.e. KOBJ_NS_TYPE_NET) - * @ns: the actual namespace being invalidated - * - * This is called when a tag is no longer valid. For instance, - * when a network namespace exits, it uses this helper to - * make sure no sb's sysfs_info points to the now-invalidated - * netns. - */ -void kobj_ns_exit(enum kobj_ns_type type, const void *ns) +void kobj_ns_drop(enum kobj_ns_type type, void *ns) { - sysfs_exit_ns(type, ns); + spin_lock(&kobj_ns_type_lock); + if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) && + kobj_ns_ops_tbl[type] && kobj_ns_ops_tbl[type]->drop_ns) + kobj_ns_ops_tbl[type]->drop_ns(ns); + spin_unlock(&kobj_ns_type_lock); } - EXPORT_SYMBOL(kobject_get); EXPORT_SYMBOL(kobject_put); EXPORT_SYMBOL(kobject_del); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 11b98bc2aa8f..33d2a1fba131 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1179,9 +1179,14 @@ static void remove_queue_kobjects(struct net_device *net) #endif } -static const void *net_current_ns(void) +static void *net_grab_current_ns(void) { - return current->nsproxy->net_ns; + struct net *ns = current->nsproxy->net_ns; +#ifdef CONFIG_NET_NS + if (ns) + atomic_inc(&ns->passive); +#endif + return ns; } static const void *net_initial_ns(void) @@ -1196,22 +1201,13 @@ static const void *net_netlink_ns(struct sock *sk) struct kobj_ns_type_operations net_ns_type_operations = { .type = KOBJ_NS_TYPE_NET, - .current_ns = net_current_ns, + .grab_current_ns = net_grab_current_ns, .netlink_ns = net_netlink_ns, .initial_ns = net_initial_ns, + .drop_ns = net_drop_ns, }; EXPORT_SYMBOL_GPL(net_ns_type_operations); -static void net_kobj_ns_exit(struct net *net) -{ - kobj_ns_exit(KOBJ_NS_TYPE_NET, net); -} - -static struct pernet_operations kobj_net_ops = { - .exit = net_kobj_ns_exit, -}; - - #ifdef CONFIG_HOTPLUG static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) { @@ -1339,6 +1335,5 @@ EXPORT_SYMBOL(netdev_class_remove_file); int netdev_kobject_init(void) { kobj_ns_type_register(&net_ns_type_operations); - register_pernet_subsys(&kobj_net_ops); return class_register(&net_class); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 6c6b86d0da15..cdcbc3cb00a9 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -128,6 +128,7 @@ static __net_init int setup_net(struct net *net) LIST_HEAD(net_exit_list); atomic_set(&net->count, 1); + atomic_set(&net->passive, 1); #ifdef NETNS_REFCNT_DEBUG atomic_set(&net->use_count, 0); @@ -210,6 +211,13 @@ static void net_free(struct net *net) kmem_cache_free(net_cachep, net); } +void net_drop_ns(void *p) +{ + struct net *ns = p; + if (ns && atomic_dec_and_test(&ns->passive)) + net_free(ns); +} + struct net *copy_net_ns(unsigned long flags, struct net *old_net) { struct net *net; @@ -230,7 +238,7 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net) } mutex_unlock(&net_mutex); if (rv < 0) { - net_free(net); + net_drop_ns(net); return ERR_PTR(rv); } return net; @@ -286,7 +294,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); - net_free(net); + net_drop_ns(net); } } static DECLARE_WORK(net_cleanup_work, cleanup_net); -- cgit v1.2.3 From ac08aedfa5d3de0dcb3825b598d16c2e57991f54 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 13 Jun 2011 11:28:50 -0400 Subject: Btrfs: check the return value from set_anon_super Al Viro noticed we weren't checking for set_anon_super failures. This adds the required checks. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9f68c6898653..20c111b3fa0d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1312,7 +1312,9 @@ again: spin_lock_init(&root->cache_lock); init_waitqueue_head(&root->cache_wait); - set_anon_super(&root->anon_super, NULL); + ret = set_anon_super(&root->anon_super, NULL); + if (ret) + goto fail; if (btrfs_root_refs(&root->root_item) == 0) { ret = -ENOENT; -- cgit v1.2.3 From f4c44016218a6fce357715b9bbabbbbe1f69853c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 13 Jun 2011 11:30:47 -0400 Subject: Btrfs: drop the delalloc_bytes check in shrink_delalloc Even when delalloc_bytes is zero, we may need to sleep while waiting for delalloc space. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b42efc2ded51..1f61bf5b4960 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3314,10 +3314,6 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (reserved == 0) return 0; - /* nothing to shrink - nothing to reclaim */ - if (root->fs_info->delalloc_bytes == 0) - return 0; - max_reclaim = min(reserved, to_reclaim); while (loops < 1024) { -- cgit v1.2.3 From cd51875d53ae1459a2b09b4338166a218c0635a7 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 9 Jun 2011 12:58:53 +0400 Subject: CIFS: Fix sparse error cifs_sb_master_tlink was declared as inline, but without a definition. Remove the declaration and move the definition up. Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index bb659eb73810..9190018f97e1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2149,7 +2149,10 @@ cifs_put_tlink(struct tcon_link *tlink) } static inline struct tcon_link * -cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb); +cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) +{ + return cifs_sb->master_tlink; +} static int compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) @@ -3484,12 +3487,6 @@ out: return tcon; } -static inline struct tcon_link * -cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) -{ - return cifs_sb->master_tlink; -} - struct cifs_tcon * cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb) { -- cgit v1.2.3 From 7fdbaa1b8daa1009b705985b903e3d2ebccad456 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 10 Jun 2011 16:14:57 -0400 Subject: cifs: don't allow cifs_reconnect to exit with NULL socket pointer It's possible for the following set of events to happen: cifsd calls cifs_reconnect which reconnects the socket. A userspace process then calls cifs_negotiate_protocol to handle the NEGOTIATE and gets a reply. But, while processing the reply, cifsd calls cifs_reconnect again. Eventually the GlobalMid_Lock is dropped and the reply from the earlier NEGOTIATE completes and the tcpStatus is set to CifsGood. cifs_reconnect then goes through and closes the socket and sets the pointer to zero, but because the status is now CifsGood, the new socket is not created and cifs_reconnect exits with the socket pointer set to NULL. Fix this by only setting the tcpStatus to CifsGood if the tcpStatus is CifsNeedNegotiate, and by making sure that generic_ip_connect is always called at least once in cifs_reconnect. Note that this is not a perfect fix for this issue. It's still possible that the NEGOTIATE reply is handled after the socket has been closed and reconnected. In that case, the socket state will look correct but it no NEGOTIATE was performed on it be for the wrong socket. In that situation though the server should just shut down the socket on the next attempted send, rather than causing the oops that occurs today. Cc: # .38.x: fd88ce9: [CIFS] cifs: clarify the meaning of tcpStatus == CifsGood Reported-and-Tested-by: Ben Greear Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9190018f97e1..fa73e2a51cf2 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -152,7 +152,7 @@ cifs_reconnect(struct TCP_Server_Info *server) mid_entry->callback(mid_entry); } - while (server->tcpStatus == CifsNeedReconnect) { + do { try_to_freeze(); /* we should try only the port we connected to before */ @@ -167,7 +167,7 @@ cifs_reconnect(struct TCP_Server_Info *server) server->tcpStatus = CifsNeedNegotiate; spin_unlock(&GlobalMid_Lock); } - } + } while (server->tcpStatus == CifsNeedReconnect); return rc; } @@ -3374,7 +3374,7 @@ int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses) } if (rc == 0) { spin_lock(&GlobalMid_Lock); - if (server->tcpStatus != CifsExiting) + if (server->tcpStatus == CifsNeedNegotiate) server->tcpStatus = CifsGood; else rc = -EHOSTDOWN; -- cgit v1.2.3 From 3e715513643f0207c8f3c22010b54954cd697474 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 13 Jun 2011 11:50:41 -0400 Subject: cifs: show sec= option in /proc/mounts Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 989442dcfb45..e9def996e383 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -352,6 +352,37 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) } } +static void +cifs_show_security(struct seq_file *s, struct TCP_Server_Info *server) +{ + seq_printf(s, ",sec="); + + switch (server->secType) { + case LANMAN: + seq_printf(s, "lanman"); + break; + case NTLMv2: + seq_printf(s, "ntlmv2"); + break; + case NTLM: + seq_printf(s, "ntlm"); + break; + case Kerberos: + seq_printf(s, "krb5"); + break; + case RawNTLMSSP: + seq_printf(s, "ntlmssp"); + break; + default: + /* shouldn't ever happen */ + seq_printf(s, "unknown"); + break; + } + + if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + seq_printf(s, "i"); +} + /* * cifs_show_options() is for displaying mount options in /proc/mounts. * Not all settable options are displayed but most of the important @@ -365,6 +396,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) struct sockaddr *srcaddr; srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; + cifs_show_security(s, tcon->ses->server); + seq_printf(s, ",unc=%s", tcon->treeName); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) -- cgit v1.2.3 From 8d1bca328b7c17af33bcf966d799c556ecbf370f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sat, 11 Jun 2011 21:17:10 -0400 Subject: cifs: correctly handle NULL tcon pointer in CIFSTCon Long ago (in commit 00e485b0), I added some code to handle share-level passwords in CIFSTCon. That code ignored the fact that it's legit to pass in a NULL tcon pointer when connecting to the IPC$ share on the server. This wasn't really a problem until recently as we only called CIFSTCon this way when the server returned -EREMOTE. With the introduction of commit c1508ca2 however, it gets called this way on every mount, causing an oops when share-level security is in effect. Fix this by simply treating a NULL tcon pointer as if user-level security were in effect. I'm not aware of any servers that protect the IPC$ share with a specific password anyway. Also, add a comment to the top of CIFSTCon to ensure that we don't make the same mistake again. Cc: Reported-by: Martijn Uffing Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index fa73e2a51cf2..12cf72dd0c42 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3174,6 +3174,10 @@ out: return rc; } +/* + * Issue a TREE_CONNECT request. Note that for IPC$ shares, that the tcon + * pointer may be NULL. + */ int CIFSTCon(unsigned int xid, struct cifs_ses *ses, const char *tree, struct cifs_tcon *tcon, @@ -3208,7 +3212,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, pSMB->AndXCommand = 0xFF; pSMB->Flags = cpu_to_le16(TCON_EXTENDED_SECINFO); bcc_ptr = &pSMB->Password[0]; - if ((ses->server->sec_mode) & SECMODE_USER) { + if (!tcon || (ses->server->sec_mode & SECMODE_USER)) { pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ *bcc_ptr = 0; /* password is null byte */ bcc_ptr++; /* skip password */ -- cgit v1.2.3 From 1123d93963cbd2546449d4d9f0c568e323cb0ac6 Mon Sep 17 00:00:00 2001 From: Max Asbock Date: Mon, 13 Jun 2011 10:18:32 -0700 Subject: timerfd: Fix wakeup of processes when timer is cancelled on clock change Currently processes waiting with poll on cancelable timerfd timers are not woken up when the timers are canceled. When the system time is set the clock_was_set() function calls timerfd_clock_was_set() to cancel and wake up processes waiting on potential cancelable timerfd timers. However the wake up currently has no effect because in the case of timerfd_read it is dependent on ctx->ticks not being 0. timerfd_poll also requires ctx->ticks being non zero. As a consequence processes waiting on cancelable timers only get woken up when the timers expire. This patch fixes this by incrementing ctx->ticks before calling wake_up. Signed-off-by: Max Asbock Cc: kay.sievers@vrfy.org Cc: virtuoso@slind.org Cc: johnstul Link: http://lkml.kernel.org/r/1307985512.4710.41.camel@w-amax.beaverton.ibm.com Signed-off-by: Thomas Gleixner --- fs/timerfd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/timerfd.c b/fs/timerfd.c index f67acbdda5e8..dffeb3795af1 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -61,7 +61,9 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) /* * Called when the clock was set to cancel the timers in the cancel - * list. + * list. This will wake up processes waiting on these timers. The + * wake-up requires ctx->ticks to be non zero, therefore we increment + * it before calling wake_up_locked(). */ void timerfd_clock_was_set(void) { @@ -76,6 +78,7 @@ void timerfd_clock_was_set(void) spin_lock_irqsave(&ctx->wqh.lock, flags); if (ctx->moffs.tv64 != moffs.tv64) { ctx->moffs.tv64 = KTIME_MAX; + ctx->ticks++; wake_up_locked(&ctx->wqh); } spin_unlock_irqrestore(&ctx->wqh.lock, flags); -- cgit v1.2.3 From 040d15c86747cf44fcf6b8ee19d805d4ef20caf3 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 14 Jun 2011 15:51:18 +0000 Subject: [CIFS] trivial cleanup fscache cFYI and cERROR messages ... for uniformity and cleaner debug logs. Signed-off-by: Suresh Jayaraman Signed-off-by: Steve French --- fs/cifs/cache.c | 6 +++--- fs/cifs/fscache.c | 51 ++++++++++++++++++++++++--------------------------- 2 files changed, 27 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index dd8584d35a14..545509c3313b 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -92,7 +92,7 @@ static uint16_t cifs_server_get_key(const void *cookie_netfs_data, break; default: - cERROR(1, "CIFS: Unknown network family '%d'", sa->sa_family); + cERROR(1, "Unknown network family '%d'", sa->sa_family); key_len = 0; break; } @@ -152,7 +152,7 @@ static uint16_t cifs_super_get_key(const void *cookie_netfs_data, void *buffer, sharename = extract_sharename(tcon->treeName); if (IS_ERR(sharename)) { - cFYI(1, "CIFS: couldn't extract sharename\n"); + cFYI(1, "%s: couldn't extract sharename\n", __func__); sharename = NULL; return 0; } @@ -302,7 +302,7 @@ static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) pagevec_init(&pvec, 0); first = 0; - cFYI(1, "cifs inode 0x%p now uncached", cifsi); + cFYI(1, "%s: cifs inode 0x%p now uncached", __func__, cifsi); for (;;) { nr_pages = pagevec_lookup(&pvec, diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index d368a47ba5eb..816696621ec9 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -28,14 +28,14 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server) server->fscache = fscache_acquire_cookie(cifs_fscache_netfs.primary_index, &cifs_fscache_server_index_def, server); - cFYI(1, "CIFS: get client cookie (0x%p/0x%p)", server, - server->fscache); + cFYI(1, "%s: (0x%p/0x%p)", __func__, server, + server->fscache); } void cifs_fscache_release_client_cookie(struct TCP_Server_Info *server) { - cFYI(1, "CIFS: release client cookie (0x%p/0x%p)", server, - server->fscache); + cFYI(1, "%s: (0x%p/0x%p)", __func__, server, + server->fscache); fscache_relinquish_cookie(server->fscache, 0); server->fscache = NULL; } @@ -47,13 +47,13 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon) tcon->fscache = fscache_acquire_cookie(server->fscache, &cifs_fscache_super_index_def, tcon); - cFYI(1, "CIFS: get superblock cookie (0x%p/0x%p)", - server->fscache, tcon->fscache); + cFYI(1, "%s: (0x%p/0x%p)", __func__, server->fscache, + tcon->fscache); } void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) { - cFYI(1, "CIFS: releasing superblock cookie (0x%p)", tcon->fscache); + cFYI(1, "%s: (0x%p)", __func__, tcon->fscache); fscache_relinquish_cookie(tcon->fscache, 0); tcon->fscache = NULL; } @@ -70,8 +70,8 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { cifsi->fscache = fscache_acquire_cookie(tcon->fscache, &cifs_fscache_inode_object_def, cifsi); - cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache, - cifsi->fscache); + cFYI(1, "%s: got FH cookie (0x%p/0x%p)", __func__, + tcon->fscache, cifsi->fscache); } } @@ -80,8 +80,7 @@ void cifs_fscache_release_inode_cookie(struct inode *inode) struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { - cFYI(1, "CIFS releasing inode cookie (0x%p)", - cifsi->fscache); + cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); fscache_relinquish_cookie(cifsi->fscache, 0); cifsi->fscache = NULL; } @@ -92,8 +91,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode) struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { - cFYI(1, "CIFS disabling inode cookie (0x%p)", - cifsi->fscache); + cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); fscache_relinquish_cookie(cifsi->fscache, 1); cifsi->fscache = NULL; } @@ -121,8 +119,8 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode) cifs_sb_master_tcon(cifs_sb)->fscache, &cifs_fscache_inode_object_def, cifsi); - cFYI(1, "CIFS: new cookie 0x%p oldcookie 0x%p", - cifsi->fscache, old); + cFYI(1, "%s: new cookie 0x%p oldcookie 0x%p", + __func__, cifsi->fscache, old); } } @@ -132,8 +130,8 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp) struct inode *inode = page->mapping->host; struct cifsInodeInfo *cifsi = CIFS_I(inode); - cFYI(1, "CIFS: fscache release page (0x%p/0x%p)", - page, cifsi->fscache); + cFYI(1, "%s: (0x%p/0x%p)", __func__, page, + cifsi->fscache); if (!fscache_maybe_release_page(cifsi->fscache, page, gfp)) return 0; } @@ -144,8 +142,7 @@ int cifs_fscache_release_page(struct page *page, gfp_t gfp) static void cifs_readpage_from_fscache_complete(struct page *page, void *ctx, int error) { - cFYI(1, "CFS: readpage_from_fscache_complete (0x%p/%d)", - page, error); + cFYI(1, "%s: (0x%p/%d)", __func__, page, error); if (!error) SetPageUptodate(page); unlock_page(page); @@ -158,7 +155,7 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) { int ret; - cFYI(1, "CIFS: readpage_from_fscache(fsc:%p, p:%p, i:0x%p", + cFYI(1, "%s: (fsc:%p, p:%p, i:0x%p", __func__, CIFS_I(inode)->fscache, page, inode); ret = fscache_read_or_alloc_page(CIFS_I(inode)->fscache, page, cifs_readpage_from_fscache_complete, @@ -167,11 +164,11 @@ int __cifs_readpage_from_fscache(struct inode *inode, struct page *page) switch (ret) { case 0: /* page found in fscache, read submitted */ - cFYI(1, "CIFS: readpage_from_fscache: submitted"); + cFYI(1, "%s: submitted", __func__); return ret; case -ENOBUFS: /* page won't be cached */ case -ENODATA: /* page not in cache */ - cFYI(1, "CIFS: readpage_from_fscache %d", ret); + cFYI(1, "%s: %d", __func__, ret); return 1; default: @@ -190,7 +187,7 @@ int __cifs_readpages_from_fscache(struct inode *inode, { int ret; - cFYI(1, "CIFS: __cifs_readpages_from_fscache (0x%p/%u/0x%p)", + cFYI(1, "%s: (0x%p/%u/0x%p)", __func__, CIFS_I(inode)->fscache, *nr_pages, inode); ret = fscache_read_or_alloc_pages(CIFS_I(inode)->fscache, mapping, pages, nr_pages, @@ -199,12 +196,12 @@ int __cifs_readpages_from_fscache(struct inode *inode, mapping_gfp_mask(mapping)); switch (ret) { case 0: /* read submitted to the cache for all pages */ - cFYI(1, "CIFS: readpages_from_fscache: submitted"); + cFYI(1, "%s: submitted", __func__); return ret; case -ENOBUFS: /* some pages are not cached and can't be */ case -ENODATA: /* some pages are not cached */ - cFYI(1, "CIFS: readpages_from_fscache: no page"); + cFYI(1, "%s: no page", __func__); return 1; default: @@ -218,7 +215,7 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) { int ret; - cFYI(1, "CIFS: readpage_to_fscache(fsc: %p, p: %p, i: %p", + cFYI(1, "%s: (fsc: %p, p: %p, i: %p)", __func__, CIFS_I(inode)->fscache, page, inode); ret = fscache_write_page(CIFS_I(inode)->fscache, page, GFP_KERNEL); if (ret != 0) @@ -230,7 +227,7 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) struct cifsInodeInfo *cifsi = CIFS_I(inode); struct fscache_cookie *cookie = cifsi->fscache; - cFYI(1, "CIFS: fscache invalidatepage (0x%p/0x%p)", page, cookie); + cFYI(1, "%s: (0x%p/0x%p)", __func__, page, cookie); fscache_wait_on_page_write(cookie, page); fscache_uncache_page(cookie, page); } -- cgit v1.2.3 From c46a131c0c0f4c2457e6b1e430c578a5cb057334 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 5 Jun 2011 11:12:31 +0000 Subject: xfs: fix ->mknod() return value on xfs_get_acl() failure ->mknod() should return negative on errors and PTR_ERR() gives already negative value... Signed-off-by: Al Viro Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_iops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index dd21784525a8..d44d92cd12b1 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -182,7 +182,7 @@ xfs_vn_mknod( if (IS_POSIXACL(dir)) { default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(default_acl)) - return -PTR_ERR(default_acl); + return PTR_ERR(default_acl); if (!default_acl) mode &= ~current_umask(); -- cgit v1.2.3 From 1252b3013b790c77e1c4f077a40542f86df37fb4 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 14 Jun 2011 16:19:54 +0000 Subject: [CIFS] update cifs version to 1.73 Signed-off-by: Steve French --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 64313f778ebf..0900e1658c96 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -129,5 +129,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "1.72" +#define CIFS_VERSION "1.73" #endif /* _CIFSFS_H */ -- cgit v1.2.3 From 71d7aed014457147e8f71a843d5fbf03235e4a85 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 14 Jun 2011 14:24:32 -0400 Subject: Btrfs: fix path leakage on subvol deletion The delayed ref patch accidently removed the btrfs_free_path in btrfs_unlink_subvol, this puts it back and means we don't leak a path. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c15636b17874..5813dec5101c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3076,6 +3076,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, dir); BUG_ON(ret); + btrfs_free_path(path); return 0; } -- cgit v1.2.3 From 8351583e3f6e430ce8f71913909a96ad5cc6a2f6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 14 Jun 2011 15:16:14 -0400 Subject: Btrfs: protect the pending_snapshots list with trans_lock Currently there is nothing protecting the pending_snapshots list on the transaction. We only hold the directory mutex that we are snapshotting and a read lock on the subvol_sem, so we could race with somebody else creating a snapshot in a different directory and end up with list corruption. So protect this list with the trans_lock. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b793d112d1f6..a3c4751e07db 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -482,8 +482,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); BUG_ON(ret); + spin_lock(&root->fs_info->trans_lock); list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); + spin_unlock(&root->fs_info->trans_lock); if (async_transid) { *async_transid = trans->transid; ret = btrfs_commit_transaction_async(trans, -- cgit v1.2.3 From ed0ca14021e5ae3147602128641aa7f742ab227c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 14 Jun 2011 16:22:15 -0400 Subject: Btrfs: set no_trans_join after trying to expand the transaction We can lockup if we try to allow new writers join the transaction and we have flushoncommit set or have a pending snapshot. This is because we set no_trans_join and then loop around and try to wait for ordered extents again. The problem is the ordered endio stuff needs to join the transaction, which it can't do because no_trans_join is set. So instead wait until after this loop to set no_trans_join and then make sure to wait for num_writers == 1 in case anybody got started in between us exiting the loop and setting no_trans_join. This could easily be reproduced by mounting -o flushoncommit and running xfstest 13. It cannot be reproduced with this patch. Thanks, Reported-by: Jim Schutt Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2b3590b9fe98..56695595e036 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1241,12 +1241,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, schedule_timeout(1); finish_wait(&cur_trans->writer_wait, &wait); - spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 1; - spin_unlock(&root->fs_info->trans_lock); } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); + /* + * Ok now we need to make sure to block out any other joins while we + * commit the transaction. We could have started a join before setting + * no_join so make sure to wait for num_writers to == 1 again. + */ + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); -- cgit v1.2.3 From 793925334f32e9026c22baee5c3c340f47d4ef7e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 15 Jun 2011 12:47:04 -0700 Subject: proc: Fix Oops on stat of /proc//ns/net Don't call iput with the inode half setup to be a namespace filedescriptor. Instead rearrange the code so that we don't initialize ei->ns_ops until after I ns_ops->get succeeds, preventing us from invoking ns_ops->put when ns_ops->get failed. Reported-by: Ingo Saitz Signed-off-by: Eric W. Biederman --- fs/proc/namespaces.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 781dec5bd682..be177f702acb 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -38,18 +38,21 @@ static struct dentry *proc_ns_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; struct dentry *error = ERR_PTR(-ENOENT); + void *ns; inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) goto out; + ns = ns_ops->get(task); + if (!ns) + goto out_iput; + ei = PROC_I(inode); inode->i_mode = S_IFREG|S_IRUSR; inode->i_fop = &ns_file_operations; ei->ns_ops = ns_ops; - ei->ns = ns_ops->get(task); - if (!ei->ns) - goto out_iput; + ei->ns = ns; dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); -- cgit v1.2.3 From 7f81c8890c15a10f5220bebae3b6dfae4961962a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Jun 2011 15:08:11 -0700 Subject: fs/exec.c: use BUILD_BUG_ON for VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP Commit a8bef8ff6ea1 ("mm: migration: avoid race between shift_arg_pages() and rmap_walk() during migration by not migrating temporary stacks") introduced a BUG_ON() to ensure that VM_STACK_FLAGS and VM_STACK_INCOMPLETE_SETUP do not overlap. The check is a compile time one, so BUILD_BUG_ON is more appropriate. Signed-off-by: Michal Hocko Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 97e0d52d72fd..b54f74f3cd80 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -277,7 +277,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ - BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; -- cgit v1.2.3 From 13fca640bb8ab611a50e0ba120b186faa2994d6c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 15 Jun 2011 21:53:52 -0700 Subject: Revert "fs/exec.c: use BUILD_BUG_ON for VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP" This reverts commit 7f81c8890c15a10f5220bebae3b6dfae4961962a. It turns out that it's not actually a build-time check on x86-64 UML, which does some seriously crazy stuff with VM_STACK_FLAGS. The VM_STACK_FLAGS define depends on the arch-supplied VM_STACK_DEFAULT_FLAGS value, and on x86-64 UML we have arch/um/sys-x86_64/shared/sysdep/vm-flags.h: #define VM_STACK_DEFAULT_FLAGS \ (test_thread_flag(TIF_IA32) ? vm_stack_flags32 : vm_stack_flags) #define VM_STACK_DEFAULT_FLAGS vm_stack_flags (yes, seriously: two different #define's for that thing, with the first one being inside an "#ifdef TIF_IA32") It's possible that it is UML that should just be fixed in this area, but for now let's just undo the (very small) optimization. Reported-by: Randy Dunlap Acked-by: Andrew Morton Cc: Michal Hocko Cc: Richard Weinberger Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index b54f74f3cd80..97e0d52d72fd 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -277,7 +277,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ - BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); + BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; -- cgit v1.2.3 From 50338b889dc504c69e0cb316ac92d1b9e51f3c8a Mon Sep 17 00:00:00 2001 From: TΓΆrΓΆk Edwin Date: Thu, 16 Jun 2011 00:06:14 +0300 Subject: fix wrong iput on d_inode introduced by e6bc45d65d MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Git bisection shows that commit e6bc45d65df8599fdbae73be9cec4ceed274db53 causes BUG_ONs under high I/O load: kernel BUG at fs/inode.c:1368! [ 2862.501007] Call Trace: [ 2862.501007] [] d_kill+0xf8/0x140 [ 2862.501007] [] dput+0xc9/0x190 [ 2862.501007] [] fput+0x15f/0x210 [ 2862.501007] [] filp_close+0x61/0x90 [ 2862.501007] [] sys_close+0xb1/0x110 [ 2862.501007] [] system_call_fastpath+0x16/0x1b A reliable way to reproduce this bug is: Login to KDE, run 'rsnapshot sync', and apt-get install openjdk-6-jdk, and apt-get remove openjdk-6-jdk. The buggy part of the patch is this: struct inode *inode = NULL; ..... - if (nd.last.name[nd.last.len]) - goto slashes; inode = dentry->d_inode; - if (inode) - ihold(inode); + if (nd.last.name[nd.last.len] || !inode) + goto slashes; + ihold(inode) ... if (inode) iput(inode); /* truncate the inode here */ If nd.last.name[nd.last.len] is nonzero (and thus goto slashes branch is taken), and dentry->d_inode is non-NULL, then this code now does an additional iput on the inode, which is wrong. Fix this by only setting the inode variable if nd.last.name[nd.last.len] is 0. Reference: https://lkml.org/lkml/2011/6/15/50 Reported-by: Norbert Preining Reported-by: TΓΆrΓΆk Edwin Cc: "Theodore Ts'o" Cc: Al Viro Signed-off-by: TΓΆrΓΆk Edwin Signed-off-by: Al Viro --- fs/namei.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 9802345df5e7..6301963b161f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2713,8 +2713,10 @@ static long do_unlinkat(int dfd, const char __user *pathname) error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ + if (nd.last.name[nd.last.len]) + goto slashes; inode = dentry->d_inode; - if (nd.last.name[nd.last.len] || !inode) + if (!inode) goto slashes; ihold(inode); error = mnt_want_write(nd.path.mnt); -- cgit v1.2.3 From 8aef18845266f5c05904c610088f2d1ed58f6be3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 16 Jun 2011 15:10:06 +0100 Subject: VFS: Fix vfsmount overput on simultaneous automount [Kudos to dhowells for tracking that crap down] If two processes attempt to cause automounting on the same mountpoint at the same time, the vfsmount holding the mountpoint will be left with one too few references on it, causing a BUG when the kernel tries to clean up. The problem is that lock_mount() drops the caller's reference to the mountpoint's vfsmount in the case where it finds something already mounted on the mountpoint as it transits to the mounted filesystem and replaces path->mnt with the new mountpoint vfsmount. During a pathwalk, however, we don't take a reference on the vfsmount if it is the same as the one in the nameidata struct, but do_add_mount() doesn't know this. The fix is to make sure we have a ref on the vfsmount of the mountpoint before calling do_add_mount(). However, if lock_mount() doesn't transit, we're then left with an extra ref on the mountpoint vfsmount which needs releasing. We can handle that in follow_managed() by not making assumptions about what we can and what we cannot get from lookup_mnt() as the current code does. The callers of follow_managed() expect that reference to path->mnt will be grabbed iff path->mnt has been changed. follow_managed() and follow_automount() keep track of whether such reference has been grabbed and assume that it'll happen in those and only those cases that'll have us return with changed path->mnt. That assumption is almost correct - it breaks in case of racing automounts and in even harder to hit race between following a mountpoint and a couple of mount --move. The thing is, we don't need to make that assumption at all - after the end of loop in follow_manage() we can check if path->mnt has ended up unchanged and do mntput() if needed. The BUG can be reproduced with the following test program: #include #include #include #include #include int main(int argc, char **argv) { int pid, ws; struct stat buf; pid = fork(); stat(argv[1], &buf); if (pid > 0) wait(&ws); return 0; } and the following procedure: (1) Mount an NFS volume that on the server has something else mounted on a subdirectory. For instance, I can mount / from my server: mount warthog:/ /mnt -t nfs4 -r On the server /data has another filesystem mounted on it, so NFS will see a change in FSID as it walks down the path, and will mark /mnt/data as being a mountpoint. This will cause the automount code to be triggered. !!! Do not look inside the mounted fs at this point !!! (2) Run the above program on a file within the submount to generate two simultaneous automount requests: /tmp/forkstat /mnt/data/testfile (3) Unmount the automounted submount: umount /mnt/data (4) Unmount the original mount: umount /mnt At this point the kernel should throw a BUG with something like the following: BUG: Dentry ffff880032e3c5c0{i=2,n=} still in use (1) [unmount of nfs4 0:12] Note that the bug appears on the root dentry of the original mount, not the mountpoint and not the submount because sys_umount() hasn't got to its final mntput_no_expire() yet, but this isn't so obvious from the call trace: [] shrink_dcache_for_umount+0x69/0x82 [] generic_shutdown_super+0x37/0x15b [] ? nfs_super_return_all_delegations+0x2e/0x1b1 [nfs] [] kill_anon_super+0x1d/0x7e [] nfs4_kill_super+0x60/0xb6 [nfs] [] deactivate_locked_super+0x34/0x83 [] deactivate_super+0x6f/0x7b [] mntput_no_expire+0x18d/0x199 [] mntput+0x3b/0x44 [] release_mounts+0xa2/0xbf [] sys_umount+0x47a/0x4ba [] ? trace_hardirqs_on_caller+0x1fd/0x22f [] system_call_fastpath+0x16/0x1b as do_umount() is inlined. However, you can see release_mounts() in there. Note also that it may be necessary to have multiple CPU cores to be able to trigger this bug. Tested-by: Jeff Layton Tested-by: Ian Kent Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/namei.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 6301963b161f..9e425e7e6c8f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -812,6 +812,11 @@ static int follow_automount(struct path *path, unsigned flags, if (!mnt) /* mount collision */ return 0; + if (!*need_mntput) { + /* lock_mount() may release path->mnt on error */ + mntget(path->mnt); + *need_mntput = true; + } err = finish_automount(mnt, path); switch (err) { @@ -819,12 +824,9 @@ static int follow_automount(struct path *path, unsigned flags, /* Someone else made a mount here whilst we were busy */ return 0; case 0: - dput(path->dentry); - if (*need_mntput) - mntput(path->mnt); + path_put(path); path->mnt = mnt; path->dentry = dget(mnt->mnt_root); - *need_mntput = true; return 0; default: return err; @@ -844,9 +846,10 @@ static int follow_automount(struct path *path, unsigned flags, */ static int follow_managed(struct path *path, unsigned flags) { + struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ unsigned managed; bool need_mntput = false; - int ret; + int ret = 0; /* Given that we're not holding a lock here, we retain the value in a * local variable for each dentry as we look at it so that we don't see @@ -861,7 +864,7 @@ static int follow_managed(struct path *path, unsigned flags) BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage(path->dentry, false); if (ret < 0) - return ret == -EISDIR ? 0 : ret; + break; } /* Transit to a mounted filesystem. */ @@ -887,14 +890,19 @@ static int follow_managed(struct path *path, unsigned flags) if (managed & DCACHE_NEED_AUTOMOUNT) { ret = follow_automount(path, flags, &need_mntput); if (ret < 0) - return ret == -EISDIR ? 0 : ret; + break; continue; } /* We didn't change the current path point */ break; } - return 0; + + if (need_mntput && path->mnt == mnt) + mntput(path->mnt); + if (ret == -EISDIR) + ret = 0; + return ret; } int follow_down_one(struct path *path) -- cgit v1.2.3 From 5e7f23373bf9a853e9256e81e86724cdd0a33c29 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 13 Jun 2011 22:31:12 +0100 Subject: afs: afs_fill_page reads too much, or wrong data afs_fill_page should read the page that is about to be written but the current implementation has a number of issues. If we aren't extending the file we always read PAGE_CACHE_SIZE at offset 0. If we are extending the file we try to read the entire file. Change afs_fill_page to read PAGE_CACHE_SIZE at the right offset, clamped to i_size. While here, avoid calling afs_fill_page when we are doing a PAGE_CACHE_SIZE write. Signed-off-by: Anton Blanchard Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/afs/write.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/afs/write.c b/fs/afs/write.c index 789b3afb3423..b806285ff853 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -84,23 +84,21 @@ void afs_put_writeback(struct afs_writeback *wb) * partly or wholly fill a page that's under preparation for writing */ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, - loff_t pos, unsigned len, struct page *page) + loff_t pos, struct page *page) { loff_t i_size; - unsigned eof; int ret; + int len; - _enter(",,%llu,%u", (unsigned long long)pos, len); - - ASSERTCMP(len, <=, PAGE_CACHE_SIZE); + _enter(",,%llu", (unsigned long long)pos); i_size = i_size_read(&vnode->vfs_inode); - if (pos + len > i_size) - eof = i_size; + if (pos + PAGE_CACHE_SIZE > i_size) + len = i_size - pos; else - eof = PAGE_CACHE_SIZE; + len = PAGE_CACHE_SIZE; - ret = afs_vnode_fetch_data(vnode, key, 0, eof, page); + ret = afs_vnode_fetch_data(vnode, key, pos, len, page); if (ret < 0) { if (ret == -ENOENT) { _debug("got NOENT from server" @@ -153,9 +151,8 @@ int afs_write_begin(struct file *file, struct address_space *mapping, *pagep = page; /* page won't leak in error case: it eventually gets cleaned off LRU */ - if (!PageUptodate(page)) { - _debug("not up to date"); - ret = afs_fill_page(vnode, key, pos, len, page); + if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) { + ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page); if (ret < 0) { kfree(candidate); _leave(" = %d [prep]", ret); -- cgit v1.2.3 From f9f07b6c1372b1436aa6b45333445b443ffd8c95 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 14 Jun 2011 00:58:27 +0200 Subject: vfs: Fix data corruption after failed write in __block_write_begin() I've got a report of a file corruption from fsxlinux on ext3. The important operations to the page were: mapwrite to a hole partial write to the page read - found the page zeroed from the end of the normal write The culprit seems to be that if get_block() fails in __block_write_begin() (e.g. transient ENOSPC in ext3), the function does ClearPageUptodate(page). Thus when we retry the write, the logic in __block_write_begin() thinks zeroing of the page is needed and overwrites old data. In fact, I don't see why we should ever need to zero the uptodate bit here - either the page was uptodate when we entered __block_write_begin() and it should stay so when we leave it, or it was not uptodate and noone had right to set it uptodate during __block_write_begin() so it remains !uptodate when we leave as well. So just remove clearing of the bit. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/buffer.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 49c9aada0374..1a80b048ade8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1902,10 +1902,8 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, if (!buffer_uptodate(*wait_bh)) err = -EIO; } - if (unlikely(err)) { + if (unlikely(err)) page_zero_new_buffers(page, from, to); - ClearPageUptodate(page); - } return err; } EXPORT_SYMBOL(__block_write_begin); -- cgit v1.2.3 From 2e41ae225f742ded5b7d9847cd8bd605f27daba8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jun 2011 00:38:44 +0100 Subject: AFS: Set s_id in the superblock to the volume name Set s_id in the superblock to the name of the AFS volume that this superblock corresponds to. Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/afs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/afs/super.c b/fs/afs/super.c index b7d48d7eaa1e..356dcf0929e8 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -313,6 +313,7 @@ static int afs_fill_super(struct super_block *sb, sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; sb->s_bdi = &as->volume->bdi; + strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id)); /* allocate the root inode and dentry */ fid.vid = as->volume->vid; -- cgit v1.2.3 From d6e43f751f252c68ca69fa6d18665d88d69ef8b7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 14 Jun 2011 00:45:44 +0100 Subject: AFS: Use i_generation not i_version for the vnode uniquifier Store the AFS vnode uniquifier in the i_generation field, not the i_version field of the inode struct. i_version can then be given the AFS data version number. Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/afs/dir.c | 8 ++++---- fs/afs/fsclient.c | 3 ++- fs/afs/inode.c | 10 +++++----- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 20c106f24927..1b0b19550015 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -584,11 +584,11 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, success: d_add(dentry, inode); - _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%llu }", + _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }", fid.vnode, fid.unique, dentry->d_inode->i_ino, - (unsigned long long)dentry->d_inode->i_version); + dentry->d_inode->i_generation); return NULL; } @@ -671,10 +671,10 @@ static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) * been deleted and replaced, and the original vnode ID has * been reused */ if (fid.unique != vnode->fid.unique) { - _debug("%s: file deleted (uq %u -> %u I:%llu)", + _debug("%s: file deleted (uq %u -> %u I:%u)", dentry->d_name.name, fid.unique, vnode->fid.unique, - (unsigned long long)dentry->d_inode->i_version); + dentry->d_inode->i_generation); spin_lock(&vnode->lock); set_bit(AFS_VNODE_DELETED, &vnode->flags); spin_unlock(&vnode->lock); diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 4bd0218473a9..346e3289abd7 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -89,7 +89,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, i_size_write(&vnode->vfs_inode, size); vnode->vfs_inode.i_uid = status->owner; vnode->vfs_inode.i_gid = status->group; - vnode->vfs_inode.i_version = vnode->fid.unique; + vnode->vfs_inode.i_generation = vnode->fid.unique; vnode->vfs_inode.i_nlink = status->nlink; mode = vnode->vfs_inode.i_mode; @@ -102,6 +102,7 @@ static void xdr_decode_AFSFetchStatus(const __be32 **_bp, vnode->vfs_inode.i_ctime.tv_sec = status->mtime_server; vnode->vfs_inode.i_mtime = vnode->vfs_inode.i_ctime; vnode->vfs_inode.i_atime = vnode->vfs_inode.i_ctime; + vnode->vfs_inode.i_version = data_version; } expected_version = status->data_version; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index db66c5201474..0fdab6e03d87 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -75,7 +75,8 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) inode->i_ctime.tv_nsec = 0; inode->i_atime = inode->i_mtime = inode->i_ctime; inode->i_blocks = 0; - inode->i_version = vnode->fid.unique; + inode->i_generation = vnode->fid.unique; + inode->i_version = vnode->status.data_version; inode->i_mapping->a_ops = &afs_fs_aops; /* check to see whether a symbolic link is really a mountpoint */ @@ -100,7 +101,7 @@ static int afs_iget5_test(struct inode *inode, void *opaque) struct afs_iget_data *data = opaque; return inode->i_ino == data->fid.vnode && - inode->i_version == data->fid.unique; + inode->i_generation == data->fid.unique; } /* @@ -122,7 +123,7 @@ static int afs_iget5_set(struct inode *inode, void *opaque) struct afs_vnode *vnode = AFS_FS_I(inode); inode->i_ino = data->fid.vnode; - inode->i_version = data->fid.unique; + inode->i_generation = data->fid.unique; vnode->fid = data->fid; vnode->volume = data->volume; @@ -380,8 +381,7 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry, inode = dentry->d_inode; - _enter("{ ino=%lu v=%llu }", inode->i_ino, - (unsigned long long)inode->i_version); + _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); generic_fillattr(inode, stat); return 0; -- cgit v1.2.3 From a27a263bae072a499acc77b632238a6dacccf888 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 16 Jun 2011 12:02:23 +0000 Subject: xfs: make log devices with write back caches work There's no reason not to support cache flushing on external log devices. The only thing this really requires is flushing the data device first both in fsync and log commits. A side effect is that we also have to remove the barrier write test during mount, which has been superflous since the new FLUSH+FUA code anyway. Also use the chance to flush the RT subvolume write cache before the fsync commit, which is required for correct semantics. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_file.c | 50 ++++++++++++++++++----------- fs/xfs/linux-2.6/xfs_super.c | 75 -------------------------------------------- fs/xfs/xfs_log.c | 11 ++++++- 3 files changed, 41 insertions(+), 95 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index f4213ba1ff85..7f782af286bf 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -131,19 +131,34 @@ xfs_file_fsync( { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error = 0; int log_flushed = 0; trace_xfs_file_fsync(ip); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); xfs_iflags_clear(ip, XFS_ITRUNCATED); xfs_ioend_wait(ip); + if (mp->m_flags & XFS_MOUNT_BARRIER) { + /* + * If we have an RT and/or log subvolume we need to make sure + * to flush the write cache the device used for file data + * first. This is to ensure newly written file data make + * it to disk before logging the new inode size in case of + * an extending write. + */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(mp->m_rtdev_targp); + else if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_blkdev_issue_flush(mp->m_ddev_targp); + } + /* * We always need to make sure that the required inode state is safe on * disk. The inode might be clean but we still might need to force the @@ -175,9 +190,9 @@ xfs_file_fsync( * updates. The sync transaction will also force the log. */ xfs_iunlock(ip, XFS_ILOCK_SHARED); - tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); error = xfs_trans_reserve(tp, 0, - XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0); + XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); if (error) { xfs_trans_cancel(tp, 0); return -error; @@ -209,28 +224,25 @@ xfs_file_fsync( * force the log. */ if (xfs_ipincount(ip)) { - error = _xfs_log_force_lsn(ip->i_mount, + error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn, XFS_LOG_SYNC, &log_flushed); } xfs_iunlock(ip, XFS_ILOCK_SHARED); } - if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) { - /* - * If the log write didn't issue an ordered tag we need - * to flush the disk cache for the data device now. - */ - if (!log_flushed) - xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); - - /* - * If this inode is on the RT dev we need to flush that - * cache as well. - */ - if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); - } + /* + * If we only have a single device, and the log force about was + * a no-op we might have to flush the data device cache here. + * This can only happen for fdatasync/O_DSYNC if we were overwriting + * an already allocated file and thus do not have any metadata to + * commit. + */ + if ((mp->m_flags & XFS_MOUNT_BARRIER) && + mp->m_logdev_targp == mp->m_ddev_targp && + !XFS_IS_REALTIME_INODE(ip) && + !log_flushed) + xfs_blkdev_issue_flush(mp->m_ddev_targp); return -error; } diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 1e3a7ce804dc..a1a881e68a9a 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -627,68 +627,6 @@ xfs_blkdev_put( blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } -/* - * Try to write out the superblock using barriers. - */ -STATIC int -xfs_barrier_test( - xfs_mount_t *mp) -{ - xfs_buf_t *sbp = xfs_getsb(mp, 0); - int error; - - XFS_BUF_UNDONE(sbp); - XFS_BUF_UNREAD(sbp); - XFS_BUF_UNDELAYWRITE(sbp); - XFS_BUF_WRITE(sbp); - XFS_BUF_UNASYNC(sbp); - XFS_BUF_ORDERED(sbp); - - xfsbdstrat(mp, sbp); - error = xfs_buf_iowait(sbp); - - /* - * Clear all the flags we set and possible error state in the - * buffer. We only did the write to try out whether barriers - * worked and shouldn't leave any traces in the superblock - * buffer. - */ - XFS_BUF_DONE(sbp); - XFS_BUF_ERROR(sbp, 0); - XFS_BUF_UNORDERED(sbp); - - xfs_buf_relse(sbp); - return error; -} - -STATIC void -xfs_mountfs_check_barriers(xfs_mount_t *mp) -{ - int error; - - if (mp->m_logdev_targp != mp->m_ddev_targp) { - xfs_notice(mp, - "Disabling barriers, not supported with external log device"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } - - if (xfs_readonly_buftarg(mp->m_ddev_targp)) { - xfs_notice(mp, - "Disabling barriers, underlying device is readonly"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } - - error = xfs_barrier_test(mp); - if (error) { - xfs_notice(mp, - "Disabling barriers, trial barrier write failed"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } -} - void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) @@ -1240,14 +1178,6 @@ xfs_fs_remount( switch (token) { case Opt_barrier: mp->m_flags |= XFS_MOUNT_BARRIER; - - /* - * Test if barriers are actually working if we can, - * else delay this check until the filesystem is - * marked writeable. - */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) - xfs_mountfs_check_barriers(mp); break; case Opt_nobarrier: mp->m_flags &= ~XFS_MOUNT_BARRIER; @@ -1282,8 +1212,6 @@ xfs_fs_remount( /* ro -> rw */ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { mp->m_flags &= ~XFS_MOUNT_RDONLY; - if (mp->m_flags & XFS_MOUNT_BARRIER) - xfs_mountfs_check_barriers(mp); /* * If this is the first remount to writeable state we @@ -1465,9 +1393,6 @@ xfs_fs_fill_super( if (error) goto out_free_sb; - if (mp->m_flags & XFS_MOUNT_BARRIER) - xfs_mountfs_check_barriers(mp); - error = xfs_filestream_mount(mp); if (error) goto out_free_sb; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 211930246f20..41d5b8f2bf92 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1372,8 +1372,17 @@ xlog_sync(xlog_t *log, XFS_BUF_ASYNC(bp); bp->b_flags |= XBF_LOG_BUFFER; - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { + /* + * If we have an external log device, flush the data device + * before flushing the log to make sure all meta data + * written back from the AIL actually made it to disk + * before writing out the new log tail LSN in the log buffer. + */ + if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); XFS_BUF_ORDERED(bp); + } ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); -- cgit v1.2.3 From 879669961b11e7f40b518784863a259f735a72bf Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 17 Jun 2011 11:25:59 +0100 Subject: KEYS/DNS: Fix ____call_usermodehelper() to not lose the session keyring ____call_usermodehelper() now erases any credentials set by the subprocess_inf::init() function. The problem is that commit 17f60a7da150 ("capabilites: allow the application of capability limits to usermode helpers") creates and commits new credentials with prepare_kernel_cred() after the call to the init() function. This wipes all keyrings after umh_keys_init() is called. The best way to deal with this is to put the init() call just prior to the commit_creds() call, and pass the cred pointer to init(). That means that umh_keys_init() and suchlike can modify the credentials _before_ they are published and potentially in use by the rest of the system. This prevents request_key() from working as it is prevented from passing the session keyring it set up with the authorisation token to /sbin/request-key, and so the latter can't assume the authority to instantiate the key. This causes the in-kernel DNS resolver to fail with ENOKEY unconditionally. Signed-off-by: David Howells Acked-by: Eric Paris Tested-by: Jeff Layton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/kmod.h | 8 ++++---- kernel/kmod.c | 16 +++++++++------- security/keys/request_key.c | 3 +-- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 97e0d52d72fd..6075a1e727ae 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1996,7 +1996,7 @@ static void wait_for_dump_helpers(struct file *file) * is a special value that we use to trap recursive * core dumps */ -static int umh_pipe_setup(struct subprocess_info *info) +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { struct file *rp, *wp; struct fdtable *fdt; diff --git a/include/linux/kmod.h b/include/linux/kmod.h index d4a5c84c503d..0da38cf7db7b 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -45,7 +45,7 @@ static inline int request_module_nowait(const char *name, ...) { return -ENOSYS; #endif -struct key; +struct cred; struct file; enum umh_wait { @@ -62,7 +62,7 @@ struct subprocess_info { char **envp; enum umh_wait wait; int retval; - int (*init)(struct subprocess_info *info); + int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); void *data; }; @@ -73,7 +73,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, /* Set various pieces of state into the subprocess_info structure */ void call_usermodehelper_setfns(struct subprocess_info *info, - int (*init)(struct subprocess_info *info), + int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), void *data); @@ -87,7 +87,7 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info); static inline int call_usermodehelper_fns(char *path, char **argv, char **envp, enum umh_wait wait, - int (*init)(struct subprocess_info *info), + int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data) { struct subprocess_info *info; diff --git a/kernel/kmod.c b/kernel/kmod.c index ad6a81c58b44..47613dfb7b28 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -156,12 +156,6 @@ static int ____call_usermodehelper(void *data) */ set_user_nice(current, 0); - if (sub_info->init) { - retval = sub_info->init(sub_info); - if (retval) - goto fail; - } - retval = -ENOMEM; new = prepare_kernel_cred(current); if (!new) @@ -173,6 +167,14 @@ static int ____call_usermodehelper(void *data) new->cap_inheritable); spin_unlock(&umh_sysctl_lock); + if (sub_info->init) { + retval = sub_info->init(sub_info, new); + if (retval) { + abort_creds(new); + goto fail; + } + } + commit_creds(new); retval = kernel_execve(sub_info->path, @@ -388,7 +390,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); * context in which call_usermodehelper_exec is called. */ void call_usermodehelper_setfns(struct subprocess_info *info, - int (*init)(struct subprocess_info *info), + int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *info), void *data) { diff --git a/security/keys/request_key.c b/security/keys/request_key.c index d31862e0aa1c..8e319a416eec 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -71,9 +71,8 @@ EXPORT_SYMBOL(complete_request_key); * This is called in context of freshly forked kthread before kernel_execve(), * so we can simply install the desired session_keyring at this point. */ -static int umh_keys_init(struct subprocess_info *info) +static int umh_keys_init(struct subprocess_info *info, struct cred *cred) { - struct cred *cred = (struct cred*)current_cred(); struct key *keyring = info->data; return install_session_keyring_to_cred(cred, keyring); -- cgit v1.2.3 From 7585717f304f5ed005cc4ad933a69aab3efbd136 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 13 Jun 2011 20:00:16 -0400 Subject: Btrfs: fix relocation races The recent commit to get rid of our trans_mutex introduced some races with block group relocation. The problem is that relocation needs to do some record keeping about each root, and it was relying on the transaction mutex to coordinate things in subtle ways. This fix adds a mutex just for the relocation code and makes sure it doesn't have a big impact on normal operations. The race is really fixed in btrfs_record_root_in_trans, which is where we step back and wait for the relocation code to finish accounting setup. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 14 ++++++++++ fs/btrfs/disk-io.c | 1 + fs/btrfs/relocation.c | 30 ++++++++++++++------- fs/btrfs/transaction.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 105 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8490ee063709..a2c91a102b72 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -967,6 +967,12 @@ struct btrfs_fs_info { struct srcu_struct subvol_srcu; spinlock_t trans_lock; + /* + * the reloc mutex goes with the trans lock, it is taken + * during commit to protect us from the relocation code + */ + struct mutex reloc_mutex; + struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; @@ -1172,6 +1178,14 @@ struct btrfs_root { u32 type; u64 highest_objectid; + + /* btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code. But the + * race is very small, and only the first time the root + * is added to each transaction. So in_trans_setup + * is used to tell us when more checks are required + */ + unsigned long in_trans_setup; int ref_cows; int track_dirty; int in_radix; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 20c111b3fa0d..0b2b4b759136 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1620,6 +1620,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); + mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); fs_info->tree_root = tree_root; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f25b10a22a0a..086b1e6b8614 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1368,7 +1368,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, int ret; if (!root->reloc_root) - return 0; + goto out; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -1390,6 +1390,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, root->fs_info->tree_root, &reloc_root->root_key, root_item); BUG_ON(ret); + +out: return 0; } @@ -2142,10 +2144,11 @@ int prepare_to_merge(struct reloc_control *rc, int err) u64 num_bytes = 0; int ret; - spin_lock(&root->fs_info->trans_lock); + mutex_lock(&root->fs_info->reloc_mutex); rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; rc->merging_rsv_size += rc->nodes_relocated * 2; - spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); + again: if (!err) { num_bytes = rc->merging_rsv_size; @@ -2214,9 +2217,16 @@ int merge_reloc_roots(struct reloc_control *rc) int ret; again: root = rc->extent_root; - spin_lock(&root->fs_info->trans_lock); + + /* + * this serializes us with btrfs_record_root_in_transaction, + * we have to make sure nobody is in the middle of + * adding their roots to the list while we are + * doing this splice + */ + mutex_lock(&root->fs_info->reloc_mutex); list_splice_init(&rc->reloc_roots, &reloc_roots); - spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); while (!list_empty(&reloc_roots)) { found = 1; @@ -3590,17 +3600,19 @@ next: static void set_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - spin_lock(&fs_info->trans_lock); + + mutex_lock(&fs_info->reloc_mutex); fs_info->reloc_ctl = rc; - spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->reloc_mutex); } static void unset_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - spin_lock(&fs_info->trans_lock); + + mutex_lock(&fs_info->reloc_mutex); fs_info->reloc_ctl = NULL; - spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->reloc_mutex); } static int check_extent_flags(u64 flags) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2b3590b9fe98..833996a0c628 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -126,28 +126,85 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) * to make sure the old root from before we joined the transaction is deleted * when the transaction commits */ -int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, +static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (root->ref_cows && root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); + /* + * see below for in_trans_setup usage rules + * we have the reloc mutex held now, so there + * is only one writer in this function + */ + root->in_trans_setup = 1; + + /* make sure readers find in_trans_setup before + * they find our root->last_trans update + */ + smp_wmb(); + spin_lock(&root->fs_info->fs_roots_radix_lock); if (root->last_trans == trans->transid) { spin_unlock(&root->fs_info->fs_roots_radix_lock); return 0; } - root->last_trans = trans->transid; radix_tree_tag_set(&root->fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); spin_unlock(&root->fs_info->fs_roots_radix_lock); + root->last_trans = trans->transid; + + /* this is pretty tricky. We don't want to + * take the relocation lock in btrfs_record_root_in_trans + * unless we're really doing the first setup for this root in + * this transaction. + * + * Normally we'd use root->last_trans as a flag to decide + * if we want to take the expensive mutex. + * + * But, we have to set root->last_trans before we + * init the relocation root, otherwise, we trip over warnings + * in ctree.c. The solution used here is to flag ourselves + * with root->in_trans_setup. When this is 1, we're still + * fixing up the reloc trees and everyone must wait. + * + * When this is zero, they can trust root->last_trans and fly + * through btrfs_record_root_in_trans without having to take the + * lock. smp_wmb() makes sure that all the writes above are + * done before we pop in the zero below + */ btrfs_init_reloc_root(trans, root); + smp_wmb(); + root->in_trans_setup = 0; } return 0; } + +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!root->ref_cows) + return 0; + + /* + * see record_root_in_trans for comments about in_trans_setup usage + * and barriers + */ + smp_rmb(); + if (root->last_trans == trans->transid && + !root->in_trans_setup) + return 0; + + mutex_lock(&root->fs_info->reloc_mutex); + record_root_in_trans(trans, root); + mutex_unlock(&root->fs_info->reloc_mutex); + + return 0; +} + /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. @@ -882,7 +939,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent = dget_parent(dentry); parent_inode = parent->d_inode; parent_root = BTRFS_I(parent_inode)->root; - btrfs_record_root_in_trans(trans, parent_root); + record_root_in_trans(trans, parent_root); /* * insert the directory item @@ -900,7 +957,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); - btrfs_record_root_in_trans(trans, root); + record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); @@ -1247,6 +1304,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); + /* + * the reloc mutex makes sure that we stop + * the balancing code from coming in and moving + * extents around in the middle of the commit + */ + mutex_lock(&root->fs_info->reloc_mutex); + ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); @@ -1312,6 +1376,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, root->fs_info->running_transaction = NULL; root->fs_info->trans_no_join = 0; spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); wake_up(&root->fs_info->transaction_wait); -- cgit v1.2.3 From 3ed4498caf381a73d6259d3ffacc914b17a507ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 13 Jun 2011 17:54:22 +0000 Subject: btrfs: fix dereference of ERR_PTR value smatch reports: btrfs_recover_log_trees error: 'wc.replay_dest' dereferencing possible ERR_PTR() Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 592396c6dc47..4ce8a9f41d1e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3177,7 +3177,7 @@ again: tmp_key.offset = (u64)-1; wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); - BUG_ON(!wc.replay_dest); + BUG_ON(IS_ERR_OR_NULL(wc.replay_dest)); wc.replay_dest->log_root = log; btrfs_record_root_in_trans(trans, wc.replay_dest); -- cgit v1.2.3 From 9fe6a50fb764f508dd2de47a66e62e51388791fb Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Thu, 16 Jun 2011 09:04:57 +0000 Subject: btrfs: Remove unused sysfs code Removes code no longer used. The sysfs file itself is kept, because the btrfs developers expressed interest in putting new entries to sysfs. Signed-off-by: Maarten Lankhorst Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 - fs/btrfs/disk-io.c | 1 - fs/btrfs/sysfs.c | 146 ----------------------------------------------------- 3 files changed, 148 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a2c91a102b72..8e948ec1ee6b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1195,7 +1195,6 @@ struct btrfs_root { struct btrfs_key defrag_max; int defrag_running; char *name; - int in_sysfs; /* the dirty list is only used by non-reference counted roots */ struct list_head dirty_list; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0b2b4b759136..c25ef5a0ccd6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1044,7 +1044,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->last_trans = 0; root->highest_objectid = 0; root->name = NULL; - root->in_sysfs = 0; root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); root->block_rsv = NULL; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c3c223ae6691..daac9ae6d731 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -28,152 +28,6 @@ #include "disk-io.h" #include "transaction.h" -static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_root_used(&root->root_item)); -} - -static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_root_limit(&root->root_item)); -} - -static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf) -{ - - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_bytes_used(&fs->super_copy)); -} - -static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_total_bytes(&fs->super_copy)); -} - -static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_sectorsize(&fs->super_copy)); -} - -/* this is for root attrs (subvols/snapshots) */ -struct btrfs_root_attr { - struct attribute attr; - ssize_t (*show)(struct btrfs_root *, char *); - ssize_t (*store)(struct btrfs_root *, const char *, size_t); -}; - -#define ROOT_ATTR(name, mode, show, store) \ -static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \ - show, store) - -ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL); -ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL); - -static struct attribute *btrfs_root_attrs[] = { - &btrfs_root_attr_blocks_used.attr, - &btrfs_root_attr_block_limit.attr, - NULL, -}; - -/* this is for super attrs (actual full fs) */ -struct btrfs_super_attr { - struct attribute attr; - ssize_t (*show)(struct btrfs_fs_info *, char *); - ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t); -}; - -#define SUPER_ATTR(name, mode, show, store) \ -static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \ - show, store) - -SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL); -SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL); -SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL); - -static struct attribute *btrfs_super_attrs[] = { - &btrfs_super_attr_blocks_used.attr, - &btrfs_super_attr_total_blocks.attr, - &btrfs_super_attr_blocksize.attr, - NULL, -}; - -static ssize_t btrfs_super_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - struct btrfs_super_attr *a = container_of(attr, - struct btrfs_super_attr, - attr); - - return a->show ? a->show(fs, buf) : 0; -} - -static ssize_t btrfs_super_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - struct btrfs_super_attr *a = container_of(attr, - struct btrfs_super_attr, - attr); - - return a->store ? a->store(fs, buf, len) : 0; -} - -static ssize_t btrfs_root_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - struct btrfs_root_attr *a = container_of(attr, - struct btrfs_root_attr, - attr); - - return a->show ? a->show(root, buf) : 0; -} - -static ssize_t btrfs_root_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - struct btrfs_root_attr *a = container_of(attr, - struct btrfs_root_attr, - attr); - return a->store ? a->store(root, buf, len) : 0; -} - -static void btrfs_super_release(struct kobject *kobj) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - complete(&fs->kobj_unregister); -} - -static void btrfs_root_release(struct kobject *kobj) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - complete(&root->kobj_unregister); -} - -static const struct sysfs_ops btrfs_super_attr_ops = { - .show = btrfs_super_attr_show, - .store = btrfs_super_attr_store, -}; - -static const struct sysfs_ops btrfs_root_attr_ops = { - .show = btrfs_root_attr_show, - .store = btrfs_root_attr_store, -}; - /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; -- cgit v1.2.3 From 19fd294957e426bfdd8e19085096467ec18df5c4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Jun 2011 10:47:30 +0000 Subject: btrfs: fix wrong reservation when doing delayed inode operations We have migrated the space for the delayed inode items from trans_block_rsv to global_block_rsv, but we forgot to set trans->block_rsv to global_block_rsv when we doing delayed inode operations, and the following Oops happened: [ 9792.654889] ------------[ cut here ]------------ [ 9792.654898] WARNING: at fs/btrfs/extent-tree.c:5681 btrfs_alloc_free_block+0xca/0x27c [btrfs]() [ 9792.654899] Hardware name: To Be Filled By O.E.M. [ 9792.654900] Modules linked in: btrfs zlib_deflate libcrc32c ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6_tables arc4 rt61pci rt2x00pci rt2x00lib snd_hda_codec_hdmi mac80211 snd_hda_codec_realtek cfg80211 snd_hda_intel edac_core snd_seq rfkill pcspkr serio_raw snd_hda_codec eeprom_93cx6 edac_mce_amd sp5100_tco i2c_piix4 k10temp snd_hwdep snd_seq_device snd_pcm floppy r8169 xhci_hcd mii snd_timer snd soundcore snd_page_alloc ipv6 firewire_ohci pata_acpi ata_generic firewire_core pata_via crc_itu_t radeon ttm drm_kms_helper drm i2c_algo_bit i2c_core [last unloaded: scsi_wait_scan] [ 9792.654919] Pid: 2762, comm: rm Tainted: G W 2.6.39+ #1 [ 9792.654920] Call Trace: [ 9792.654922] [] warn_slowpath_common+0x83/0x9b [ 9792.654925] [] warn_slowpath_null+0x1a/0x1c [ 9792.654933] [] btrfs_alloc_free_block+0xca/0x27c [btrfs] [ 9792.654945] [] ? map_extent_buffer+0x6e/0xa8 [btrfs] [ 9792.654953] [] __btrfs_cow_block+0xfc/0x30c [btrfs] [ 9792.654963] [] ? btrfs_buffer_uptodate+0x47/0x58 [btrfs] [ 9792.654970] [] ? read_block_for_search+0x94/0x368 [btrfs] [ 9792.654978] [] btrfs_cow_block+0xfe/0x146 [btrfs] [ 9792.654986] [] btrfs_search_slot+0x14d/0x4b6 [btrfs] [ 9792.654997] [] ? map_extent_buffer+0x6e/0xa8 [btrfs] [ 9792.655022] [] btrfs_lookup_inode+0x2f/0x8f [btrfs] [ 9792.655025] [] ? _cond_resched+0xe/0x22 [ 9792.655027] [] ? mutex_lock+0x29/0x50 [ 9792.655039] [] btrfs_update_delayed_inode+0x72/0x137 [btrfs] [ 9792.655051] [] btrfs_run_delayed_items+0x90/0xdb [btrfs] [ 9792.655062] [] btrfs_commit_transaction+0x228/0x654 [btrfs] [ 9792.655064] [] ? remove_wait_queue+0x3a/0x3a [ 9792.655075] [] btrfs_evict_inode+0x14d/0x202 [btrfs] [ 9792.655077] [] evict+0x71/0x111 [ 9792.655079] [] iput+0x12a/0x132 [ 9792.655081] [] do_unlinkat+0x106/0x155 [ 9792.655083] [] ? path_put+0x1f/0x23 [ 9792.655085] [] ? audit_syscall_entry+0x145/0x171 [ 9792.655087] [] ? putname+0x34/0x36 [ 9792.655090] [] sys_unlinkat+0x29/0x2b [ 9792.655092] [] system_call_fastpath+0x16/0x1b [ 9792.655093] ---[ end trace 02b696eb02b3f768 ]--- This patch fix it by setting the reservation of the transaction handle to the correct one. Reported-by: Josef Bacik Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 25 ++++++++++++++++++++----- fs/btrfs/delayed-inode.h | 1 - 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6462c29d2d37..fc515b787e8c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -297,7 +297,6 @@ struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) item->data_len = data_len; item->ins_or_del = 0; item->bytes_reserved = 0; - item->block_rsv = NULL; item->delayed_node = NULL; atomic_set(&item->refs, 1); } @@ -593,10 +592,8 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) { + if (!ret) item->bytes_reserved = num_bytes; - item->block_rsv = dst_rsv; - } return ret; } @@ -604,10 +601,13 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, struct btrfs_delayed_item *item) { + struct btrfs_block_rsv *rsv; + if (!item->bytes_reserved) return; - btrfs_block_rsv_release(root, item->block_rsv, + rsv = &root->fs_info->global_block_rsv; + btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -1014,6 +1014,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret = 0; path = btrfs_alloc_path(); @@ -1021,6 +1022,9 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + delayed_root = btrfs_get_delayed_root(root); curr_node = btrfs_first_delayed_node(delayed_root); @@ -1045,6 +1049,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, } btrfs_free_path(path); + trans->block_rsv = block_rsv; return ret; } @@ -1052,6 +1057,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_node *node) { struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret; path = btrfs_alloc_path(); @@ -1059,6 +1065,9 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + block_rsv = trans->block_rsv; + trans->block_rsv = &node->root->fs_info->global_block_rsv; + ret = btrfs_insert_delayed_items(trans, path, node->root, node); if (!ret) ret = btrfs_delete_delayed_items(trans, path, node->root, node); @@ -1066,6 +1075,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = btrfs_update_delayed_inode(trans, node->root, path, node); btrfs_free_path(path); + trans->block_rsv = block_rsv; return ret; } @@ -1116,6 +1126,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; unsigned long nr = 0; int need_requeue = 0; int ret; @@ -1134,6 +1145,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) if (IS_ERR(trans)) goto free_path; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); if (!ret) ret = btrfs_delete_delayed_items(trans, path, root, @@ -1176,6 +1190,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) nr = trans->blocks_used; + trans->block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); __btrfs_btree_balance_dirty(root, nr); free_path: diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index eb7d240aa648..cb79b6771e82 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -75,7 +75,6 @@ struct btrfs_delayed_item { struct list_head tree_list; /* used for batch insert/delete items */ struct list_head readdir_list; /* used for readdir items */ u64 bytes_reserved; - struct btrfs_block_rsv *block_rsv; struct btrfs_delayed_node *delayed_node; atomic_t refs; int ins_or_del; -- cgit v1.2.3 From 35a30d7ce54e087d8025a725d4e5a2fdee723a9f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 13 Jun 2011 15:18:23 +0000 Subject: btrfs: fix uninitialized return value When allocation fails in btrfs_read_fs_root_no_name, ret is not set although it is returned, holding a garbage value. Signed-off-by: David Sterba Reviewed-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c25ef5a0ccd6..1ac8db5dc0a3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1299,12 +1299,12 @@ again: return root; root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); - if (!root->free_ino_ctl) - goto fail; root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), GFP_NOFS); - if (!root->free_ino_pinned) + if (!root->free_ino_pinned || !root->free_ino_ctl) { + ret = -ENOMEM; goto fail; + } btrfs_init_free_ino_ctl(root); mutex_init(&root->fs_commit_mutex); -- cgit v1.2.3 From e999376f094162aa425ae749aa1df95ab928d010 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 17 Jun 2011 16:14:09 -0400 Subject: Btrfs: avoid delayed metadata items during commits Snapshot creation has two phases. One is the initial snapshot setup, and the second is done during commit, while nobody is allowed to modify the root we are snapshotting. The delayed metadata insertion code can break that rule, it does a delayed inode update on the inode of the parent of the snapshot, and delayed directory item insertion. This makes sure to run the pending delayed operations before we record the snapshot root, which avoids corruptions. Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 7 +++++++ fs/btrfs/delayed-inode.h | 4 ++++ fs/btrfs/transaction.c | 27 +++++++++++++++++---------- 3 files changed, 28 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index fc515b787e8c..f1cbd028f7b3 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1237,6 +1237,13 @@ again: return 0; } +void btrfs_assert_delayed_root_empty(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + delayed_root = btrfs_get_delayed_root(root); + WARN_ON(btrfs_first_delayed_node(delayed_root)); +} + void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index cb79b6771e82..d1a6a2915c66 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -137,4 +137,8 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, /* for init */ int __init btrfs_delayed_inode_init(void); void btrfs_delayed_inode_exit(void); + +/* for debugging */ +void btrfs_assert_delayed_root_empty(struct btrfs_root *root); + #endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c073d85e14f3..51dcec86757f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -957,6 +957,15 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); + /* + * pull in the delayed directory update + * and the delayed inode item + * otherwise we corrupt the FS during + * snapshot + */ + ret = btrfs_run_delayed_items(trans, root); + BUG_ON(ret); + record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); @@ -1018,14 +1027,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, int ret; list_for_each_entry(pending, head, list) { - /* - * We must deal with the delayed items before creating - * snapshots, or we will create a snapthot with inconsistent - * information. - */ - ret = btrfs_run_delayed_items(trans, fs_info->fs_root); - BUG_ON(ret); - ret = create_pending_snapshot(trans, fs_info, pending); BUG_ON(ret); } @@ -1319,15 +1320,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ mutex_lock(&root->fs_info->reloc_mutex); - ret = create_pending_snapshots(trans, root->fs_info); + ret = btrfs_run_delayed_items(trans, root); BUG_ON(ret); - ret = btrfs_run_delayed_items(trans, root); + ret = create_pending_snapshots(trans, root->fs_info); BUG_ON(ret); ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); BUG_ON(ret); + /* + * make sure none of the code above managed to slip in a + * delayed item + */ + btrfs_assert_delayed_root_empty(root); + WARN_ON(cur_trans != trans->transaction); btrfs_scrub_pause(root); -- cgit v1.2.3 From c11760c6d80ab6aa20e383cf378a7287305f591c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Jun 2011 10:30:03 -0700 Subject: isofs: fix bh leak in isofs_fill_super() error case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In isofs_fill_super(), when an iso_primary_descriptor is found, it is kept in pri_bh. The error cases don't properly release it. Fix it. Reported-and-tested-by: 김원석 Cc: Andrew Morton Signed-off-by: Linus Torvalds --- fs/isofs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 3db5ba4568fc..b3cc8586984e 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -974,7 +974,7 @@ out_no_inode: out_no_read: printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n", __func__, s->s_id, iso_blknum, block); - goto out_freesbi; + goto out_freebh; out_bad_zone_size: printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n", sbi->s_log_zone_size); @@ -989,6 +989,7 @@ out_unknown_format: out_freebh: brelse(bh); + brelse(pri_bh); out_freesbi: kfree(opt.iocharset); kfree(sbi); -- cgit v1.2.3 From 105f4622104848ff1ee1f644d661bef9dec3eb27 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 7 Jun 2011 11:50:23 -0400 Subject: nfsd4: fix break_lease flags on nfsd open Thanks to Casey Bodley for pointing out that on a read open we pass 0, instead of O_RDONLY, to break_lease, with the result that a read open is treated like a write open for the purposes of lease breaking! Reported-by: Casey Bodley Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f3fb61b69a1e..fd0acca5370a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -696,7 +696,15 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor } #endif /* CONFIG_NFSD_V3 */ +static int nfsd_open_break_lease(struct inode *inode, int access) +{ + unsigned int mode; + if (access & NFSD_MAY_NOT_BREAK_LEASE) + return 0; + mode = (access & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY; + return break_lease(inode, mode | O_NONBLOCK); +} /* * Open an existing file or directory. @@ -744,12 +752,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!inode->i_fop) goto out; - /* - * Check to see if there are any leases on this file. - * This may block while leases are broken. - */ - if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) - host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); + host_err = nfsd_open_break_lease(inode, access); if (host_err) /* NOMEM or WOULDBLOCK */ goto out_nfserr; -- cgit v1.2.3 From 185bf87393afe6b966881e36c459949d90930a7a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 20 Jun 2011 10:10:24 +0300 Subject: ubifs: dereferencing an ERR_PTR in ubifs_mount() d251ed271d5 "ubifs: fix sget races" left out the goto from this error path so the static checkers complain that we're dereferencing "sb" when it's an ERR_PTR. Signed-off-by: Dan Carpenter Signed-off-by: Al Viro --- fs/ubifs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 8c892c2d5300..529be0582029 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2146,6 +2146,7 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, if (IS_ERR(sb)) { err = PTR_ERR(sb); kfree(c); + goto out_close; } if (sb->s_root) { -- cgit v1.2.3 From 1712c20dae7b770b62b2e3272100b3b40af0157c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 19:59:04 -0400 Subject: bad_inode_permission() is safe from RCU mode return -EIO; is *not* a blocking operation, thank you very much. Nick, what the hell have you been smoking? Signed-off-by: Al Viro --- fs/bad_inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 9ad2369d9e35..bfcb18feb1df 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -231,9 +231,6 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - return -EIO; } -- cgit v1.2.3 From ec12781f192568f7ea860f440f890389ba393df7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:03:36 -0400 Subject: cifs_permission() doesn't need to bail out in RCU mode nothing potentially blocking except generic_permission(), which will DTRT Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e9def996e383..2f0c58646c10 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -257,9 +257,6 @@ static int cifs_permission(struct inode *inode, int mask, unsigned int flags) { struct cifs_sb_info *cifs_sb; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - cifs_sb = CIFS_SB(inode->i_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { -- cgit v1.2.3 From 6b419951f1e44c8a46fccfc6551eca9a9980acd6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:11:43 -0400 Subject: coda_ioctl_permission() is safe in RCU mode return (mask & MAY_EXEC) ? -EACCES : 0; is non-blocking... Signed-off-by: Al Viro --- fs/coda/pioctl.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 6cbb3afb36dc..cb140ef293e4 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -43,8 +43,6 @@ const struct file_operations coda_ioctl_operations = { /* the coda pioctl inode ops */ static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; return (mask & MAY_EXEC) ? -EACCES : 0; } -- cgit v1.2.3 From a63ab94d67879bc0630ea9821c582ddf58ba5527 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:17:22 -0400 Subject: logfs doesn't need ->permission() at all ... and never did, what with its ->permission() being what we do by default when ->permission is NULL... Signed-off-by: Al Viro --- fs/logfs/dir.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 9ed89d1663f8..1afae26cf236 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -555,13 +555,6 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry, return __logfs_create(dir, dentry, inode, target, destlen); } -static int logfs_permission(struct inode *inode, int mask, unsigned int flags) -{ - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - return generic_permission(inode, mask, flags, NULL); -} - static int logfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { @@ -820,7 +813,6 @@ const struct inode_operations logfs_dir_iops = { .mknod = logfs_mknod, .rename = logfs_rename, .rmdir = logfs_rmdir, - .permission = logfs_permission, .symlink = logfs_symlink, .unlink = logfs_unlink, }; -- cgit v1.2.3 From 730e908f3539066d4aa01f4720ebfc750ce4d045 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:21:44 -0400 Subject: nilfs2_permission() doesn't need to bail out in RCU mode Nothing blocking except for generic_permission(). Which will DTRT. Signed-off-by: Al Viro --- fs/nilfs2/inode.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b954878ad6ce..b9b45fc2903e 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -801,12 +801,7 @@ out_err: int nilfs_permission(struct inode *inode, int mask, unsigned int flags) { - struct nilfs_root *root; - - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - - root = NILFS_I(inode)->i_root; + struct nilfs_root *root = NILFS_I(inode)->i_root; if ((mask & MAY_WRITE) && root && root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ -- cgit v1.2.3 From cf1279111686d9742cbc4145bc9d526c83f59fea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:35:23 -0400 Subject: proc_fd_permission() is doesn't need to bail out in RCU mode nothing blocking except generic_permission() Signed-off-by: Al Viro --- fs/proc/base.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 14def991d9dd..8a84210ca080 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2169,11 +2169,7 @@ static const struct file_operations proc_fd_operations = { */ static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags) { - int rv; - - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - rv = generic_permission(inode, mask, flags, NULL); + int rv = generic_permission(inode, mask, flags, NULL); if (rv == 0) return 0; if (task_pid(current) == proc_pid(inode)) -- cgit v1.2.3 From 1d29b5a2ed7eb8862c9b66daf475f3e4c1a40299 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:37:33 -0400 Subject: reiserfs_permission() doesn't need to bail out in RCU mode nothing blocking other than generic_permission() (and check_acl callback does bail out in RCU mode). Signed-off-by: Al Viro --- fs/reiserfs/xattr.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e8a62f41b458..d78089690965 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -954,8 +954,6 @@ static int xattr_mount_check(struct super_block *s) int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; /* * We don't do permission checks on the internal objects. * Permissions are determined by the "owning" object. -- cgit v1.2.3 From 1aec7036d0c2996c86ce483ca0a28f3b20807b43 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Jun 2011 20:42:00 -0400 Subject: proc_sys_permission() is OK in RCU mode nothing blocking there, since all instances of sysctl ->permissions() method are non-blocking - both of them, that is. Signed-off-by: Al Viro --- fs/proc/proc_sysctl.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index f50133c11c24..d167de365a8d 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -304,9 +304,6 @@ static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags) struct ctl_table *table; int error; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - /* Executable files are not allowed under /proc/sys/ */ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) return -EACCES; -- cgit v1.2.3 From 6291176bcd71a2766a19a10cbd9bab07d289e1d7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Jun 2011 19:20:48 -0400 Subject: kill obsolete comment for follow_down() Signed-off-by: Al Viro --- fs/namei.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 9e425e7e6c8f..975c40620fe9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1011,9 +1011,6 @@ failed: * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. - * - * Care must be taken as namespace_sem may be held (indicated by mounting_here - * being true). */ int follow_down(struct path *path) { -- cgit v1.2.3 From 8e833fd2e1f0107ee7a4b6bc4de3c9f0e9b0ed41 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 19 Jun 2011 01:56:53 -0400 Subject: fix comment in generic_permission() CAP_DAC_OVERRIDE is enough for MAY_EXEC on directory, even if no exec bits are set. Signed-off-by: Al Viro --- fs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 975c40620fe9..0223c41fb114 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -238,7 +238,8 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags, /* * Read/write DACs are always overridable. - * Executable DACs are overridable if at least one exec bit is set. + * Executable DACs are overridable for all directories and + * for non-directories that have least one exec bit set. */ if (!(mask & MAY_EXEC) || execute_ok(inode)) if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) -- cgit v1.2.3