From d1d84c9626bb3a519863b3ffc40d347166f9fb83 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 21 Aug 2014 15:04:31 -0400 Subject: nfsd4: fix response size estimation for OP_SEQUENCE We added this new estimator function but forgot to hook it up. The effect is that NFSv4.1 (and greater) won't do zero-copy reads. The estimate was also wrong by 8 bytes. Fixes: ccae70a9ee41 "nfsd4: estimate sequence response size" Cc: stable@vger.kernel.org Reported-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index cdeb3cfd6f32..f4bd578bed55 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1589,7 +1589,8 @@ static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) { - return NFS4_MAX_SESSIONID_LEN + 20; + return (op_encode_hdr_size + + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32); } static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) @@ -1893,6 +1894,7 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_sequence, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP, .op_name = "OP_SEQUENCE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_sequence_rsize, }, [OP_DESTROY_CLIENTID] = { .op_func = (nfsd4op_func)nfsd4_destroy_clientid, -- cgit v1.2.3 From b744c2ac4bbc040794efb33207d6ebc14f88ea2e Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 21 Oct 2014 13:55:09 -0600 Subject: fs: merge I/O error prints into one line buffer.c uses two printk calls to print these messages: [67353.422338] Buffer I/O error on device sdr, logical block 212868488 [67353.422338] lost page write due to I/O error on sdr In a busy system, they may be interleaved with other prints, losing the context for the second message. Merge them into one line with one printk call so the prints are atomic. Also, differentiate between async page writes, sync page writes, and async page reads. Also, shorten "device" to "dev" to match the block layer prints: [67353.467906] blk_update_request: critical target error, dev sdr, sector 1707107328 Also, use %llu rather than %Lu. Resulting prints look like: [ 1356.437006] blk_update_request: critical target error, dev sdr, sector 1719693992 [ 1361.383522] quiet_error: 659876 callbacks suppressed [ 1361.385816] Buffer I/O error on dev sdr, logical block 256902912, lost async page write [ 1361.385819] Buffer I/O error on dev sdr, logical block 256903644, lost async page write Signed-off-by: Robert Elliott Reviewed-by: Webb Scales Signed-off-by: Jens Axboe --- fs/buffer.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 6c48f20eddd4..9d1da1d314a2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -137,12 +137,12 @@ static int quiet_error(struct buffer_head *bh) } -static void buffer_io_error(struct buffer_head *bh) +static void buffer_io_error(struct buffer_head *bh, char *msg) { char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", + printk(KERN_ERR "Buffer I/O error on dev %s, logical block %llu%s\n", bdevname(bh->b_bdev, b), - (unsigned long long)bh->b_blocknr); + (unsigned long long)bh->b_blocknr, msg); } /* @@ -177,17 +177,11 @@ EXPORT_SYMBOL(end_buffer_read_sync); void end_buffer_write_sync(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; - if (uptodate) { set_buffer_uptodate(bh); } else { - if (!quiet_error(bh)) { - buffer_io_error(bh); - printk(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); - } + if (!quiet_error(bh)) + buffer_io_error(bh, ", lost sync page write"); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); } @@ -305,7 +299,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } else { clear_buffer_uptodate(bh); if (!quiet_error(bh)) - buffer_io_error(bh); + buffer_io_error(bh, ", async page read"); SetPageError(page); } @@ -353,7 +347,6 @@ still_busy: */ void end_buffer_async_write(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; unsigned long flags; struct buffer_head *first; struct buffer_head *tmp; @@ -365,12 +358,8 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (!quiet_error(bh)) { - buffer_io_error(bh); - printk(KERN_WARNING "lost page write due to " - "I/O error on %s\n", - bdevname(bh->b_bdev, b)); - } + if (!quiet_error(bh)) + buffer_io_error(bh, ", lost async page write"); set_bit(AS_EIO, &page->mapping->flags); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); -- cgit v1.2.3 From 432f16e64f50fd4999a476543d04dd52f7a2d753 Mon Sep 17 00:00:00 2001 From: Robert Elliott Date: Tue, 21 Oct 2014 13:55:11 -0600 Subject: fs: clarify rate limit suppressed buffer I/O errors When quiet_error applies rate limiting to buffer_io_error calls, what the they apply to is unclear because the name is so generic, particularly if the messages are interleaved with others: [ 1936.063572] quiet_error: 664293 callbacks suppressed [ 1936.065297] Buffer I/O error on dev sdr, logical block 257429952, lost async page write [ 1936.067814] Buffer I/O error on dev sdr, logical block 257429953, lost async page write Also, the function uses printk_ratelimit(), although printk.h includes a comment advising "Please don't use... Instead use printk_ratelimited()." Change buffer_io_error to check the BH_Quiet bit itself, drop the printk_ratelimit call, and print using printk_ratelimited. This makes the messages look like: [ 387.208839] buffer_io_error: 676394 callbacks suppressed [ 387.210693] Buffer I/O error on dev sdr, logical block 211291776, lost async page write [ 387.213432] Buffer I/O error on dev sdr, logical block 211291777, lost async page write Signed-off-by: Robert Elliott Reviewed-by: Webb Scales Signed-off-by: Jens Axboe --- fs/buffer.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 9d1da1d314a2..20805db2c987 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -128,19 +128,13 @@ __clear_page_buffers(struct page *page) page_cache_release(page); } - -static int quiet_error(struct buffer_head *bh) -{ - if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit()) - return 0; - return 1; -} - - static void buffer_io_error(struct buffer_head *bh, char *msg) { char b[BDEVNAME_SIZE]; - printk(KERN_ERR "Buffer I/O error on dev %s, logical block %llu%s\n", + + if (!test_bit(BH_Quiet, &bh->b_state)) + printk_ratelimited(KERN_ERR + "Buffer I/O error on dev %s, logical block %llu%s\n", bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr, msg); } @@ -180,8 +174,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (!quiet_error(bh)) - buffer_io_error(bh, ", lost sync page write"); + buffer_io_error(bh, ", lost sync page write"); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); } @@ -298,8 +291,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) set_buffer_uptodate(bh); } else { clear_buffer_uptodate(bh); - if (!quiet_error(bh)) - buffer_io_error(bh, ", async page read"); + buffer_io_error(bh, ", async page read"); SetPageError(page); } @@ -358,8 +350,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) if (uptodate) { set_buffer_uptodate(bh); } else { - if (!quiet_error(bh)) - buffer_io_error(bh, ", lost async page write"); + buffer_io_error(bh, ", lost async page write"); set_bit(AS_EIO, &page->mapping->flags); set_buffer_write_io_error(bh); clear_buffer_uptodate(bh); -- cgit v1.2.3 From 7938db449bbc55bbeb164bec7af406212e7e98f1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 16 Sep 2014 22:23:10 +0200 Subject: ext3: Don't check quota format when there are no quota files The check whether quota format is set even though there are no quota files with journalled quota is pointless and it actually makes it impossible to turn off journalled quotas (as there's no way to unset journalled quota format). Just remove the check. CC: stable@vger.kernel.org Signed-off-by: Jan Kara --- fs/ext3/super.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7015db0bafd1..eb742d0e67ff 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1354,13 +1354,6 @@ set_qf_format: "not specified."); return 0; } - } else { - if (sbi->s_jquota_fmt) { - ext3_msg(sb, KERN_ERR, "error: journaled quota format " - "specified with no journaling " - "enabled."); - return 0; - } } #endif return 1; -- cgit v1.2.3 From 474d2605d119479e5aa050f738632e63589d4bb5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Oct 2014 09:06:49 +0200 Subject: quota: Properly return errors from dquot_writeback_dquots() Due to a switched left and right side of an assignment, dquot_writeback_dquots() never returned error. This could result in errors during quota writeback to not be reported to userspace properly. Fix it. CC: stable@vger.kernel.org Coverity-id: 1226884 Signed-off-by: Jan Kara --- fs/quota/dquot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 8b663b2d9562..6b4527216a7f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -634,7 +634,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type) dqstats_inc(DQST_LOOKUPS); err = sb->dq_op->write_dquot(dquot); if (!ret && err) - err = ret; + ret = err; dqput(dquot); spin_lock(&dq_list_lock); } -- cgit v1.2.3 From 3c9cafe05ff002eb84d438a02f3c8d468720463b Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 21 Oct 2014 16:43:55 -0400 Subject: fs, jbd: use a more generic hash function While the hash function used by the revoke hashtable is good somewhere else, it's not really good here. The default hash shift (8) means that one third of the hashing function gets lost (and is undefined anyways (8 - 12 = negative shift)): "(block << (hash_shift - 12))) & (table->hash_size - 1)" Instead, just use the kernel's generic hash function that gets used everywhere else. Signed-off-by: Sasha Levin Signed-off-by: Jan Kara --- fs/jbd/revoke.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 8898bbd2b61e..dcead636c33b 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -93,6 +93,7 @@ #include #endif #include +#include static struct kmem_cache *revoke_record_cache; static struct kmem_cache *revoke_table_cache; @@ -129,15 +130,11 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int); /* Utility functions to maintain the revoke table */ -/* Borrowed from buffer.c: this is a tried and tested block hash function */ static inline int hash(journal_t *journal, unsigned int block) { struct jbd_revoke_table_s *table = journal->j_revoke; - int hash_shift = table->hash_shift; - return ((block << (hash_shift - 6)) ^ - (block >> 13) ^ - (block << (hash_shift - 12))) & (table->hash_size - 1); + return hash_32(block, table->hash_shift); } static int insert_revoke_hash(journal_t *journal, unsigned int blocknr, -- cgit v1.2.3 From 51904b08072a8bf2b9ed74d1bd7a5300a614471d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 22 Oct 2014 14:46:29 -0400 Subject: nfsd4: fix crash on unknown operation number Unknown operation numbers are caught in nfsd4_decode_compound() which sets op->opnum to OP_ILLEGAL and op->status to nfserr_op_illegal. The error causes the main loop in nfsd4_proc_compound() to skip most processing. But nfsd4_proc_compound also peeks ahead at the next operation in one case and doesn't take similar precautions there. Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f4bd578bed55..0beb023f25ac 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1272,7 +1272,8 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp) */ if (argp->opcnt == resp->opcnt) return false; - + if (next->opnum == OP_ILLEGAL) + return false; nextd = OPDESC(next); /* * Rest of 2.6.3.1.1: certain operations will return WRONGSEC -- cgit v1.2.3 From 21e7626b12f25770e2975bc7c7b2e1d5b1d58a57 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 27 Oct 2014 13:52:21 +0100 Subject: btrfs: use macro accessors in superblock validation checks The initial patch c926093ec516f5d316 (btrfs: add more superblock checks) did not properly use the macro accessors that wrap endianness and the code would not work correctly on big endian machines. Reported-by: Qu Wenruo Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2409718e3f20..1ae1661ba14c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3817,19 +3817,19 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, struct btrfs_super_block *sb = fs_info->super_copy; int ret = 0; - if (sb->root_level > BTRFS_MAX_LEVEL) { - printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n", - sb->root_level, BTRFS_MAX_LEVEL); + if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n", + btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } - if (sb->chunk_root_level > BTRFS_MAX_LEVEL) { - printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n", - sb->chunk_root_level, BTRFS_MAX_LEVEL); + if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: chunk_root level too big: %d >= %d\n", + btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } - if (sb->log_root_level > BTRFS_MAX_LEVEL) { - printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n", - sb->log_root_level, BTRFS_MAX_LEVEL); + if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: log_root level too big: %d >= %d\n", + btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } @@ -3837,15 +3837,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, * The common minimum, we don't know if we can trust the nodesize/sectorsize * items yet, they'll be verified later. Issue just a warning. */ - if (!IS_ALIGNED(sb->root, 4096)) + if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", sb->root); - if (!IS_ALIGNED(sb->chunk_root, 4096)) + if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", sb->chunk_root); - if (!IS_ALIGNED(sb->log_root, 4096)) + if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", - sb->log_root); + btrfs_super_log_root(sb)); if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", @@ -3857,13 +3857,13 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later */ - if (sb->num_devices > (1UL << 31)) + if (btrfs_super_num_devices(sb) > (1UL << 31)) printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", - sb->num_devices); + btrfs_super_num_devices(sb)); - if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) { + if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", - sb->bytenr, BTRFS_SUPER_INFO_OFFSET); + btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); ret = -EINVAL; } @@ -3871,14 +3871,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, * The generation is a global counter, we'll trust it more than the others * but it's still possible that it's the one that's wrong. */ - if (sb->generation < sb->chunk_root_generation) + if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) printk(KERN_WARNING "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n", - sb->generation, sb->chunk_root_generation); - if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1) + btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb)); + if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) + && btrfs_super_cache_generation(sb) != (u64)-1) printk(KERN_WARNING "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n", - sb->generation, sb->cache_generation); + btrfs_super_generation(sb), btrfs_super_cache_generation(sb)); return ret; } -- cgit v1.2.3 From 1a4ed8fdca077d2489ec47d548451be69389e926 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 27 Oct 2014 10:44:24 +0000 Subject: Btrfs: fix invalid leaf slot access in btrfs_lookup_extent() If we couldn't find our extent item, we accessed the current slot (path->slots[0]) to check if it corresponds to an equivalent skinny metadata item. However this slot could be beyond our last item in the leaf (i.e. path->slots[0] >= btrfs_header_nritems(leaf)), in which case we shouldn't process it. Since btrfs_lookup_extent() is only used to find extent items for data extents, fix this by removing completely the logic that looks up for an equivalent skinny metadata item, since it can not exist. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 2 +- fs/btrfs/extent-tree.c | 10 ++-------- fs/btrfs/tree-log.c | 2 +- 3 files changed, 4 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d557264ee974..fe69edda11fb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3276,7 +3276,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_async_run_delayed_refs(struct btrfs_root *root, unsigned long count, int wait); -int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0d599ba1aaed..87c0b46f8a7e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -710,8 +710,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) rcu_read_unlock(); } -/* simple helper to search for an existing extent at a given offset */ -int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) +/* simple helper to search for an existing data extent at a given offset */ +int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len) { int ret; struct btrfs_key key; @@ -726,12 +726,6 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) key.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, 0, 0); - if (ret > 0) { - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == start && - key.type == BTRFS_METADATA_ITEM_KEY) - ret = 0; - } btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2b26dad35d88..6d58d72705ae 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -672,7 +672,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * is this extent already allocated in the extent * allocation tree? If so, just add a reference */ - ret = btrfs_lookup_extent(root, ins.objectid, + ret = btrfs_lookup_data_extent(root, ins.objectid, ins.offset); if (ret == 0) { ret = btrfs_inc_extent_ref(trans, root, -- cgit v1.2.3 From 5ed5f5884116e3841da626d201ef068f23232a3a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 15 Oct 2014 17:19:59 -0400 Subject: Btrfs: properly clean up btrfs_end_io_wq_cache In one of Dave's cleanup commits he forgot to call btrfs_end_io_wq_exit on unload, which makes us unable to unload and then re-load the btrfs module. This fixes the problem. Thanks, Signed-off-by: Josef Bacik Reviewed-by: David Sterba Reviewed-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a2b97ef10317..54bd91ece35b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2151,6 +2151,7 @@ static void __exit exit_btrfs_fs(void) extent_map_exit(); extent_io_exit(); btrfs_interface_exit(); + btrfs_end_io_wq_exit(); unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); -- cgit v1.2.3 From d05a2b4cd97071462e77e6a7a8f109c36307182a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 27 Oct 2014 09:19:52 +0000 Subject: Btrfs: fix race that makes btrfs_lookup_extent_info miss skinny extent items We have a race that can lead us to miss skinny extent items in the function btrfs_lookup_extent_info() when the skinny metadata feature is enabled. So basically the sequence of steps is: 1) We search in the extent tree for the skinny extent, which returns > 0 (not found); 2) We check the previous item in the returned leaf for a non-skinny extent, and we don't find it; 3) Because we didn't find the non-skinny extent in step 2), we release our path to search the extent tree again, but this time for a non-skinny extent key; 4) Right after we released our path in step 3), a skinny extent was inserted in the extent tree (delayed refs were run) - our second extent tree search will miss it, because it's not looking for a skinny extent; 5) After the second search returned (with ret > 0), we look for any delayed ref for our extent's bytenr (and we do it while holding a read lock on the leaf), but we won't find any, as such delayed ref had just run and completed after we released out path in step 3) before doing the second search. Fix this by removing completely the path release and re-search logic. This is safe, because if we seach for a metadata item and we don't find it, we have the guarantee that the returned leaf is the one where the item would be inserted, and so path->slots[0] > 0 and path->slots[0] - 1 must be the slot where the non-skinny extent item is if it exists. The only case where path->slots[0] is zero is when there are no smaller keys in the tree (i.e. no left siblings for our leaf), in which case the re-search logic isn't needed as well. This race has been present since the introduction of skinny metadata (change 3173a18f70554fe7880bb2d85c7da566e364eb3c). Signed-off-by: Filipe Manana Reviewed-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 87c0b46f8a7e..a84e00da14f1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -780,7 +780,6 @@ search_again: else key.type = BTRFS_EXTENT_ITEM_KEY; -again: ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 0); if (ret < 0) @@ -796,13 +795,6 @@ again: key.offset == root->nodesize) ret = 0; } - if (ret) { - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = root->nodesize; - btrfs_release_path(path); - goto again; - } } if (ret == 0) { -- cgit v1.2.3 From a6bbce54efa9145dbcf3029c885549f7ebc40a3b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 29 Oct 2014 08:22:18 +1100 Subject: xfs: bulkstat doesn't release AGI buffer on error The recent refactoring of the bulkstat code left a small landmine in the code. If a inobt read fails, then the tree walk is aborted and returns without releasing the AGI buffer or freeing the cursor. This can lead to a subsequent bulkstat call hanging trying to grab the AGI buffer again. cc: Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f1deb961a296..ef8ea0589780 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -427,7 +427,7 @@ xfs_bulkstat( error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r); if (error) - break; + goto del_cursor; if (icount) { irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; @@ -442,7 +442,7 @@ xfs_bulkstat( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); } if (error) - break; + goto del_cursor; /* * Loop through inode btree records in this ag, @@ -454,7 +454,7 @@ xfs_bulkstat( error = xfs_inobt_get_rec(cur, &r, &i); if (error || i == 0) { end_of_ag = 1; - break; + goto del_cursor; } /* @@ -476,13 +476,17 @@ xfs_bulkstat( error = xfs_btree_increment(cur, 0, &tmp); cond_resched(); } + /* - * Drop the btree buffers and the agi buffer. - * We can't hold any of the locks these represent - * when calling iget. + * Drop the btree buffers and the agi buffer as we can't hold any + * of the locks these represent when calling iget. If there is a + * pending error, then we are done. */ +del_cursor: xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); xfs_buf_relse(agbp); + if (error) + break; /* * Now format all the good inodes into the user's buffer. */ -- cgit v1.2.3 From 6424babfd68dd8a83d9c60a5242d27038856599f Mon Sep 17 00:00:00 2001 From: Jerry Hoemann Date: Wed, 29 Oct 2014 14:50:22 -0700 Subject: fsnotify: next_i is freed during fsnotify_unmount_inodes. During file system stress testing on 3.10 and 3.12 based kernels, the umount command occasionally hung in fsnotify_unmount_inodes in the section of code: spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { spin_unlock(&inode->i_lock); continue; } As this section of code holds the global inode_sb_list_lock, eventually the system hangs trying to acquire the lock. Multiple crash dumps showed: The inode->i_state == 0x60 and i_count == 0 and i_sb_list would point back at itself. As this is not the value of list upon entry to the function, the kernel never exits the loop. To help narrow down problem, the call to list_del_init in inode_sb_list_del was changed to list_del. This poisons the pointers in the i_sb_list and causes a kernel to panic if it transverse a freed inode. Subsequent stress testing paniced in fsnotify_unmount_inodes at the bottom of the list_for_each_entry_safe loop showing next_i had become free. We believe the root cause of the problem is that next_i is being freed during the window of time that the list_for_each_entry_safe loop temporarily releases inode_sb_list_lock to call fsnotify and fsnotify_inode_delete. The code in fsnotify_unmount_inodes attempts to prevent the freeing of inode and next_i by calling __iget. However, the code doesn't do the __iget call on next_i if i_count == 0 or if i_state & (I_FREEING | I_WILL_FREE) The patch addresses this issue by advancing next_i in the above two cases until we either find a next_i which we can __iget or we reach the end of the list. This makes the handling of next_i more closely match the handling of the variable "inode." The time to reproduce the hang is highly variable (from hours to days.) We ran the stress test on a 3.10 kernel with the proposed patch for a week without failure. During list_for_each_entry_safe, next_i is becoming free causing the loop to never terminate. Advance next_i in those cases where __iget is not done. Signed-off-by: Jerry Hoemann Cc: Jeff Kirsher Cc: Ken Helias Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/inode_mark.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 9ce062218de9..e8497144b323 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -288,20 +288,25 @@ void fsnotify_unmount_inodes(struct list_head *list) spin_unlock(&inode->i_lock); /* In case the dropping of a reference would nuke next_i. */ - if ((&next_i->i_sb_list != list) && - atomic_read(&next_i->i_count)) { + while (&next_i->i_sb_list != list) { spin_lock(&next_i->i_lock); - if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) { + if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) && + atomic_read(&next_i->i_count)) { __iget(next_i); need_iput = next_i; + spin_unlock(&next_i->i_lock); + break; } spin_unlock(&next_i->i_lock); + next_i = list_entry(next_i->i_sb_list.next, + struct inode, i_sb_list); } /* - * We can safely drop inode_sb_list_lock here because we hold - * references on both inode and next_i. Also no new inodes - * will be added since the umount has begun. + * We can safely drop inode_sb_list_lock here because either + * we actually hold references on both inode and next_i or + * end of list. Also no new inodes will be added since the + * umount has begun. */ spin_unlock(&inode_sb_list_lock); -- cgit v1.2.3 From d3556babd7facb8fbc596bada0d67139e3b22330 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Wed, 29 Oct 2014 14:50:53 -0700 Subject: ocfs2: fix d_splice_alias() return code checking d_splice_alias() can return a valid dentry, NULL or an ERR_PTR. Currently the code checks not for ERR_PTR and will cuase an oops in ocfs2_dentry_attach_lock(). Fix this by using IS_ERR_OR_NULL(). Signed-off-by: Richard Weinberger Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8add6f1030d7..b931e04e3388 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -158,7 +158,7 @@ bail_add: * NOTE: This dentry already has ->d_op set from * ocfs2_get_parent() and ocfs2_get_dentry() */ - if (ret) + if (!IS_ERR_OR_NULL(ret)) dentry = ret; status = ocfs2_dentry_attach_lock(dentry, inode, -- cgit v1.2.3 From 7a19dee116c8fae7ba7a778043c245194289f5a2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 30 Oct 2014 10:34:52 +1100 Subject: xfs: Check error during inode btree iteration in xfs_bulkstat() xfs_bulkstat() doesn't check error return from xfs_btree_increment(). In case of specific fs corruption that could result in xfs_bulkstat() entering an infinite loop because we would be looping over the same chunk over and over again. Fix the problem by checking the return value and terminating the loop properly. Coverity-id: 1231338 cc: Signed-off-by: Jan Kara Reviewed-by: Jie Liu Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index ef8ea0589780..7765ff743e91 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -474,6 +474,10 @@ xfs_bulkstat( */ agino = r.ir_startino + XFS_INODES_PER_CHUNK; error = xfs_btree_increment(cur, 0, &tmp); + if (error) { + end_of_ag = 1; + goto del_cursor; + } cond_resched(); } -- cgit v1.2.3 From 5d11fb4b9a1d90983452c029b5e1377af78fda49 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 30 Oct 2014 10:35:11 +1100 Subject: xfs: rework zero range to prevent invalid i_size updates The zero range operation is analogous to fallocate with the exception of converting the range to zeroes. E.g., it attempts to allocate zeroed blocks over the range specified by the caller. The XFS implementation kills all delalloc blocks currently over the aligned range, converts the range to allocated zero blocks (unwritten extents) and handles the partial pages at the ends of the range by sending writes through the pagecache. The current implementation suffers from several problems associated with inode size. If the aligned range covers an extending I/O, said I/O is discarded and an inode size update from a previous write never makes it to disk. Further, if an unaligned zero range extends beyond eof, the page write induced for the partial end page can itself increase the inode size, even if the zero range request is not supposed to update i_size (via KEEP_SIZE, similar to an fallocate beyond EOF). The latter behavior not only incorrectly increases the inode size, but can lead to stray delalloc blocks on the inode. Typically, post-eof preallocation blocks are either truncated on release or inode eviction or explicitly written to by xfs_zero_eof() on natural file size extension. If the inode size increases due to zero range, however, associated blocks leak into the address space having never been converted or mapped to pagecache pages. A direct I/O to such an uncovered range cannot convert the extent via writeback and will BUG(). For example: $ xfs_io -fc "pwrite 0 128k" -c "fzero -k 1m 54321" ... $ xfs_io -d -c "pread 128k 128k" If the entire delalloc extent happens to not have page coverage whatsoever (e.g., delalloc conversion couldn't find a large enough free space extent), even a full file writeback won't convert what's left of the extent and we'll assert on inode eviction. Rework xfs_zero_file_space() to avoid buffered I/O for partial pages. Use the existing hole punch and prealloc mechanisms as primitives for zero range. This implementation is not efficient nor ideal as we writeback dirty data over the range and remove existing extents rather than convert to unwrittern. The former writeback, however, is currently the only mechanism available to ensure consistency between pagecache and extent state. Even a pagecache truncate/delalloc punch prior to hole punch has lead to inconsistencies due to racing with writeback. This provides a consistent, correct implementation of zero range that survives fsstress/fsx testing without assert failures. The implementation can be optimized from this point forward once the fundamental issue of pagecache and delalloc extent state consistency is addressed. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 72 ++++++++++++++------------------------------------ 1 file changed, 20 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 92e8f99a5857..281002689d64 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1338,7 +1338,10 @@ xfs_free_file_space( goto out; } - +/* + * Preallocate and zero a range of a file. This mechanism has the allocation + * semantics of fallocate and in addition converts data in the range to zeroes. + */ int xfs_zero_file_space( struct xfs_inode *ip, @@ -1346,65 +1349,30 @@ xfs_zero_file_space( xfs_off_t len) { struct xfs_mount *mp = ip->i_mount; - uint granularity; - xfs_off_t start_boundary; - xfs_off_t end_boundary; + uint blksize; int error; trace_xfs_zero_file_space(ip); - granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + blksize = 1 << mp->m_sb.sb_blocklog; /* - * Round the range of extents we are going to convert inwards. If the - * offset is aligned, then it doesn't get changed so we zero from the - * start of the block offset points to. + * Punch a hole and prealloc the range. We use hole punch rather than + * unwritten extent conversion for two reasons: + * + * 1.) Hole punch handles partial block zeroing for us. + * + * 2.) If prealloc returns ENOSPC, the file range is still zero-valued + * by virtue of the hole punch. */ - start_boundary = round_up(offset, granularity); - end_boundary = round_down(offset + len, granularity); - - ASSERT(start_boundary >= offset); - ASSERT(end_boundary <= offset + len); - - if (start_boundary < end_boundary - 1) { - /* - * Writeback the range to ensure any inode size updates due to - * appending writes make it to disk (otherwise we could just - * punch out the delalloc blocks). - */ - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - start_boundary, end_boundary - 1); - if (error) - goto out; - truncate_pagecache_range(VFS_I(ip), start_boundary, - end_boundary - 1); - - /* convert the blocks */ - error = xfs_alloc_file_space(ip, start_boundary, - end_boundary - start_boundary - 1, - XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT); - if (error) - goto out; - - /* We've handled the interior of the range, now for the edges */ - if (start_boundary != offset) { - error = xfs_iozero(ip, offset, start_boundary - offset); - if (error) - goto out; - } - - if (end_boundary != offset + len) - error = xfs_iozero(ip, end_boundary, - offset + len - end_boundary); - - } else { - /* - * It's either a sub-granularity range or the range spanned lies - * partially across two adjacent blocks. - */ - error = xfs_iozero(ip, offset, len); - } + error = xfs_free_file_space(ip, offset, len); + if (error) + goto out; + error = xfs_alloc_file_space(ip, round_down(offset, blksize), + round_up(offset + len, blksize) - + round_down(offset, blksize), + XFS_BMAPI_PREALLOC); out: return error; -- cgit v1.2.3 From 9378c6768e4fca48971e7b6a9075bc006eda981d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 30 Oct 2014 10:52:57 -0400 Subject: ext4: fix overflow when updating superblock backups after resize When there are no meta block groups update_backups() will compute the backup block in 32-bit arithmetics thus possibly overflowing the block number and corrupting the filesystem. OTOH filesystems without meta block groups larger than 16 TB should be rare. Fix the problem by doing the counting in 64-bit arithmetics. Coverity-id: 741252 CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Reviewed-by: Lukas Czerner --- fs/ext4/resize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index f298c60f907d..ca4588388fc3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1081,7 +1081,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data, break; if (meta_bg == 0) - backup_block = group * bpg + blk_off; + backup_block = ((ext4_fsblk_t)group) * bpg + blk_off; else backup_block = (ext4_group_first_block_no(sb, group) + ext4_bg_has_super(sb, group)); -- cgit v1.2.3 From 599a9b77ab289d85c2d5c8607624efbe1f552b0f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 30 Oct 2014 10:53:16 -0400 Subject: ext4: fix oops when loading block bitmap failed When we fail to load block bitmap in __ext4_new_inode() we will dereference NULL pointer in ext4_journal_get_write_access(). So check for error from ext4_read_block_bitmap(). Coverity-id: 989065 Cc: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8012a5daf401..ac644c31ca67 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -887,6 +887,10 @@ got: struct buffer_head *block_bitmap_bh; block_bitmap_bh = ext4_read_block_bitmap(sb, group); + if (!block_bitmap_bh) { + err = -EIO; + goto out; + } BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); err = ext4_journal_get_write_access(handle, block_bitmap_bh); if (err) { -- cgit v1.2.3 From 98c1a7593fa355fda7f5a5940c8bf5326ca964ba Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Oct 2014 10:53:16 -0400 Subject: ext4: enable journal checksum when metadata checksum feature enabled If metadata checksumming is turned on for the FS, we need to tell the journal to use checksumming too. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1eda6ab0ef9d..5c11e2153e36 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3526,6 +3526,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_EXT4_FS_POSIX_ACL set_opt(sb, POSIX_ACL); #endif + /* don't forget to enable journal_csum when metadata_csum is enabled. */ + if (ext4_has_metadata_csum(sb)) + set_opt(sb, JOURNAL_CHECKSUM); + if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) set_opt(sb, JOURNAL_DATA); else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) -- cgit v1.2.3 From 6b992ff25658367089db4a82666e232b65d55eae Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Oct 2014 10:53:16 -0400 Subject: ext4: disallow changing journal_csum option during remount ext4 does not permit changing the metadata or journal checksum feature flag while mounted. Until we decide to support that, don't allow a remount to change the journal_csum flag (right now we silently fail to change anything). Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5c11e2153e36..96059e0ef165 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4845,6 +4845,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ + test_opt(sb, JOURNAL_CHECKSUM)) { + ext4_msg(sb, KERN_ERR, "changing journal_checksum " + "during remount not supported"); + err = -EINVAL; + goto restore_opts; + } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " -- cgit v1.2.3 From 50460fe8c6d1d95b16427936e351f277a1c72d43 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 30 Oct 2014 10:53:16 -0400 Subject: ext4: remove extent status procfs files if journal load fails If we can't load the journal, remove the procfs files for the extent status information file to avoid leaking resources. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 96059e0ef165..2c9e6864abd9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3947,7 +3947,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && !(sb->s_flags & MS_RDONLY)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) - goto failed_mount3; + goto failed_mount3a; /* * The first inode we look at is the journal inode. Don't try @@ -3956,7 +3956,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (!test_opt(sb, NOLOAD) && EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { if (ext4_load_journal(sb, es, journal_devnum)) - goto failed_mount3; + goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { ext4_msg(sb, KERN_ERR, "required journal recovery " @@ -4244,6 +4244,7 @@ failed_mount_wq: jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } +failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: del_timer_sync(&sbi->s_err_report); -- cgit v1.2.3 From a41537e69b4aa43f0fea02498c2595a81267383b Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Thu, 30 Oct 2014 10:53:16 -0400 Subject: ext4: prevent bugon on race between write/fcntl O_DIRECT flags can be toggeled via fcntl(F_SETFL). But this value checked twice inside ext4_file_write_iter() and __generic_file_write() which result in BUG_ON inside ext4_direct_IO. Let's initialize iocb->private unconditionally. TESTCASE: xfstest:generic/036 https://patchwork.ozlabs.org/patch/402445/ #TYPICAL STACK TRACE: kernel BUG at fs/ext4/inode.c:2960! invalid opcode: 0000 [#1] SMP Modules linked in: brd iTCO_wdt lpc_ich mfd_core igb ptp dm_mirror dm_region_hash dm_log dm_mod CPU: 6 PID: 5505 Comm: aio-dio-fcntl-r Not tainted 3.17.0-rc2-00176-gff5c017 #161 Hardware name: Intel Corporation W2600CR/W2600CR, BIOS SE5C600.86B.99.99.x028.061320111235 06/13/2011 task: ffff88080e95a7c0 ti: ffff88080f908000 task.ti: ffff88080f908000 RIP: 0010:[] [] ext4_direct_IO+0x162/0x3d0 RSP: 0018:ffff88080f90bb58 EFLAGS: 00010246 RAX: 0000000000000400 RBX: ffff88080fdb2a28 RCX: 00000000a802c818 RDX: 0000040000080000 RSI: ffff88080d8aeb80 RDI: 0000000000000001 RBP: ffff88080f90bbc8 R08: 0000000000000000 R09: 0000000000001581 R10: 0000000000000000 R11: 0000000000000000 R12: ffff88080d8aeb80 R13: ffff88080f90bbf8 R14: ffff88080fdb28c8 R15: ffff88080fdb2a28 FS: 00007f23b2055700(0000) GS:ffff880818400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f23b2045000 CR3: 000000080cedf000 CR4: 00000000000407e0 Stack: ffff88080f90bb98 0000000000000000 7ffffffffffffffe ffff88080fdb2c30 0000000000000200 0000000000000200 0000000000000001 0000000000000200 ffff88080f90bbc8 ffff88080fdb2c30 ffff88080f90be08 0000000000000200 Call Trace: [] generic_file_direct_write+0xed/0x180 [] __generic_file_write_iter+0x222/0x370 [] ext4_file_write_iter+0x34b/0x400 [] ? aio_run_iocb+0x239/0x410 [] ? aio_run_iocb+0x239/0x410 [] ? local_clock+0x25/0x30 [] ? __lock_acquire+0x274/0x700 [] ? ext4_unwritten_wait+0xb0/0xb0 [] aio_run_iocb+0x286/0x410 [] ? local_clock+0x25/0x30 [] ? lock_release_holdtime+0x29/0x190 [] ? lookup_ioctx+0x4b/0xf0 [] do_io_submit+0x55b/0x740 [] ? do_io_submit+0x3ca/0x740 [] SyS_io_submit+0x10/0x20 [] system_call_fastpath+0x16/0x1b Code: 01 48 8b 80 f0 01 00 00 48 8b 18 49 8b 45 10 0f 85 f1 01 00 00 48 03 45 c8 48 3b 43 48 0f 8f e3 01 00 00 49 83 7c 24 18 00 75 04 <0f> 0b eb fe f0 ff 83 ec 01 00 00 49 8b 44 24 18 8b 00 85 c0 89 RIP [] ext4_direct_IO+0x162/0x3d0 RSP Reported-by: Sasha Levin Signed-off-by: Theodore Ts'o Signed-off-by: Dmitry Monakhov Cc: stable@vger.kernel.org --- fs/ext4/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index aca7b24a4432..8131be8c0af3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -137,10 +137,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos); } + iocb->private = &overwrite; if (o_direct) { blk_start_plug(&plug); - iocb->private = &overwrite; /* check whether we do a DIO overwrite or not */ if (ext4_should_dioread_nolock(inode) && !aio_mutex && -- cgit v1.2.3 From d48458d4a768cece43f80a081a26cf912877da9c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 30 Oct 2014 10:53:17 -0400 Subject: jbd2: use a better hash function for the revoke table The old hash function didn't work well for 64-bit block numbers, and used undefined (negative) shift right behavior. Use the generic 64-bit hash function instead. Signed-off-by: Theodore Ts'o Reported-by: Andrey Ryabinin --- fs/jbd2/revoke.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index d5e95a175c92..c6cbaef2bda1 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -92,6 +92,7 @@ #include #include #include +#include #endif static struct kmem_cache *jbd2_revoke_record_cache; @@ -130,16 +131,9 @@ static void flush_descriptor(journal_t *, struct buffer_head *, int, int); /* Utility functions to maintain the revoke table */ -/* Borrowed from buffer.c: this is a tried and tested block hash function */ static inline int hash(journal_t *journal, unsigned long long block) { - struct jbd2_revoke_table_s *table = journal->j_revoke; - int hash_shift = table->hash_shift; - int hash = (int)block ^ (int)((block >> 31) >> 1); - - return ((hash << (hash_shift - 6)) ^ - (hash >> 13) ^ - (hash << (hash_shift - 12))) & (table->hash_size - 1); + return hash_64(block, journal->j_revoke->hash_shift); } static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, -- cgit v1.2.3 From 6050d47adcadbb53582434d919ed7f038d936712 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 30 Oct 2014 10:53:17 -0400 Subject: ext4: bail out from make_indexed_dir() on first error When ext4_handle_dirty_dx_node() or ext4_handle_dirty_dirent_node() fail, there's really something wrong with the fs and there's no point in continuing further. Just return error from make_indexed_dir() in that case. Also initialize frames array so that if we return early due to error, dx_release() doesn't try to dereference uninitialized memory (which could happen also due to error in do_split()). Coverity-id: 741300 Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/namei.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 123798c5ac31..426211882f72 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1816,31 +1816,39 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; ext4fs_dirhash(name, namelen, &hinfo); + memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; frame->at = entries; frame->bh = bh; bh = bh2; - ext4_handle_dirty_dx_node(handle, dir, frame->bh); - ext4_handle_dirty_dirent_node(handle, dir, bh); + retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); + if (retval) + goto out_frames; + retval = ext4_handle_dirty_dirent_node(handle, dir, bh); + if (retval) + goto out_frames; de = do_split(handle,dir, &bh, frame, &hinfo); if (IS_ERR(de)) { - /* - * Even if the block split failed, we have to properly write - * out all the changes we did so far. Otherwise we can end up - * with corrupted filesystem. - */ - ext4_mark_inode_dirty(handle, dir); - dx_release(frames); - return PTR_ERR(de); + retval = PTR_ERR(de); + goto out_frames; } dx_release(frames); retval = add_dirent_to_buf(handle, dentry, inode, de, bh); brelse(bh); return retval; +out_frames: + /* + * Even if the block split failed, we have to properly write + * out all the changes we did so far. Otherwise we can end up + * with corrupted filesystem. + */ + ext4_mark_inode_dirty(handle, dir); + dx_release(frames); + return retval; } /* -- cgit v1.2.3 From 4f879ca687a5f2473b952937ce92c795a39019b4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 30 Oct 2014 10:53:17 -0400 Subject: ext4: bail early when clearing inode journal flag fails When clearing inode journal flag, we call jbd2_journal_flush() to force all the journalled data to their final locations. Currently we ignore when this fails and continue clearing inode journal flag. This isn't a big problem because when jbd2_journal_flush() fails, journal is likely aborted anyway. But it can still lead to somewhat confusing results so rather bail out early. Coverity-id: 989044 Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e9777f93cf05..3356ab5395f4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4959,7 +4959,12 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) if (val) ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); else { - jbd2_journal_flush(journal); + err = jbd2_journal_flush(journal); + if (err < 0) { + jbd2_journal_unlock_updates(journal); + ext4_inode_resume_unlocked_dio(inode); + return err; + } ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); -- cgit v1.2.3 From ae9e9c6aeea6f91ccb4fb369d7dd8f1a8b5f6a58 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 30 Oct 2014 10:53:17 -0400 Subject: ext4: make ext4_ext_convert_to_initialized() return proper number of blocks ext4_ext_convert_to_initialized() can return more blocks than are actually allocated from map->m_lblk in case where initial part of the on-disk extent is zeroed out. Luckily this doesn't have serious consequences because the caller currently uses the return value only to unmap metadata buffers. Anyway this is a data corruption/exposure problem waiting to happen so fix it. Coverity-id: 1226848 Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 37043d0b2be8..0b16fb4c06d3 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3603,11 +3603,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } } - allocated = ext4_split_extent(handle, inode, ppath, - &split_map, split_flag, flags); - if (allocated < 0) - err = allocated; - + err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, + flags); + if (err > 0) + err = 0; out: /* If we have gotten a failure, don't zero out status tree */ if (!err) -- cgit v1.2.3 From 69a91c237ab0ebe4e9fdeaf6d0090c85275594ec Mon Sep 17 00:00:00 2001 From: Eric Rannaud Date: Thu, 30 Oct 2014 01:51:01 -0700 Subject: fs: allow open(dir, O_TMPFILE|..., 0) with mode 0 The man page for open(2) indicates that when O_CREAT is specified, the 'mode' argument applies only to future accesses to the file: Note that this mode applies only to future accesses of the newly created file; the open() call that creates a read-only file may well return a read/write file descriptor. The man page for open(2) implies that 'mode' is treated identically by O_CREAT and O_TMPFILE. O_TMPFILE, however, behaves differently: int fd = open("/tmp", O_TMPFILE | O_RDWR, 0); assert(fd == -1); assert(errno == EACCES); int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600); assert(fd > 0); For O_CREAT, do_last() sets acc_mode to MAY_OPEN only: if (*opened & FILE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; will_truncate = false; acc_mode = MAY_OPEN; path_to_nameidata(path, nd); goto finish_open_created; } But for O_TMPFILE, do_tmpfile() passes the full op->acc_mode to may_open(). This patch lines up the behavior of O_TMPFILE with O_CREAT. After the inode is created, may_open() is called with acc_mode = MAY_OPEN, in do_tmpfile(). A different, but related glibc bug revealed the discrepancy: https://sourceware.org/bugzilla/show_bug.cgi?id=17523 The glibc lazily loads the 'mode' argument of open() and openat() using va_arg() only if O_CREAT is present in 'flags' (to support both the 2 argument and the 3 argument forms of open; same idea for openat()). However, the glibc ignores the 'mode' argument if O_TMPFILE is in 'flags'. On x86_64, for open(), it magically works anyway, as 'mode' is in RDX when entering open(), and is still in RDX on SYSCALL, which is where the kernel looks for the 3rd argument of a syscall. But openat() is not quite so lucky: 'mode' is in RCX when entering the glibc wrapper for openat(), while the kernel looks for the 4th argument of a syscall in R10. Indeed, the syscall calling convention differs from the regular calling convention in this respect on x86_64. So the kernel sees mode = 0 when trying to use glibc openat() with O_TMPFILE, and fails with EACCES. Signed-off-by: Eric Rannaud Acked-by: Andy Lutomirski Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 42df664e95e5..78512898d3ba 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3154,7 +3154,8 @@ static int do_tmpfile(int dfd, struct filename *pathname, if (error) goto out2; audit_inode(pathname, nd->path.dentry, 0); - error = may_open(&nd->path, op->acc_mode, op->open_flag); + /* Don't check for other permissions, the inode was just created */ + error = may_open(&nd->path, MAY_OPEN, op->open_flag); if (error) goto out2; file->f_path.mnt = nd->path.mnt; -- cgit v1.2.3 From 6e5aafb27419f32575b27ef9d6a31e5d54661aca Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 4 Nov 2014 06:59:04 -0800 Subject: Btrfs: fix kfree on list_head in btrfs_lookup_csums_range error cleanup If we hit any errors in btrfs_lookup_csums_range, we'll loop through all the csums we allocate and free them. But the code was using list_entry incorrectly, and ended up trying to free the on-stack list_head instead. This bug came from commit 0678b6185 btrfs: Don't BUG_ON kzalloc error in btrfs_lookup_csums_range() Signed-off-by: Chris Mason Reported-by: Erik Berg cc: stable@vger.kernel.org # 3.3 or newer --- fs/btrfs/file-item.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 783a94355efd..84a2d1868271 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -413,7 +413,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, ret = 0; fail: while (ret < 0 && !list_empty(&tmplist)) { - sums = list_entry(&tmplist, struct btrfs_ordered_sum, list); + sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list); list_del(&sums->list); kfree(sums); } -- cgit v1.2.3 From 809fd143de8805970eec02c27c0bc2622a6ecbda Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 23 Oct 2014 19:33:14 +0300 Subject: NFSv4: Ensure nfs_atomic_open set the dentry verifier on ENOENT If the OPEN rpc call to the server fails with an ENOENT call, nfs_atomic_open will create a negative dentry for that file, however it currently fails to call nfs_set_verifier(), thus causing the dentry to be immediately revalidated on the next call to nfs_lookup_revalidate() instead of following the usual lookup caching rules. Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 06e8cfcbb670..6e62155abf26 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1527,6 +1527,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, case -ENOENT: d_drop(dentry); d_add(dentry, NULL); + nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); break; case -EISDIR: case -ENOTDIR: -- cgit v1.2.3 From 7488cbc2568391d5e0b2bda8902a96b5dd7b1ea7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 23 Oct 2014 19:22:31 +0300 Subject: Revert "NFS: remove BUG possibility in nfs4_open_and_get_state" This reverts commit f39c01047994e66e7f3d89ddb4c6141f23349d8d. --- fs/nfs/nfs4proc.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 405bd95c1f58..8026197e2b9f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2233,13 +2233,9 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, ret = _nfs4_proc_open(opendata); if (ret != 0) { if (ret == -ENOENT) { - dentry = opendata->dentry; - if (dentry->d_inode) - d_delete(dentry); - else if (d_unhashed(dentry)) - d_add(dentry, NULL); - - nfs_set_verifier(dentry, + d_drop(opendata->dentry); + d_add(opendata->dentry, NULL); + nfs_set_verifier(opendata->dentry, nfs_save_change_attribute(opendata->dir->d_inode)); } goto out; -- cgit v1.2.3 From dca780016dab84d6ac500b1d84fdfe1628802a59 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 23 Oct 2014 19:23:03 +0300 Subject: Revert "NFS: nfs4_do_open should add negative results to the dcache." This reverts commit 4fa2c54b5198d09607a534e2fd436581064587ed. --- fs/nfs/nfs4proc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8026197e2b9f..41b8fcbfdadd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2231,15 +2231,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); ret = _nfs4_proc_open(opendata); - if (ret != 0) { - if (ret == -ENOENT) { - d_drop(opendata->dentry); - d_add(opendata->dentry, NULL); - nfs_set_verifier(opendata->dentry, - nfs_save_change_attribute(opendata->dir->d_inode)); - } + if (ret != 0) goto out; - } state = nfs4_opendata_to_nfs4_state(opendata); ret = PTR_ERR(state); -- cgit v1.2.3 From 3f822c6264954660babce757fb45792fd3af273e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 4 Nov 2014 16:11:03 +0100 Subject: ovl: don't poison cursor ovl_cache_put() can be called from ovl_dir_reset() if the cache needs to be rebuilt. We did list_del() on the cursor, which results in an Oops on the poisoned pointer in ovl_seek_cursor(). Reported-by: Jordi Pujol Palomer Signed-off-by: Miklos Szeredi Tested-by: Jordi Pujol Palomer Signed-off-by: Al Viro --- fs/overlayfs/readdir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 4e9d7c1fea52..2a7ef4f8e2a6 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -168,7 +168,7 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) { struct ovl_dir_cache *cache = od->cache; - list_del(&od->cursor.l_node); + list_del_init(&od->cursor.l_node); WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { -- cgit v1.2.3 From 7e8631e8b9d4e9f698c09c7e7309c96249180ff9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Nov 2014 15:18:29 -0500 Subject: fix breakage in o2net_send_tcp_msg() uninitialized msghdr. Broken in "ocfs2: don't open-code kernel_recvmsg()" by me ;-/ Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Al Viro --- fs/ocfs2/cluster/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 97de0fbd9f78..a96044004064 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -925,7 +925,7 @@ static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, size_t veclen, size_t total) { int ret; - struct msghdr msg; + struct msghdr msg = {.msg_flags = 0,}; if (sock == NULL) { ret = -EINVAL; -- cgit v1.2.3 From afa947cb52a8e73fe71915a0b0af6fcf98dfbe1a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 7 Nov 2014 08:29:57 +1100 Subject: xfs: bulkstat btree walk doesn't terminate The bulkstat code has several different ways of detecting the end of an AG when doing a walk. They are not consistently detected, and the code that checks for the end of AG conditions is not consistently coded. Hence the are conditions where the walk code can get stuck in an endless loop making no progress and not triggering any termination conditions. Convert all the "tmp/i" status return codes from btree operations to a common name (stat) and apply end-of-ag detection to these operations consistently. cc: # 3.17 Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 7765ff743e91..16737cbbee17 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -356,7 +356,6 @@ xfs_bulkstat( int end_of_ag; /* set if we've seen the ag end */ int error; /* error code */ int fmterror;/* bulkstat formatter result */ - int i; /* loop index */ int icount; /* count of inodes good in irbuf */ size_t irbsize; /* size of irec buffer in bytes */ xfs_ino_t ino; /* inode number (filesystem) */ @@ -366,11 +365,11 @@ xfs_bulkstat( xfs_ino_t lastino; /* last inode number returned */ int nirbuf; /* size of irbuf */ int rval; /* return value error code */ - int tmp; /* result value from btree calls */ int ubcount; /* size of user's buffer */ int ubleft; /* bytes left in user's buffer */ char __user *ubufp; /* pointer into user's buffer */ int ubelem; /* spaces used in user's buffer */ + int stat; /* * Get the last inode value, see if there's nothing to do. @@ -436,13 +435,15 @@ xfs_bulkstat( agino = r.ir_startino + XFS_INODES_PER_CHUNK; } /* Increment to the next record */ - error = xfs_btree_increment(cur, 0, &tmp); + error = xfs_btree_increment(cur, 0, &stat); } else { /* Start of ag. Lookup the first inode chunk */ - error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat); } - if (error) + if (error || stat == 0) { + end_of_ag = 1; goto del_cursor; + } /* * Loop through inode btree records in this ag, @@ -451,8 +452,8 @@ xfs_bulkstat( while (irbp < irbufend && icount < ubcount) { struct xfs_inobt_rec_incore r; - error = xfs_inobt_get_rec(cur, &r, &i); - if (error || i == 0) { + error = xfs_inobt_get_rec(cur, &r, &stat); + if (error || stat == 0) { end_of_ag = 1; goto del_cursor; } @@ -473,8 +474,8 @@ xfs_bulkstat( * Set agino to after this chunk and bump the cursor. */ agino = r.ir_startino + XFS_INODES_PER_CHUNK; - error = xfs_btree_increment(cur, 0, &tmp); - if (error) { + error = xfs_btree_increment(cur, 0, &stat); + if (error || stat == 0) { end_of_ag = 1; goto del_cursor; } -- cgit v1.2.3 From bf4a5af20d25ecc8876978ad34b8db83b4235f3c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 7 Nov 2014 08:30:30 +1100 Subject: xfs: bulkstat chunk formatting cursor is broken The xfs_bulkstat_agichunk formatting cursor takes buffer values from the main loop and passes them via the structure to the chunk formatter, and the writes the changed values back into the main loop local variables. Unfortunately, this complex dance is full of corner cases that aren't handled correctly. The biggest problem is that it is double handling the information in both the main loop and the chunk formatting function, leading to inconsistent updates and endless loops where progress is not made. To fix this, push the struct xfs_bulkstat_agichunk outwards to be the primary holder of user buffer information. this removes the double handling in the main loop. Also, pass the last inode processed by the chunk formatter as a separate parameter as it purely an output variable and is not related to the user buffer consumption cursor. Finally, the chunk formatting code is not shared by anyone, so make it local to xfs_itable.c. cc: # 3.17 Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 59 +++++++++++++++++++++++++---------------------------- fs/xfs/xfs_itable.h | 16 --------------- 2 files changed, 28 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 16737cbbee17..50a3e5995dd9 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -262,20 +262,26 @@ xfs_bulkstat_grab_ichunk( #define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) +struct xfs_bulkstat_agichunk { + char __user **ac_ubuffer;/* pointer into user's buffer */ + int ac_ubleft; /* bytes left in user's buffer */ + int ac_ubelem; /* spaces used in user's buffer */ +}; + /* * Process inodes in chunk with a pointer to a formatter function * that will iget the inode and fill in the appropriate structure. */ -int +static int xfs_bulkstat_ag_ichunk( struct xfs_mount *mp, xfs_agnumber_t agno, struct xfs_inobt_rec_incore *irbp, bulkstat_one_pf formatter, size_t statstruct_size, - struct xfs_bulkstat_agichunk *acp) + struct xfs_bulkstat_agichunk *acp, + xfs_ino_t *lastino) { - xfs_ino_t lastino = acp->ac_lastino; char __user **ubufp = acp->ac_ubuffer; int ubleft = acp->ac_ubleft; int ubelem = acp->ac_ubelem; @@ -295,7 +301,7 @@ xfs_bulkstat_ag_ichunk( /* Skip if this inode is free */ if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { - lastino = ino; + *lastino = ino; continue; } @@ -313,7 +319,7 @@ xfs_bulkstat_ag_ichunk( ubleft = 0; break; } - lastino = ino; + *lastino = ino; continue; } if (fmterror == BULKSTAT_RV_GIVEUP) { @@ -325,10 +331,9 @@ xfs_bulkstat_ag_ichunk( *ubufp += ubused; ubleft -= ubused; ubelem++; - lastino = ino; + *lastino = ino; } - acp->ac_lastino = lastino; acp->ac_ubleft = ubleft; acp->ac_ubelem = ubelem; @@ -355,7 +360,6 @@ xfs_bulkstat( xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ int end_of_ag; /* set if we've seen the ag end */ int error; /* error code */ - int fmterror;/* bulkstat formatter result */ int icount; /* count of inodes good in irbuf */ size_t irbsize; /* size of irec buffer in bytes */ xfs_ino_t ino; /* inode number (filesystem) */ @@ -366,10 +370,8 @@ xfs_bulkstat( int nirbuf; /* size of irbuf */ int rval; /* return value error code */ int ubcount; /* size of user's buffer */ - int ubleft; /* bytes left in user's buffer */ - char __user *ubufp; /* pointer into user's buffer */ - int ubelem; /* spaces used in user's buffer */ int stat; + struct xfs_bulkstat_agichunk ac; /* * Get the last inode value, see if there's nothing to do. @@ -386,11 +388,13 @@ xfs_bulkstat( } ubcount = *ubcountp; /* statstruct's */ - ubleft = ubcount * statstruct_size; /* bytes */ - *ubcountp = ubelem = 0; + ac.ac_ubuffer = &ubuffer; + ac.ac_ubleft = ubcount * statstruct_size; /* bytes */; + ac.ac_ubelem = 0; + + *ubcountp = 0; *done = 0; - fmterror = 0; - ubufp = ubuffer; + irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); if (!irbuf) return -ENOMEM; @@ -402,7 +406,7 @@ xfs_bulkstat( * inode returned; 0 means start of the allocation group. */ rval = 0; - while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { + while (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft) && agno < mp->m_sb.sb_agcount) { cond_resched(); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); if (error) @@ -497,28 +501,21 @@ del_cursor: */ irbufend = irbp; for (irbp = irbuf; - irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) { - struct xfs_bulkstat_agichunk ac; - - ac.ac_lastino = lastino; - ac.ac_ubuffer = &ubuffer; - ac.ac_ubleft = ubleft; - ac.ac_ubelem = ubelem; + irbp < irbufend && XFS_BULKSTAT_UBLEFT(ac.ac_ubleft); + irbp++) { error = xfs_bulkstat_ag_ichunk(mp, agno, irbp, - formatter, statstruct_size, &ac); + formatter, statstruct_size, &ac, + &lastino); if (error) rval = error; - lastino = ac.ac_lastino; - ubleft = ac.ac_ubleft; - ubelem = ac.ac_ubelem; - cond_resched(); } + /* * Set up for the next loop iteration. */ - if (XFS_BULKSTAT_UBLEFT(ubleft)) { + if (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft)) { if (end_of_ag) { agno++; agino = 0; @@ -531,11 +528,11 @@ del_cursor: * Done, we're either out of filesystem or space to put the data. */ kmem_free(irbuf); - *ubcountp = ubelem; + *ubcountp = ac.ac_ubelem; /* * Found some inodes, return them now and return the error next time. */ - if (ubelem) + if (ac.ac_ubelem) rval = 0; if (agno >= mp->m_sb.sb_agcount) { /* diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index aaed08022eb9..6ea8b3912fa4 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -30,22 +30,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, int *ubused, int *stat); -struct xfs_bulkstat_agichunk { - xfs_ino_t ac_lastino; /* last inode returned */ - char __user **ac_ubuffer;/* pointer into user's buffer */ - int ac_ubleft; /* bytes left in user's buffer */ - int ac_ubelem; /* spaces used in user's buffer */ -}; - -int -xfs_bulkstat_ag_ichunk( - struct xfs_mount *mp, - xfs_agnumber_t agno, - struct xfs_inobt_rec_incore *irbp, - bulkstat_one_pf formatter, - size_t statstruct_size, - struct xfs_bulkstat_agichunk *acp); - /* * Values for stat return value. */ -- cgit v1.2.3 From 2b831ac6bc87d3cbcbb1a8816827b6923403e461 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 7 Nov 2014 08:30:58 +1100 Subject: xfs: bulkstat chunk-formatter has issues The loop construct has issues: - clustidx is completely unused, so remove it. - the loop tries to be smart by terminating when the "freecount" tells it that all inodes are free. Just drop it as in most cases we have to scan all inodes in the chunk anyway. - move the "user buffer left" condition check to the only point where we consume space int eh user buffer. - move the initialisation of agino out of the loop, leaving just a simple loop control logic using the clusteridx. Also, double handling of the user buffer variables leads to problems tracking the current state - use the cursor variables directly rather than keeping local copies and then having to update the cursor before returning. cc: # 3.17 Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 58 ++++++++++++++++++++++------------------------------- 1 file changed, 24 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 50a3e5995dd9..7ea2b113db1b 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -283,59 +283,49 @@ xfs_bulkstat_ag_ichunk( xfs_ino_t *lastino) { char __user **ubufp = acp->ac_ubuffer; - int ubleft = acp->ac_ubleft; - int ubelem = acp->ac_ubelem; - int chunkidx, clustidx; + int chunkidx; int error = 0; xfs_agino_t agino; - for (agino = irbp->ir_startino, chunkidx = clustidx = 0; - XFS_BULKSTAT_UBLEFT(ubleft) && - irbp->ir_freecount < XFS_INODES_PER_CHUNK; - chunkidx++, clustidx++, agino++) { - int fmterror; /* bulkstat formatter result */ + agino = irbp->ir_startino; + for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; + chunkidx++, agino++) { + int fmterror; int ubused; xfs_ino_t ino = XFS_AGINO_TO_INO(mp, agno, agino); - ASSERT(chunkidx < XFS_INODES_PER_CHUNK); - /* Skip if this inode is free */ if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { *lastino = ino; continue; } - /* - * Count used inodes as free so we can tell when the - * chunk is used up. - */ - irbp->ir_freecount++; - /* Get the inode and fill in a single buffer */ ubused = statstruct_size; - error = formatter(mp, ino, *ubufp, ubleft, &ubused, &fmterror); - if (fmterror == BULKSTAT_RV_NOTHING) { - if (error && error != -ENOENT && error != -EINVAL) { - ubleft = 0; - break; - } - *lastino = ino; - continue; - } - if (fmterror == BULKSTAT_RV_GIVEUP) { - ubleft = 0; + error = formatter(mp, ino, *ubufp, acp->ac_ubleft, + &ubused, &fmterror); + if (fmterror == BULKSTAT_RV_GIVEUP || + (error && error != -ENOENT && error != -EINVAL)) { + acp->ac_ubleft = 0; ASSERT(error); break; } - if (*ubufp) - *ubufp += ubused; - ubleft -= ubused; - ubelem++; + + /* be careful not to leak error if at end of chunk */ + if (fmterror == BULKSTAT_RV_NOTHING || error) { + *lastino = ino; + error = 0; + continue; + } + + *ubufp += ubused; + acp->ac_ubleft -= ubused; + acp->ac_ubelem++; *lastino = ino; - } - acp->ac_ubleft = ubleft; - acp->ac_ubelem = ubelem; + if (acp->ac_ubleft < statstruct_size) + break; + } return error; } -- cgit v1.2.3 From 6e57c542cb7e0e580eb53ae76a77875c7d92b4b1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 7 Nov 2014 08:31:13 +1100 Subject: xfs: bulkstat main loop logic is a mess There are a bunch of variables tha tare more wildy scoped than they need to be, obfuscated user buffer checks and tortured "next inode" tracking. This all needs cleaning up to expose the real issues that need fixing. cc: # 3.17 Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 56 +++++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 7ea2b113db1b..acae3355ab22 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -348,30 +348,23 @@ xfs_bulkstat( xfs_agino_t agino; /* inode # in allocation group */ xfs_agnumber_t agno; /* allocation group number */ xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ - int end_of_ag; /* set if we've seen the ag end */ - int error; /* error code */ - int icount; /* count of inodes good in irbuf */ size_t irbsize; /* size of irec buffer in bytes */ - xfs_ino_t ino; /* inode number (filesystem) */ - xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */ xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ - xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ xfs_ino_t lastino; /* last inode number returned */ int nirbuf; /* size of irbuf */ int rval; /* return value error code */ int ubcount; /* size of user's buffer */ - int stat; struct xfs_bulkstat_agichunk ac; + int error = 0; /* * Get the last inode value, see if there's nothing to do. */ - ino = (xfs_ino_t)*lastinop; - lastino = ino; - agno = XFS_INO_TO_AGNO(mp, ino); - agino = XFS_INO_TO_AGINO(mp, ino); + lastino = *lastinop; + agno = XFS_INO_TO_AGNO(mp, lastino); + agino = XFS_INO_TO_AGINO(mp, lastino); if (agno >= mp->m_sb.sb_agcount || - ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + lastino != XFS_AGINO_TO_INO(mp, agno, agino)) { *done = 1; *ubcountp = 0; return 0; @@ -396,8 +389,13 @@ xfs_bulkstat( * inode returned; 0 means start of the allocation group. */ rval = 0; - while (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft) && agno < mp->m_sb.sb_agcount) { - cond_resched(); + while (agno < mp->m_sb.sb_agcount) { + struct xfs_inobt_rec_incore *irbp = irbuf; + struct xfs_inobt_rec_incore *irbufend = irbuf + nirbuf; + bool end_of_ag = false; + int icount = 0; + int stat; + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); if (error) break; @@ -407,10 +405,6 @@ xfs_bulkstat( */ cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO); - irbp = irbuf; - irbufend = irbuf + nirbuf; - end_of_ag = 0; - icount = 0; if (agino > 0) { /* * In the middle of an allocation group, we need to get @@ -435,7 +429,7 @@ xfs_bulkstat( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat); } if (error || stat == 0) { - end_of_ag = 1; + end_of_ag = true; goto del_cursor; } @@ -448,7 +442,7 @@ xfs_bulkstat( error = xfs_inobt_get_rec(cur, &r, &stat); if (error || stat == 0) { - end_of_ag = 1; + end_of_ag = true; goto del_cursor; } @@ -470,7 +464,7 @@ xfs_bulkstat( agino = r.ir_startino + XFS_INODES_PER_CHUNK; error = xfs_btree_increment(cur, 0, &stat); if (error || stat == 0) { - end_of_ag = 1; + end_of_ag = true; goto del_cursor; } cond_resched(); @@ -491,7 +485,7 @@ del_cursor: */ irbufend = irbp; for (irbp = irbuf; - irbp < irbufend && XFS_BULKSTAT_UBLEFT(ac.ac_ubleft); + irbp < irbufend && ac.ac_ubleft >= statstruct_size; irbp++) { error = xfs_bulkstat_ag_ichunk(mp, agno, irbp, formatter, statstruct_size, &ac, @@ -502,17 +496,15 @@ del_cursor: cond_resched(); } - /* - * Set up for the next loop iteration. - */ - if (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft)) { - if (end_of_ag) { - agno++; - agino = 0; - } else - agino = XFS_INO_TO_AGINO(mp, lastino); - } else + /* If we've run out of space, we are done */ + if (ac.ac_ubleft < statstruct_size) break; + + if (end_of_ag) { + agno++; + agino = 0; + } else + agino = XFS_INO_TO_AGINO(mp, lastino); } /* * Done, we're either out of filesystem or space to put the data. -- cgit v1.2.3 From febe3cbe38b0bc0a925906dc90e8d59048851f87 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 7 Nov 2014 08:31:15 +1100 Subject: xfs: bulkstat error handling is broken The error propagation is a horror - xfs_bulkstat() returns a rval variable which is only set if there are formatter errors. Any sort of btree walk error or corruption will cause the bulkstat walk to terminate but will not pass an error back to userspace. Worse is the fact that formatter errors will also be ignored if any inodes were correctly formatted into the user buffer. Hence bulkstat can fail badly yet still report success to userspace. This causes significant issues with xfsdump not dumping everything in the filesystem yet reporting success. It's not until a restore fails that there is any indication that the dump was bad and tha bulkstat failed. This patch now triggers xfsdump to fail with bulkstat errors rather than silently missing files in the dump. This now causes bulkstat to fail when the lastino cookie does not fall inside an existing inode chunk. The pre-3.17 code tolerated that error by allowing the code to move to the next inode chunk as the agino target is guaranteed to fall into the next btree record. With the fixes up to this point in the series, xfsdump now passes on the troublesome filesystem image that exposes all these bugs. cc: Signed-off-by: Dave Chinner Reviewed-by: Brian Foster --- fs/xfs/xfs_itable.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index acae3355ab22..ff3f431671b9 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -236,8 +236,10 @@ xfs_bulkstat_grab_ichunk( XFS_WANT_CORRUPTED_RETURN(stat == 1); /* Check if the record contains the inode in request */ - if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) - return -EINVAL; + if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) { + *icount = 0; + return 0; + } idx = agino - irec->ir_startino + 1; if (idx < XFS_INODES_PER_CHUNK && @@ -352,7 +354,6 @@ xfs_bulkstat( xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ xfs_ino_t lastino; /* last inode number returned */ int nirbuf; /* size of irbuf */ - int rval; /* return value error code */ int ubcount; /* size of user's buffer */ struct xfs_bulkstat_agichunk ac; int error = 0; @@ -388,7 +389,6 @@ xfs_bulkstat( * Loop over the allocation groups, starting from the last * inode returned; 0 means start of the allocation group. */ - rval = 0; while (agno < mp->m_sb.sb_agcount) { struct xfs_inobt_rec_incore *irbp = irbuf; struct xfs_inobt_rec_incore *irbufend = irbuf + nirbuf; @@ -491,13 +491,16 @@ del_cursor: formatter, statstruct_size, &ac, &lastino); if (error) - rval = error; + break; cond_resched(); } - /* If we've run out of space, we are done */ - if (ac.ac_ubleft < statstruct_size) + /* + * If we've run out of space or had a formatting error, we + * are now done + */ + if (ac.ac_ubleft < statstruct_size || error) break; if (end_of_ag) { @@ -511,11 +514,17 @@ del_cursor: */ kmem_free(irbuf); *ubcountp = ac.ac_ubelem; + /* - * Found some inodes, return them now and return the error next time. + * We found some inodes, so clear the error status and return them. + * The lastino pointer will point directly at the inode that triggered + * any error that occurred, so on the next call the error will be + * triggered again and propagated to userspace as there will be no + * formatted inodes in the buffer. */ if (ac.ac_ubelem) - rval = 0; + error = 0; + if (agno >= mp->m_sb.sb_agcount) { /* * If we ran out of filesystem, mark lastino as off @@ -527,7 +536,7 @@ del_cursor: } else *lastinop = (xfs_ino_t)lastino; - return rval; + return error; } int -- cgit v1.2.3 From 002758992693ae63c04122603ea9261a0a58d728 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 7 Nov 2014 08:33:52 +1100 Subject: xfs: track bulkstat progress by agino The bulkstat main loop progress is tracked by the "lastino" variable, which is a full 64 bit inode. However, the loop actually works on agno/agino pairs, and so there's a significant disconnect between the rest of the loop and the main cursor. Convert this to use the agino, and pass the agino into the chunk formatting function and convert it too. This gets rid of the inconsistency in the loop processing, and finally makes it simple for us to skip inodes at any point in the loop simply by incrementing the agino cursor. cc: # 3.17 Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 71 +++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index ff3f431671b9..894924a5129b 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -282,30 +282,31 @@ xfs_bulkstat_ag_ichunk( bulkstat_one_pf formatter, size_t statstruct_size, struct xfs_bulkstat_agichunk *acp, - xfs_ino_t *lastino) + xfs_agino_t *last_agino) { char __user **ubufp = acp->ac_ubuffer; int chunkidx; int error = 0; - xfs_agino_t agino; + xfs_agino_t agino = irbp->ir_startino; - agino = irbp->ir_startino; for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; chunkidx++, agino++) { int fmterror; int ubused; - xfs_ino_t ino = XFS_AGINO_TO_INO(mp, agno, agino); + + /* inode won't fit in buffer, we are done */ + if (acp->ac_ubleft < statstruct_size) + break; /* Skip if this inode is free */ - if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { - *lastino = ino; + if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) continue; - } /* Get the inode and fill in a single buffer */ ubused = statstruct_size; - error = formatter(mp, ino, *ubufp, acp->ac_ubleft, - &ubused, &fmterror); + error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino), + *ubufp, acp->ac_ubleft, &ubused, &fmterror); + if (fmterror == BULKSTAT_RV_GIVEUP || (error && error != -ENOENT && error != -EINVAL)) { acp->ac_ubleft = 0; @@ -315,7 +316,6 @@ xfs_bulkstat_ag_ichunk( /* be careful not to leak error if at end of chunk */ if (fmterror == BULKSTAT_RV_NOTHING || error) { - *lastino = ino; error = 0; continue; } @@ -323,12 +323,18 @@ xfs_bulkstat_ag_ichunk( *ubufp += ubused; acp->ac_ubleft -= ubused; acp->ac_ubelem++; - *lastino = ino; - - if (acp->ac_ubleft < statstruct_size) - break; } + /* + * Post-update *last_agino. At this point, agino will always point one + * inode past the last inode we processed successfully. Hence we + * substract that inode when setting the *last_agino cursor so that we + * return the correct cookie to userspace. On the next bulkstat call, + * the inode under the lastino cookie will be skipped as we have already + * processed it here. + */ + *last_agino = agino - 1; + return error; } @@ -352,7 +358,6 @@ xfs_bulkstat( xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ size_t irbsize; /* size of irec buffer in bytes */ xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ - xfs_ino_t lastino; /* last inode number returned */ int nirbuf; /* size of irbuf */ int ubcount; /* size of user's buffer */ struct xfs_bulkstat_agichunk ac; @@ -361,11 +366,10 @@ xfs_bulkstat( /* * Get the last inode value, see if there's nothing to do. */ - lastino = *lastinop; - agno = XFS_INO_TO_AGNO(mp, lastino); - agino = XFS_INO_TO_AGINO(mp, lastino); + agno = XFS_INO_TO_AGNO(mp, *lastinop); + agino = XFS_INO_TO_AGINO(mp, *lastinop); if (agno >= mp->m_sb.sb_agcount || - lastino != XFS_AGINO_TO_INO(mp, agno, agino)) { + *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) { *done = 1; *ubcountp = 0; return 0; @@ -420,7 +424,6 @@ xfs_bulkstat( irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; irbp++; - agino = r.ir_startino + XFS_INODES_PER_CHUNK; } /* Increment to the next record */ error = xfs_btree_increment(cur, 0, &stat); @@ -458,10 +461,6 @@ xfs_bulkstat( irbp++; icount += XFS_INODES_PER_CHUNK - r.ir_freecount; } - /* - * Set agino to after this chunk and bump the cursor. - */ - agino = r.ir_startino + XFS_INODES_PER_CHUNK; error = xfs_btree_increment(cur, 0, &stat); if (error || stat == 0) { end_of_ag = true; @@ -481,7 +480,9 @@ del_cursor: if (error) break; /* - * Now format all the good inodes into the user's buffer. + * Now format all the good inodes into the user's buffer. The + * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer + * for the next loop iteration. */ irbufend = irbp; for (irbp = irbuf; @@ -489,7 +490,7 @@ del_cursor: irbp++) { error = xfs_bulkstat_ag_ichunk(mp, agno, irbp, formatter, statstruct_size, &ac, - &lastino); + &agino); if (error) break; @@ -506,8 +507,7 @@ del_cursor: if (end_of_ag) { agno++; agino = 0; - } else - agino = XFS_INO_TO_AGINO(mp, lastino); + } } /* * Done, we're either out of filesystem or space to put the data. @@ -525,16 +525,13 @@ del_cursor: if (ac.ac_ubelem) error = 0; - if (agno >= mp->m_sb.sb_agcount) { - /* - * If we ran out of filesystem, mark lastino as off - * the end of the filesystem, so the next call - * will return immediately. - */ - *lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0); + /* + * If we ran out of filesystem, lastino will point off the end of + * the filesystem so the next call will return immediately. + */ + *lastinop = XFS_AGINO_TO_INO(mp, agno, agino); + if (agno >= mp->m_sb.sb_agcount) *done = 1; - } else - *lastinop = (xfs_ino_t)lastino; return error; } -- cgit v1.2.3 From 8c393f9a721c30a030049a680e1bf896669bb279 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Wed, 5 Nov 2014 22:36:50 +0800 Subject: nfs: fix pnfs direct write memory leak For pNFS direct writes, layout driver may dynamically allocate ds_cinfo.buckets. So we need to take care to free them when freeing dreq. Ideally this needs to be done inside layout driver where ds_cinfo.buckets are allocated. But buckets are attached to dreq and reused across LD IO iterations. So I feel it's OK to free them in the generic layer. Cc: stable@vger.kernel.org [v3.4+] Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 1 + include/linux/nfs_xdr.h | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 20cffc830468..10bf07280f4a 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -266,6 +266,7 @@ static void nfs_direct_req_free(struct kref *kref) { struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); + nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo); if (dreq->l_ctx != NULL) nfs_put_lock_context(dreq->l_ctx); if (dreq->ctx != NULL) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 983876f24aed..47ebb4fafd87 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1224,11 +1224,22 @@ struct nfs41_free_stateid_res { unsigned int status; }; +static inline void +nfs_free_pnfs_ds_cinfo(struct pnfs_ds_commit_info *cinfo) +{ + kfree(cinfo->buckets); +} + #else struct pnfs_ds_commit_info { }; +static inline void +nfs_free_pnfs_ds_cinfo(struct pnfs_ds_commit_info *cinfo) +{ +} + #endif /* CONFIG_NFS_V4_1 */ #ifdef CONFIG_NFS_V4_2 -- cgit v1.2.3 From e0d4ed71ca0344494722a041780f004d2bcf0f11 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Sep 2014 16:02:50 +0200 Subject: pnfs/blocklayout: serialize GETDEVICEINFO calls The rpc_pipefs code isn't thread safe, leading to occasional use after frees when running xfstests generic/241 (dbench). Signed-off-by: Christoph Hellwig Link: http://lkml.kernel.org/r/1411740170-18611-2-git-send-email-hch@lst.de Cc: stable@vger.kernel.org # 3.17.x Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/rpc_pipefs.c | 14 +++++++++----- fs/nfs/netns.h | 1 + 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c index e966c023b1b7..acbf9ca4018c 100644 --- a/fs/nfs/blocklayout/rpc_pipefs.c +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -65,17 +65,18 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); + mutex_lock(&nn->bl_mutex); bl_pipe_msg.bl_wq = &nn->bl_wq; b->simple.len += 4; /* single volume */ if (b->simple.len > PAGE_SIZE) - return -EIO; + goto out_unlock; memset(msg, 0, sizeof(*msg)); msg->len = sizeof(*bl_msg) + b->simple.len; msg->data = kzalloc(msg->len, gfp_mask); if (!msg->data) - goto out; + goto out_free_data; bl_msg = msg->data; bl_msg->type = BL_DEVICE_MOUNT, @@ -87,7 +88,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, rc = rpc_queue_upcall(nn->bl_device_pipe, msg); if (rc < 0) { remove_wait_queue(&nn->bl_wq, &wq); - goto out; + goto out_free_data; } set_current_state(TASK_UNINTERRUPTIBLE); @@ -97,12 +98,14 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, if (reply->status != BL_DEVICE_REQUEST_PROC) { printk(KERN_WARNING "%s failed to decode device: %d\n", __func__, reply->status); - goto out; + goto out_free_data; } dev = MKDEV(reply->major, reply->minor); -out: +out_free_data: kfree(msg->data); +out_unlock: + mutex_unlock(&nn->bl_mutex); return dev; } @@ -232,6 +235,7 @@ static int nfs4blocklayout_net_init(struct net *net) struct nfs_net *nn = net_generic(net, nfs_net_id); struct dentry *dentry; + mutex_init(&nn->bl_mutex); init_waitqueue_head(&nn->bl_wq); nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); if (IS_ERR(nn->bl_device_pipe)) diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index ef221fb8a183..f0e06e4acbef 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -19,6 +19,7 @@ struct nfs_net { struct rpc_pipe *bl_device_pipe; struct bl_dev_msg bl_mount_reply; wait_queue_head_t bl_wq; + struct mutex bl_mutex; struct list_head nfs_client_list; struct list_head nfs_volume_list; #if IS_ENABLED(CONFIG_NFS_V4) -- cgit v1.2.3 From 16c9914069536c77ed358d94b6e247bdc464b7f0 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Mon, 3 Nov 2014 15:19:45 -0500 Subject: nfs: remove spurious WARN_ON_ONCE in write path This WARN_ON_ONCE was supposed to catch reference counting bugs, but can trigger in inappropriate situations. This was reproducible using NFSv2 on an architecture with 64K pages -- we verified that it was not a reference counting bug and the warning was safe to ignore. Reported-by: Will Deacon Tested-by: Will Deacon Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 12493846a2d3..f83b02dc9166 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -715,8 +715,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) nfs_release_request(req); - else - WARN_ON_ONCE(1); } static void -- cgit v1.2.3 From b283f9445214d4d573906f919c70caccd27b74ea Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Oct 2014 13:32:10 +0200 Subject: nfs: Remove bogus assignment Commit 3a6fd1f004fc (pnfs/blocklayout: remove read-modify-write handling in bl_write_pagelist) introduced a bogus assignment pg_index = pg_index in variable initialization. AFAICS it's just a typo so remove it. Spotted by Coverity (id 1248711). CC: Christoph Hellwig Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 5228f201d3d5..4f46f7a05289 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -378,7 +378,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync) loff_t offset = header->args.offset; size_t count = header->args.count; struct page **pages = header->args.pages; - int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; + int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; unsigned int pg_len; struct blk_plug plug; int i; -- cgit v1.2.3 From 16caf5b6101d03335b386e77e9e14136f989be87 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Oct 2014 14:02:47 +0200 Subject: nfs: Fix use of uninitialized variable in nfs_getattr() Variable 'err' needn't be initialized when nfs_getattr() uses it to check whether it should call generic_fillattr() or not. That can result in spurious error returns. Initialize 'err' properly. Signed-off-by: Jan Kara Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6388a59f2add..00689a8a85e4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -626,7 +626,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; - int err; + int err = 0; trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ -- cgit v1.2.3 From e983120e923aa1c5d2aaf528331c298c88f3ab85 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 22 Oct 2014 15:53:10 -0400 Subject: NFS: SEEK is an NFS v4.2 feature Somehow the nfs_v4_1_minor_ops had the NFS_CAP_SEEK flag set, enabling SEEK over v4.1. This is wrong, and can make servers crash. Signed-off-by: Anna Schumaker Tested-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 41b8fcbfdadd..dca174ce8309 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8397,8 +8397,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 - | NFS_CAP_ATOMIC_OPEN_V1 - | NFS_CAP_SEEK, + | NFS_CAP_ATOMIC_OPEN_V1, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, @@ -8420,7 +8419,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 - | NFS_CAP_ATOMIC_OPEN_V1, + | NFS_CAP_ATOMIC_OPEN_V1 + | NFS_CAP_SEEK, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, -- cgit v1.2.3 From 4dfd4f7af0afd201706ad186352ca423b0f17d4b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Oct 2014 15:10:25 +0300 Subject: NFSv4: Ensure that we remove NFSv4.0 delegations when state has expired NFSv4.0 does not have TEST_STATEID/FREE_STATEID functionality, so unlike NFSv4.1, the recovery procedure when stateids have expired or have been revoked requires us to just forget the delegation. http://lkml.kernel.org/r/CAN-5tyHwG=Cn2Q9KsHWadewjpTTy_K26ee+UnSvHvG4192p-Xw@mail.gmail.com Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index dca174ce8309..bdd880bddba4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2109,6 +2109,28 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta return ret; } +static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state) +{ + nfs_remove_bad_delegation(state->inode); + write_seqlock(&state->seqlock); + nfs4_stateid_copy(&state->stateid, &state->open_stateid); + write_sequnlock(&state->seqlock); + clear_bit(NFS_DELEGATED_STATE, &state->flags); +} + +static void nfs40_clear_delegation_stateid(struct nfs4_state *state) +{ + if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL) + nfs_finish_clear_delegation_stateid(state); +} + +static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state) +{ + /* NFSv4.0 doesn't allow for delegation recovery on open expire */ + nfs40_clear_delegation_stateid(state); + return nfs4_open_expired(sp, state); +} + #if defined(CONFIG_NFS_V4_1) static void nfs41_clear_delegation_stateid(struct nfs4_state *state) { @@ -8330,7 +8352,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = { static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = { .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE, .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE, - .recover_open = nfs4_open_expired, + .recover_open = nfs40_open_expired, .recover_lock = nfs4_lock_expired, .establish_clid = nfs4_init_clientid, }; -- cgit v1.2.3 From 0c116cadd94b16b30b1dd90d38b2784d9b39b01a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Nov 2014 14:44:49 -0500 Subject: NFSv4.1: nfs41_clear_delegation_stateid shouldn't trust NFS_DELEGATED_STATE This patch removes the assumption made previously, that we only need to check the delegation stateid when it matches the stateid on a cached open. If we believe that we hold a delegation for this file, then we must assume that its stateid may have been revoked or expired too. If we don't test it then our state recovery process may end up caching open/lock state in a situation where it should not. We therefore rename the function nfs41_clear_delegation_stateid as nfs41_check_delegation_stateid, and change it to always run through the delegation stateid test and recovery process as outlined in RFC5661. http://lkml.kernel.org/r/CAN-5tyHwG=Cn2Q9KsHWadewjpTTy_K26ee+UnSvHvG4192p-Xw@mail.gmail.com Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bdd880bddba4..3b98fe752ef8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2132,45 +2132,37 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st } #if defined(CONFIG_NFS_V4_1) -static void nfs41_clear_delegation_stateid(struct nfs4_state *state) +static void nfs41_check_delegation_stateid(struct nfs4_state *state) { struct nfs_server *server = NFS_SERVER(state->inode); - nfs4_stateid *stateid = &state->stateid; + nfs4_stateid stateid; struct nfs_delegation *delegation; - struct rpc_cred *cred = NULL; - int status = -NFS4ERR_BAD_STATEID; - - /* If a state reset has been done, test_stateid is unneeded */ - if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) - return; + struct rpc_cred *cred; + int status; /* Get the delegation credential for use by test/free_stateid */ rcu_read_lock(); delegation = rcu_dereference(NFS_I(state->inode)->delegation); - if (delegation != NULL && - nfs4_stateid_match(&delegation->stateid, stateid)) { - cred = get_rpccred(delegation->cred); - rcu_read_unlock(); - status = nfs41_test_stateid(server, stateid, cred); - trace_nfs4_test_delegation_stateid(state, NULL, status); - } else + if (delegation == NULL) { rcu_read_unlock(); + return; + } + + nfs4_stateid_copy(&stateid, &delegation->stateid); + cred = get_rpccred(delegation->cred); + rcu_read_unlock(); + status = nfs41_test_stateid(server, &stateid, cred); + trace_nfs4_test_delegation_stateid(state, NULL, status); if (status != NFS_OK) { /* Free the stateid unless the server explicitly * informs us the stateid is unrecognized. */ if (status != -NFS4ERR_BAD_STATEID) - nfs41_free_stateid(server, stateid, cred); - nfs_remove_bad_delegation(state->inode); - - write_seqlock(&state->seqlock); - nfs4_stateid_copy(&state->stateid, &state->open_stateid); - write_sequnlock(&state->seqlock); - clear_bit(NFS_DELEGATED_STATE, &state->flags); + nfs41_free_stateid(server, &stateid, cred); + nfs_finish_clear_delegation_stateid(state); } - if (cred != NULL) - put_rpccred(cred); + put_rpccred(cred); } /** @@ -2214,7 +2206,7 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st { int status; - nfs41_clear_delegation_stateid(state); + nfs41_check_delegation_stateid(state); status = nfs41_check_open_stateid(state); if (status != NFS_OK) status = nfs4_open_expired(sp, state); -- cgit v1.2.3 From 869f9dfa4d6d57b79e0afc3af14772c2a023eeb1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 10 Nov 2014 18:43:56 -0500 Subject: NFSv4: Fix races between nfs_remove_bad_delegation() and delegation return Any attempt to call nfs_remove_bad_delegation() while a delegation is being returned is currently a no-op. This means that we can end up looping forever in nfs_end_delegation_return() if something causes the delegation to be revoked. This patch adds a mechanism whereby the state recovery code can communicate to the delegation return code that the delegation is no longer valid and that it should not be used when reclaiming state. It also changes the return value for nfs4_handle_delegation_recall_error() to ensure that nfs_end_delegation_return() does not reattempt the lock reclaim before state recovery is done. http://lkml.kernel.org/r/CAN-5tyHwG=Cn2Q9KsHWadewjpTTy_K26ee+UnSvHvG4192p-Xw@mail.gmail.com Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 23 +++++++++++++++++++++-- fs/nfs/delegation.h | 1 + fs/nfs/nfs4proc.c | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 5853f53db732..e5f473d13e24 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -193,7 +193,11 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation * { int res = 0; - res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync); + if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) + res = nfs4_proc_delegreturn(inode, + delegation->cred, + &delegation->stateid, + issync); nfs_free_delegation(delegation); return res; } @@ -380,11 +384,13 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); - int err; + int err = 0; if (delegation == NULL) return 0; do { + if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) + break; err = nfs_delegation_claim_opens(inode, &delegation->stateid); if (!issync || err != -EAGAIN) break; @@ -605,10 +611,23 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl rcu_read_unlock(); } +static void nfs_revoke_delegation(struct inode *inode) +{ + struct nfs_delegation *delegation; + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL) { + set_bit(NFS_DELEGATION_REVOKED, &delegation->flags); + nfs_mark_return_delegation(NFS_SERVER(inode), delegation); + } + rcu_read_unlock(); +} + void nfs_remove_bad_delegation(struct inode *inode) { struct nfs_delegation *delegation; + nfs_revoke_delegation(inode); delegation = nfs_inode_detach_delegation(inode); if (delegation) { nfs_inode_find_state_and_recover(inode, &delegation->stateid); diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 5c1cce39297f..e3c20a3ccc93 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -31,6 +31,7 @@ enum { NFS_DELEGATION_RETURN_IF_CLOSED, NFS_DELEGATION_REFERENCED, NFS_DELEGATION_RETURNING, + NFS_DELEGATION_REVOKED, }; int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3b98fe752ef8..4b7166f4e1cf 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1654,7 +1654,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct nfs_inode_find_state_and_recover(state->inode, stateid); nfs4_schedule_stateid_recovery(server, state); - return 0; + return -EAGAIN; case -NFS4ERR_DELAY: case -NFS4ERR_GRACE: set_bit(NFS_DELEGATED_STATE, &state->flags); -- cgit v1.2.3 From c606bb8857921d3ecf4d353942d6cc7e116cc75a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Oct 2014 15:15:13 +0300 Subject: NFSv4: Ensure that we call FREE_STATEID when NFSv4.x stateids are revoked NFSv4.x (x>0) requires us to call TEST_STATEID+FREE_STATEID if a stateid is revoked. We will currently fail to do this if the stateid is a delegation. http://lkml.kernel.org/r/CAN-5tyHwG=Cn2Q9KsHWadewjpTTy_K26ee+UnSvHvG4192p-Xw@mail.gmail.com Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 3 --- fs/nfs/nfs4proc.c | 8 -------- 2 files changed, 11 deletions(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 46fab1cb455a..7afb52f6a25a 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -145,9 +145,6 @@ static int filelayout_async_handle_error(struct rpc_task *task, case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - if (state == NULL) - break; - nfs_remove_bad_delegation(state->inode); case -NFS4ERR_OPENMODE: if (state == NULL) break; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4b7166f4e1cf..69dc20a743f9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -370,11 +370,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) { - nfs_remove_bad_delegation(inode); - exception->retry = 1; - break; - } if (state == NULL) break; ret = nfs4_schedule_stateid_recovery(server, state); @@ -4844,9 +4839,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - if (state == NULL) - break; - nfs_remove_bad_delegation(state->inode); case -NFS4ERR_OPENMODE: if (state == NULL) break; -- cgit v1.2.3 From f8ebf7a8ca35dde321f0cd385fee6f1950609367 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 17 Oct 2014 23:02:52 +0300 Subject: NFS: Don't try to reclaim delegation open state if recovery failed If state recovery failed, then we should not attempt to reclaim delegated state. http://lkml.kernel.org/r/CAN-5tyHwG=Cn2Q9KsHWadewjpTTy_K26ee+UnSvHvG4192p-Xw@mail.gmail.com Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index e5f473d13e24..7f3f60641344 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -125,6 +125,8 @@ again: continue; if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) continue; + if (!nfs4_valid_open_stateid(state)) + continue; if (!nfs4_stateid_match(&state->stateid, stateid)) continue; get_nfs_open_context(ctx); -- cgit v1.2.3 From 3231300bb986947a6b74e7075d84a2f434e4d788 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 22 Oct 2014 17:13:26 -0700 Subject: ceph: fix flush tid comparision TID of cap flush ack is 64 bits, but ceph_inode_info::flushing_cap_tid is only 16 bits. 16 bits should be plenty to let the cap flush updates pipeline appropriately, but we need to cast in the proper direction when comparing these differently-sized versions. So downcast the 64-bits one to 16 bits. Reflects ceph.git commit a5184cf46a6e867287e24aeb731634828467cd98. Signed-off-by: Yan, Zheng Reviewed-by: Ilya Dryomov --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 659f2ea9e6f7..cefca661464b 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2638,7 +2638,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, for (i = 0; i < CEPH_CAP_BITS; i++) if ((dirty & (1 << i)) && - flush_tid == ci->i_cap_flush_tid[i]) + (u16)flush_tid == ci->i_cap_flush_tid[i]) cleaned |= 1 << i; dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," -- cgit v1.2.3 From 8edc6e1688fc8f02c8c1f53a2ec4928cb1055f4d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 13 Nov 2014 15:19:33 -0800 Subject: fanotify: fix notification of groups with inode & mount marks fsnotify() needs to merge inode and mount marks lists when notifying groups about events so that ignore masks from inode marks are reflected in mount mark notifications and groups are notified in proper order (according to priorities). Currently the sorting of the lists done by fsnotify_add_inode_mark() / fsnotify_add_vfsmount_mark() and fsnotify() differed which resulted ignore masks not being used in some cases. Fix the problem by always using the same comparison function when sorting / merging the mark lists. Thanks to Heinrich Schuchardt for improvements of my patch. Link: https://bugzilla.kernel.org/show_bug.cgi?id=87721 Signed-off-by: Jan Kara Reported-by: Heinrich Schuchardt Tested-by: Heinrich Schuchardt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fsnotify.c | 36 +++++++++++++++++++++--------------- fs/notify/fsnotify.h | 4 ++++ fs/notify/inode_mark.c | 8 +++----- fs/notify/mark.c | 36 ++++++++++++++++++++++++++++++++++++ fs/notify/vfsmount_mark.c | 8 +++----- 5 files changed, 67 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 9d3e9c50066a..89326acd4561 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -229,8 +229,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, &fsnotify_mark_srcu); } + /* + * We need to merge inode & vfsmount mark lists so that inode mark + * ignore masks are properly reflected for mount mark notifications. + * That's why this traversal is so complicated... + */ while (inode_node || vfsmount_node) { - inode_group = vfsmount_group = NULL; + inode_group = NULL; + inode_mark = NULL; + vfsmount_group = NULL; + vfsmount_mark = NULL; if (inode_node) { inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), @@ -244,21 +252,19 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, vfsmount_group = vfsmount_mark->group; } - if (inode_group > vfsmount_group) { - /* handle inode */ - ret = send_to_group(to_tell, inode_mark, NULL, mask, - data, data_is, cookie, file_name); - /* we didn't use the vfsmount_mark */ - vfsmount_group = NULL; - } else if (vfsmount_group > inode_group) { - ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, - data, data_is, cookie, file_name); - inode_group = NULL; - } else { - ret = send_to_group(to_tell, inode_mark, vfsmount_mark, - mask, data, data_is, cookie, - file_name); + if (inode_group && vfsmount_group) { + int cmp = fsnotify_compare_groups(inode_group, + vfsmount_group); + if (cmp > 0) { + inode_group = NULL; + inode_mark = NULL; + } else if (cmp < 0) { + vfsmount_group = NULL; + vfsmount_mark = NULL; + } } + ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask, + data, data_is, cookie, file_name); if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 9c0898c4cfe1..3b68b0ae0a97 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -12,6 +12,10 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group); /* protects reads of inode and vfsmount marks list */ extern struct srcu_struct fsnotify_mark_srcu; +/* compare two groups for sorting of marks lists */ +extern int fsnotify_compare_groups(struct fsnotify_group *a, + struct fsnotify_group *b); + extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, __u32 mask); /* add a mark to an inode */ diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index e8497144b323..dfbf5447eea4 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -194,6 +194,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, { struct fsnotify_mark *lmark, *last = NULL; int ret = 0; + int cmp; mark->flags |= FSNOTIFY_MARK_FLAG_INODE; @@ -219,11 +220,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, goto out; } - if (mark->group->priority < lmark->group->priority) - continue; - - if ((mark->group->priority == lmark->group->priority) && - (mark->group < lmark->group)) + cmp = fsnotify_compare_groups(lmark->group, mark->group); + if (cmp < 0) continue; hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d90deaa08e78..34c38fabf514 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -209,6 +209,42 @@ void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mas mark->ignored_mask = mask; } +/* + * Sorting function for lists of fsnotify marks. + * + * Fanotify supports different notification classes (reflected as priority of + * notification group). Events shall be passed to notification groups in + * decreasing priority order. To achieve this marks in notification lists for + * inodes and vfsmounts are sorted so that priorities of corresponding groups + * are descending. + * + * Furthermore correct handling of the ignore mask requires processing inode + * and vfsmount marks of each group together. Using the group address as + * further sort criterion provides a unique sorting order and thus we can + * merge inode and vfsmount lists of marks in linear time and find groups + * present in both lists. + * + * A return value of 1 signifies that b has priority over a. + * A return value of 0 signifies that the two marks have to be handled together. + * A return value of -1 signifies that a has priority over b. + */ +int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) +{ + if (a == b) + return 0; + if (!a) + return 1; + if (!b) + return -1; + if (a->priority < b->priority) + return 1; + if (a->priority > b->priority) + return -1; + if (a < b) + return 1; + return -1; +} + /* * Attach an initialized mark to a given group and fs object. * These marks may be used for the fsnotify backend to determine which diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index ac851e8376b1..faefa72a11eb 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -153,6 +153,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, struct mount *m = real_mount(mnt); struct fsnotify_mark *lmark, *last = NULL; int ret = 0; + int cmp; mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; @@ -178,11 +179,8 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, goto out; } - if (mark->group->priority < lmark->group->priority) - continue; - - if ((mark->group->priority == lmark->group->priority) && - (mark->group < lmark->group)) + cmp = fsnotify_compare_groups(lmark->group, mark->group); + if (cmp < 0) continue; hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list); -- cgit v1.2.3 From 4a7795d35e252f38298980530e01e21867f8f856 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 19 Nov 2014 15:50:34 +0800 Subject: vfs: fix reference leak in d_prune_aliases() In "d_prune_alias(): just lock the parent and call __dentry_kill()" the old dget + d_drop + dput has been replaced with lock_parent + __dentry_kill; unfortunately, dput() does more than just killing dentry - it also drops the reference to parent. New variant leaks that reference and needs dput(parent) after killing the child off. Signed-off-by: Yan, Zheng Signed-off-by: Al Viro --- fs/dcache.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 3ffef7f4e5cd..5bc72b07fde2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -778,6 +778,7 @@ restart: struct dentry *parent = lock_parent(dentry); if (likely(!dentry->d_lockref.count)) { __dentry_kill(dentry); + dput(parent); goto restart; } if (parent) -- cgit v1.2.3 From 7ca2f234404e738cc807ed87c43f9932513bc8c6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Nov 2014 15:11:24 +0100 Subject: isofs: avoid unused function warning With the isofs_hash() function removed, isofs_hash_ms() is the only user of isofs_hash_common(), but it's defined inside of an #ifdef, which triggers this gcc warning in ARM axm55xx_defconfig starting with v3.18-rc3: fs/isofs/inode.c:177:1: warning: 'isofs_hash_common' defined but not used [-Wunused-function] This patch moves the function inside of the same #ifdef section to avoid that warning, which seems the best compromise of a relatively harmless patch for a late -rc. Signed-off-by: Arnd Bergmann Fixes: b0afd8e5db7b ("isofs: don't bother with ->d_op for normal case") Signed-off-by: Al Viro --- fs/isofs/inode.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index fe839b915116..d67a16f2a45d 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -170,27 +170,6 @@ struct iso9660_options{ s32 sbsector; }; -/* - * Compute the hash for the isofs name corresponding to the dentry. - */ -static int -isofs_hash_common(struct qstr *qstr, int ms) -{ - const char *name; - int len; - - len = qstr->len; - name = qstr->name; - if (ms) { - while (len && name[len-1] == '.') - len--; - } - - qstr->hash = full_name_hash(name, len); - - return 0; -} - /* * Compute the hash for the isofs name corresponding to the dentry. */ @@ -263,6 +242,27 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry, } #ifdef CONFIG_JOLIET +/* + * Compute the hash for the isofs name corresponding to the dentry. + */ +static int +isofs_hash_common(struct qstr *qstr, int ms) +{ + const char *name; + int len; + + len = qstr->len; + name = qstr->name; + if (ms) { + while (len && name[len-1] == '.') + len--; + } + + qstr->hash = full_name_hash(name, len); + + return 0; +} + static int isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr) { -- cgit v1.2.3 From f82c458a2c3ffb94b431fc6ad791a79df1b3713e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 19 Nov 2014 10:25:09 -0800 Subject: btrfs: fix lockups from btrfs_clear_path_blocking The fair reader/writer locks mean that btrfs_clear_path_blocking needs to strictly follow lock ordering rules even when we already have blocking locks on a given path. Before we can clear a blocking lock on the path, we need to make sure all of the locks have been converted to blocking. This will remove lock inversions against anyone spinning in write_lock() against the buffers we're trying to get read locks on. These inversions didn't exist before the fair read/writer locks, but now we need to be more careful. We papered over this deadlock in the past by changing btrfs_try_read_lock() to be a true trylock against both the spinlock and the blocking lock. This was slower, and not sufficient to fix all the deadlocks. This patch adds a btrfs_tree_read_lock_atomic(), which basically means get the spinlock but trylock on the blocking lock. Signed-off-by: Chris Mason Signed-off-by: Josef Bacik Reported-by: Patrick Schmid cc: stable@vger.kernel.org #v3.15+ --- fs/btrfs/ctree.c | 14 ++------------ fs/btrfs/locking.c | 24 +++++++++++++++++++++--- fs/btrfs/locking.h | 2 ++ 3 files changed, 25 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 19bc6162fb8e..150822ee0a0b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -80,13 +80,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, { int i; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* lockdep really cares that we take all of these spinlocks - * in the right order. If any of the locks in the path are not - * currently blocking, it is going to complain. So, make really - * really sure by forcing the path to blocking before we clear - * the path blocking. - */ if (held) { btrfs_set_lock_blocking_rw(held, held_rw); if (held_rw == BTRFS_WRITE_LOCK) @@ -95,7 +88,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, held_rw = BTRFS_READ_LOCK_BLOCKING; } btrfs_set_path_blocking(p); -#endif for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { if (p->nodes[i] && p->locks[i]) { @@ -107,10 +99,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, } } -#ifdef CONFIG_DEBUG_LOCK_ALLOC if (held) btrfs_clear_lock_blocking_rw(held, held_rw); -#endif } /* this also releases the path */ @@ -2893,7 +2883,7 @@ cow_done: } p->locks[level] = BTRFS_WRITE_LOCK; } else { - err = btrfs_try_tree_read_lock(b); + err = btrfs_tree_read_lock_atomic(b); if (!err) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); @@ -3025,7 +3015,7 @@ again: } level = btrfs_header_level(b); - err = btrfs_try_tree_read_lock(b); + err = btrfs_tree_read_lock_atomic(b); if (!err) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 5665d2149249..f8229ef1b46d 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -127,6 +127,26 @@ again: atomic_inc(&eb->spinning_readers); } +/* + * take a spinning read lock. + * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers + */ +int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) +{ + if (atomic_read(&eb->blocking_writers)) + return 0; + + read_lock(&eb->lock); + if (atomic_read(&eb->blocking_writers)) { + read_unlock(&eb->lock); + return 0; + } + atomic_inc(&eb->read_locks); + atomic_inc(&eb->spinning_readers); + return 1; +} + /* * returns 1 if we get the read lock and 0 if we don't * this won't wait for blocking writers @@ -158,9 +178,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) atomic_read(&eb->blocking_readers)) return 0; - if (!write_trylock(&eb->lock)) - return 0; - + write_lock(&eb->lock); if (atomic_read(&eb->blocking_writers) || atomic_read(&eb->blocking_readers)) { write_unlock(&eb->lock); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index b81e0e9a4894..c44a9d5f5362 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -35,6 +35,8 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); void btrfs_assert_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); +int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); + static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) { -- cgit v1.2.3 From ef94b1864d1ed5be54376404bb23d22ed0481feb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Nov 2014 16:39:59 +0100 Subject: ovl: rename filesystem type to "overlay" Some distributions carry an "old" format of overlayfs while mainline has a "new" format. The distros will possibly want to keep the old overlayfs alongside the new for compatibility reasons. To make it possible to differentiate the two versions change the name of the new one from "overlayfs" to "overlay". Signed-off-by: Miklos Szeredi Reported-by: Serge Hallyn Cc: Andy Whitcroft --- Documentation/filesystems/overlayfs.txt | 2 +- MAINTAINERS | 2 +- fs/Makefile | 2 +- fs/overlayfs/Kconfig | 2 +- fs/overlayfs/Makefile | 4 ++-- fs/overlayfs/super.c | 6 +++--- 6 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 530850a72735..a27c950ece61 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -64,7 +64,7 @@ is formed. At mount time, the two directories given as mount options "lowerdir" and "upperdir" are combined into a merged directory: - mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper,\ + mount -t overlay overlay -olowerdir=/lower,upperdir=/upper,\ workdir=/work /merged The "workdir" needs to be an empty directory on the same filesystem diff --git a/MAINTAINERS b/MAINTAINERS index c444907ccd69..af1e738e8772 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6888,7 +6888,7 @@ F: drivers/scsi/osd/ F: include/scsi/osd_* F: fs/exofs/ -OVERLAYFS FILESYSTEM +OVERLAY FILESYSTEM M: Miklos Szeredi L: linux-fsdevel@vger.kernel.org S: Supported diff --git a/fs/Makefile b/fs/Makefile index 34a1b9dea6dd..da0bbb456d3f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -104,7 +104,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ -obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/ +obj-$(CONFIG_OVERLAY_FS) += overlayfs/ obj-$(CONFIG_UDF_FS) += udf/ obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ obj-$(CONFIG_OMFS_FS) += omfs/ diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index e60125976873..34355818a2e0 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -1,4 +1,4 @@ -config OVERLAYFS_FS +config OVERLAY_FS tristate "Overlay filesystem support" help An overlay filesystem combines two filesystems - an 'upper' filesystem diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile index 8f91889480d0..900daed3e91d 100644 --- a/fs/overlayfs/Makefile +++ b/fs/overlayfs/Makefile @@ -2,6 +2,6 @@ # Makefile for the overlay filesystem. # -obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o +obj-$(CONFIG_OVERLAY_FS) += overlay.o -overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o +overlay-objs := super.o inode.o dir.o readdir.o copy_up.o diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 08b704cebfc4..b92bd1829cf7 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -24,7 +24,7 @@ MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Overlay filesystem"); MODULE_LICENSE("GPL"); -#define OVERLAYFS_SUPER_MAGIC 0x794c764f +#define OVERLAYFS_SUPER_MAGIC 0x794c7630 struct ovl_config { char *lowerdir; @@ -776,11 +776,11 @@ static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags, static struct file_system_type ovl_fs_type = { .owner = THIS_MODULE, - .name = "overlayfs", + .name = "overlay", .mount = ovl_mount, .kill_sb = kill_anon_super, }; -MODULE_ALIAS_FS("overlayfs"); +MODULE_ALIAS_FS("overlay"); static int __init ovl_init(void) { -- cgit v1.2.3 From a105d685a8483985a01776411de191a726b48132 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Nov 2014 16:39:59 +0100 Subject: ovl: fix remove/copy-up race ovl_remove_and_whiteout() needs to check if upper dentry exists or not after having locked upper parent directory. Previously we used a "type" value computed before locking the upper parent directory, which is susceptible to racing with copy-up. There's a similar check in ovl_check_empty_and_clear(). This one is not actually racy, since copy-up doesn't change the "emptyness" property of a directory. Add a comment to this effect, and check the existence of upper dentry locally to make the code cleaner. Signed-off-by: Miklos Szeredi --- fs/overlayfs/dir.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 15cd91ad9940..8ffc4b980f1b 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -284,8 +284,7 @@ out: return ERR_PTR(err); } -static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry, - enum ovl_path_type type) +static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry) { int err; struct dentry *ret = NULL; @@ -294,8 +293,17 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry, err = ovl_check_empty_dir(dentry, &list); if (err) ret = ERR_PTR(err); - else if (type == OVL_PATH_MERGE) - ret = ovl_clear_empty(dentry, &list); + else { + /* + * If no upperdentry then skip clearing whiteouts. + * + * Can race with copy-up, since we don't hold the upperdir + * mutex. Doesn't matter, since copy-up can't create a + * non-empty directory from an empty one. + */ + if (ovl_dentry_upper(dentry)) + ret = ovl_clear_empty(dentry, &list); + } ovl_cache_free(&list); @@ -487,8 +495,7 @@ out: return err; } -static int ovl_remove_and_whiteout(struct dentry *dentry, - enum ovl_path_type type, bool is_dir) +static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) { struct dentry *workdir = ovl_workdir(dentry); struct inode *wdir = workdir->d_inode; @@ -500,7 +507,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, int err; if (is_dir) { - opaquedir = ovl_check_empty_and_clear(dentry, type); + opaquedir = ovl_check_empty_and_clear(dentry); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) goto out; @@ -515,9 +522,10 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, if (IS_ERR(whiteout)) goto out_unlock; - if (type == OVL_PATH_LOWER) { + upper = ovl_dentry_upper(dentry); + if (!upper) { upper = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); + dentry->d_name.len); err = PTR_ERR(upper); if (IS_ERR(upper)) goto kill_whiteout; @@ -529,7 +537,6 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, } else { int flags = 0; - upper = ovl_dentry_upper(dentry); if (opaquedir) upper = opaquedir; err = -ESTALE; @@ -648,7 +655,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) cap_raise(override_cred->cap_effective, CAP_CHOWN); old_cred = override_creds(override_cred); - err = ovl_remove_and_whiteout(dentry, type, is_dir); + err = ovl_remove_and_whiteout(dentry, is_dir); revert_creds(old_cred); put_cred(override_cred); @@ -781,7 +788,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, } if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) { - opaquedir = ovl_check_empty_and_clear(new, new_type); + opaquedir = ovl_check_empty_and_clear(new); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) { opaquedir = NULL; -- cgit v1.2.3 From 521484639ec19a6f1ed56de6993feb255f5f676c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Nov 2014 16:40:00 +0100 Subject: ovl: fix race in private xattr checks Xattr operations can race with copy up. This does not matter as long as we consistently fiter out "trunsted.overlay.opaque" attribute on upper directories. Previously we checked parent against OVL_PATH_MERGE. This is too general, and prone to race with copy-up. I.e. we found the parent to be on the lower layer but ovl_dentry_real() would return the copied-up dentry, possibly with the "opaque" attribute. So instead use ovl_path_real() and decide to filter the attributes based on the actual type of the dentry we'll use. Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index af2d18c9fcee..07d74b24913b 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -235,26 +235,36 @@ out: return err; } +static bool ovl_need_xattr_filter(struct dentry *dentry, + enum ovl_path_type type) +{ + return type == OVL_PATH_UPPER && S_ISDIR(dentry->d_inode->i_mode); +} + ssize_t ovl_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { - if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && - ovl_is_private_xattr(name)) + struct path realpath; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); + + if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) return -ENODATA; - return vfs_getxattr(ovl_dentry_real(dentry), name, value, size); + return vfs_getxattr(realpath.dentry, name, value, size); } ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) { + struct path realpath; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); ssize_t res; int off; - res = vfs_listxattr(ovl_dentry_real(dentry), list, size); + res = vfs_listxattr(realpath.dentry, list, size); if (res <= 0 || size == 0) return res; - if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE) + if (!ovl_need_xattr_filter(dentry, type)) return res; /* filter out private xattrs */ @@ -279,17 +289,16 @@ int ovl_removexattr(struct dentry *dentry, const char *name) { int err; struct path realpath; - enum ovl_path_type type; + enum ovl_path_type type = ovl_path_real(dentry, &realpath); err = ovl_want_write(dentry); if (err) goto out; - if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && - ovl_is_private_xattr(name)) + err = -ENODATA; + if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name)) goto out_drop_write; - type = ovl_path_real(dentry, &realpath); if (type == OVL_PATH_LOWER) { err = vfs_getxattr(realpath.dentry, name, NULL, 0); if (err < 0) -- cgit v1.2.3 From 91c77947133f7aef851b625701e182d3f99d14a9 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Nov 2014 16:40:00 +0100 Subject: ovl: allow filenames with comma Allow option separator (comma) to be escaped with backslash. Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 48 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index b92bd1829cf7..eee7a62e1c0e 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -462,11 +462,34 @@ static const match_table_t ovl_tokens = { {OPT_ERR, NULL} }; +static char *ovl_next_opt(char **s) +{ + char *sbegin = *s; + char *p; + + if (sbegin == NULL) + return NULL; + + for (p = sbegin; *p; p++) { + if (*p == '\\') { + p++; + if (!*p) + break; + } else if (*p == ',') { + *p = '\0'; + *s = p + 1; + return sbegin; + } + } + *s = NULL; + return sbegin; +} + static int ovl_parse_opt(char *opt, struct ovl_config *config) { char *p; - while ((p = strsep(&opt, ",")) != NULL) { + while ((p = ovl_next_opt(&opt)) != NULL) { int token; substring_t args[MAX_OPT_ARGS]; @@ -554,15 +577,34 @@ out_dput: goto out_unlock; } +static void ovl_unescape(char *s) +{ + char *d = s; + + for (;; s++, d++) { + if (*s == '\\') + s++; + *d = *s; + if (!*s) + break; + } +} + static int ovl_mount_dir(const char *name, struct path *path) { int err; + char *tmp = kstrdup(name, GFP_KERNEL); + + if (!tmp) + return -ENOMEM; - err = kern_path(name, LOOKUP_FOLLOW, path); + ovl_unescape(tmp); + err = kern_path(tmp, LOOKUP_FOLLOW, path); if (err) { - pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); + pr_err("overlayfs: failed to resolve '%s': %i\n", tmp, err); err = -EINVAL; } + kfree(tmp); return err; } -- cgit v1.2.3 From 71d509280f7e92eb60ae6b7c78c20afafff060c7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Nov 2014 16:40:01 +0100 Subject: ovl: use lockless_dereference() for upperdentry Don't open code lockless_dereference() in ovl_upperdentry_dereference(). Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index eee7a62e1c0e..f16d318b71f8 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -84,12 +84,7 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) { - struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry); - /* - * Make sure to order reads to upperdentry wrt ovl_dentry_update() - */ - smp_read_barrier_depends(); - return upperdentry; + return lockless_dereference(oe->__upperdentry); } void ovl_path_upper(struct dentry *dentry, struct path *path) -- cgit v1.2.3 From c9f00fdb9ab3999cb2fb582ad82a5db9e70c82f5 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Nov 2014 16:40:01 +0100 Subject: ovl: pass dentry into ovl_dir_read_merged() Pass dentry into ovl_dir_read_merged() insted of upperpath and lowerpath. This cleans up callers and paves the way for multi-layer directory reads. Signed-off-by: Miklos Szeredi --- fs/overlayfs/readdir.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 2a7ef4f8e2a6..7299e962f334 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -274,11 +274,11 @@ static int ovl_dir_mark_whiteouts(struct dentry *dir, return 0; } -static inline int ovl_dir_read_merged(struct path *upperpath, - struct path *lowerpath, - struct list_head *list) +static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) { int err; + struct path lowerpath; + struct path upperpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .list = list, @@ -286,25 +286,28 @@ static inline int ovl_dir_read_merged(struct path *upperpath, .is_merge = false, }; - if (upperpath->dentry) { - err = ovl_dir_read(upperpath, &rdd); + ovl_path_lower(dentry, &lowerpath); + ovl_path_upper(dentry, &upperpath); + + if (upperpath.dentry) { + err = ovl_dir_read(&upperpath, &rdd); if (err) goto out; - if (lowerpath->dentry) { - err = ovl_dir_mark_whiteouts(upperpath->dentry, &rdd); + if (lowerpath.dentry) { + err = ovl_dir_mark_whiteouts(upperpath.dentry, &rdd); if (err) goto out; } } - if (lowerpath->dentry) { + if (lowerpath.dentry) { /* * Insert lowerpath entries before upperpath ones, this allows * offsets to be reasonably constant */ list_add(&rdd.middle, rdd.list); rdd.is_merge = true; - err = ovl_dir_read(lowerpath, &rdd); + err = ovl_dir_read(&lowerpath, &rdd); list_del(&rdd.middle); } out: @@ -329,8 +332,6 @@ static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) { int res; - struct path lowerpath; - struct path upperpath; struct ovl_dir_cache *cache; cache = ovl_dir_cache(dentry); @@ -347,10 +348,7 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) cache->refcount = 1; INIT_LIST_HEAD(&cache->entries); - ovl_path_lower(dentry, &lowerpath); - ovl_path_upper(dentry, &upperpath); - - res = ovl_dir_read_merged(&upperpath, &lowerpath, &cache->entries); + res = ovl_dir_read_merged(dentry, &cache->entries); if (res) { ovl_cache_free(&cache->entries); kfree(cache); @@ -538,14 +536,9 @@ const struct file_operations ovl_dir_operations = { int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) { int err; - struct path lowerpath; - struct path upperpath; struct ovl_cache_entry *p; - ovl_path_upper(dentry, &upperpath); - ovl_path_lower(dentry, &lowerpath); - - err = ovl_dir_read_merged(&upperpath, &lowerpath, list); + err = ovl_dir_read_merged(dentry, list); if (err) return err; -- cgit v1.2.3 From 7676895f4736421ebafc48de5078e25ea69e88ee Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Nov 2014 16:40:02 +0100 Subject: ovl: ovl_dir_fsync() cleanup Check against !OVL_PATH_LOWER instead of OVL_PATH_MERGE. For a copied up directory the two are currently equivalent. Signed-off-by: Miklos Szeredi --- fs/overlayfs/readdir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 7299e962f334..ab1e3dcbed95 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -450,10 +450,10 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, /* * Need to check if we started out being a lower dir, but got copied up */ - if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) { + if (!od->is_upper && ovl_path_type(dentry) != OVL_PATH_LOWER) { struct inode *inode = file_inode(file); - realfile =lockless_dereference(od->upperfile); + realfile = lockless_dereference(od->upperfile); if (!realfile) { struct path upperpath; -- cgit v1.2.3