From d1d84c9626bb3a519863b3ffc40d347166f9fb83 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 21 Aug 2014 15:04:31 -0400
Subject: nfsd4: fix response size estimation for OP_SEQUENCE

We added this new estimator function but forgot to hook it up.  The
effect is that NFSv4.1 (and greater) won't do zero-copy reads.

The estimate was also wrong by 8 bytes.

Fixes: ccae70a9ee41 "nfsd4: estimate sequence response size"
Cc: stable@vger.kernel.org
Reported-by: Chuck Lever <chucklever@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index cdeb3cfd6f32..f4bd578bed55 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1589,7 +1589,8 @@ static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op
 static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
 				       struct nfsd4_op *op)
 {
-	return NFS4_MAX_SESSIONID_LEN + 20;
+	return (op_encode_hdr_size
+		+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32);
 }
 
 static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
@@ -1893,6 +1894,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_func = (nfsd4op_func)nfsd4_sequence,
 		.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
 		.op_name = "OP_SEQUENCE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_sequence_rsize,
 	},
 	[OP_DESTROY_CLIENTID] = {
 		.op_func = (nfsd4op_func)nfsd4_destroy_clientid,
-- 
cgit v1.2.3


From b744c2ac4bbc040794efb33207d6ebc14f88ea2e Mon Sep 17 00:00:00 2001
From: Robert Elliott <elliott@hp.com>
Date: Tue, 21 Oct 2014 13:55:09 -0600
Subject: fs: merge I/O error prints into one line

buffer.c uses two printk calls to print these messages:
[67353.422338] Buffer I/O error on device sdr, logical block 212868488
[67353.422338] lost page write due to I/O error on sdr

In a busy system, they may be interleaved with other prints,
losing the context for the second message.  Merge them into
one line with one printk call so the prints are atomic.

Also, differentiate between async page writes, sync page writes, and
async page reads.

Also, shorten "device" to "dev" to match the block layer prints:
[67353.467906] blk_update_request: critical target error, dev sdr, sector
1707107328

Also, use %llu rather than %Lu.

Resulting prints look like:
[ 1356.437006] blk_update_request: critical target error, dev sdr, sector 1719693992
[ 1361.383522] quiet_error: 659876 callbacks suppressed
[ 1361.385816] Buffer I/O error on dev sdr, logical block 256902912, lost async page write
[ 1361.385819] Buffer I/O error on dev sdr, logical block 256903644, lost async page write

Signed-off-by: Robert Elliott <elliott@hp.com>
Reviewed-by: Webb Scales <webbnh@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/buffer.c | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 6c48f20eddd4..9d1da1d314a2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -137,12 +137,12 @@ static int quiet_error(struct buffer_head *bh)
 }
 
 
-static void buffer_io_error(struct buffer_head *bh)
+static void buffer_io_error(struct buffer_head *bh, char *msg)
 {
 	char b[BDEVNAME_SIZE];
-	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
+	printk(KERN_ERR "Buffer I/O error on dev %s, logical block %llu%s\n",
 			bdevname(bh->b_bdev, b),
-			(unsigned long long)bh->b_blocknr);
+			(unsigned long long)bh->b_blocknr, msg);
 }
 
 /*
@@ -177,17 +177,11 @@ EXPORT_SYMBOL(end_buffer_read_sync);
 
 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
-	char b[BDEVNAME_SIZE];
-
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!quiet_error(bh)) {
-			buffer_io_error(bh);
-			printk(KERN_WARNING "lost page write due to "
-					"I/O error on %s\n",
-				       bdevname(bh->b_bdev, b));
-		}
+		if (!quiet_error(bh))
+			buffer_io_error(bh, ", lost sync page write");
 		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
 	}
@@ -305,7 +299,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	} else {
 		clear_buffer_uptodate(bh);
 		if (!quiet_error(bh))
-			buffer_io_error(bh);
+			buffer_io_error(bh, ", async page read");
 		SetPageError(page);
 	}
 
@@ -353,7 +347,6 @@ still_busy:
  */
 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
-	char b[BDEVNAME_SIZE];
 	unsigned long flags;
 	struct buffer_head *first;
 	struct buffer_head *tmp;
@@ -365,12 +358,8 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!quiet_error(bh)) {
-			buffer_io_error(bh);
-			printk(KERN_WARNING "lost page write due to "
-					"I/O error on %s\n",
-			       bdevname(bh->b_bdev, b));
-		}
+		if (!quiet_error(bh))
+			buffer_io_error(bh, ", lost async page write");
 		set_bit(AS_EIO, &page->mapping->flags);
 		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
-- 
cgit v1.2.3


From 432f16e64f50fd4999a476543d04dd52f7a2d753 Mon Sep 17 00:00:00 2001
From: Robert Elliott <elliott@hp.com>
Date: Tue, 21 Oct 2014 13:55:11 -0600
Subject: fs: clarify rate limit suppressed buffer I/O errors

When quiet_error applies rate limiting to buffer_io_error calls, what the
they apply to is unclear because the name is so generic, particularly
if the messages are interleaved with others:

[ 1936.063572] quiet_error: 664293 callbacks suppressed
[ 1936.065297] Buffer I/O error on dev sdr, logical block 257429952, lost async page write
[ 1936.067814] Buffer I/O error on dev sdr, logical block 257429953, lost async page write

Also, the function uses printk_ratelimit(), although printk.h includes a
comment advising "Please don't use... Instead use printk_ratelimited()."

Change buffer_io_error to check the BH_Quiet bit itself, drop the
printk_ratelimit call, and print using printk_ratelimited.

This makes the messages look like:

[  387.208839] buffer_io_error: 676394 callbacks suppressed
[  387.210693] Buffer I/O error on dev sdr, logical block 211291776, lost async page write
[  387.213432] Buffer I/O error on dev sdr, logical block 211291777, lost async page write

Signed-off-by: Robert Elliott <elliott@hp.com>
Reviewed-by: Webb Scales <webbnh@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/buffer.c | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 9d1da1d314a2..20805db2c987 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -128,19 +128,13 @@ __clear_page_buffers(struct page *page)
 	page_cache_release(page);
 }
 
-
-static int quiet_error(struct buffer_head *bh)
-{
-	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
-		return 0;
-	return 1;
-}
-
-
 static void buffer_io_error(struct buffer_head *bh, char *msg)
 {
 	char b[BDEVNAME_SIZE];
-	printk(KERN_ERR "Buffer I/O error on dev %s, logical block %llu%s\n",
+
+	if (!test_bit(BH_Quiet, &bh->b_state))
+		printk_ratelimited(KERN_ERR
+			"Buffer I/O error on dev %s, logical block %llu%s\n",
 			bdevname(bh->b_bdev, b),
 			(unsigned long long)bh->b_blocknr, msg);
 }
@@ -180,8 +174,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!quiet_error(bh))
-			buffer_io_error(bh, ", lost sync page write");
+		buffer_io_error(bh, ", lost sync page write");
 		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
 	}
@@ -298,8 +291,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		set_buffer_uptodate(bh);
 	} else {
 		clear_buffer_uptodate(bh);
-		if (!quiet_error(bh))
-			buffer_io_error(bh, ", async page read");
+		buffer_io_error(bh, ", async page read");
 		SetPageError(page);
 	}
 
@@ -358,8 +350,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		if (!quiet_error(bh))
-			buffer_io_error(bh, ", lost async page write");
+		buffer_io_error(bh, ", lost async page write");
 		set_bit(AS_EIO, &page->mapping->flags);
 		set_buffer_write_io_error(bh);
 		clear_buffer_uptodate(bh);
-- 
cgit v1.2.3


From 7938db449bbc55bbeb164bec7af406212e7e98f1 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 16 Sep 2014 22:23:10 +0200
Subject: ext3: Don't check quota format when there are no quota files

The check whether quota format is set even though there are no
quota files with journalled quota is pointless and it actually
makes it impossible to turn off journalled quotas (as there's
no way to unset journalled quota format). Just remove the check.

CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/super.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 7015db0bafd1..eb742d0e67ff 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1354,13 +1354,6 @@ set_qf_format:
 					"not specified.");
 			return 0;
 		}
-	} else {
-		if (sbi->s_jquota_fmt) {
-			ext3_msg(sb, KERN_ERR, "error: journaled quota format "
-					"specified with no journaling "
-					"enabled.");
-			return 0;
-		}
 	}
 #endif
 	return 1;
-- 
cgit v1.2.3


From 474d2605d119479e5aa050f738632e63589d4bb5 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 22 Oct 2014 09:06:49 +0200
Subject: quota: Properly return errors from dquot_writeback_dquots()

Due to a switched left and right side of an assignment,
dquot_writeback_dquots() never returned error. This could result in
errors during quota writeback to not be reported to userspace properly.
Fix it.

CC: stable@vger.kernel.org
Coverity-id: 1226884
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/quota/dquot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 8b663b2d9562..6b4527216a7f 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -634,7 +634,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
 			dqstats_inc(DQST_LOOKUPS);
 			err = sb->dq_op->write_dquot(dquot);
 			if (!ret && err)
-				err = ret;
+				ret = err;
 			dqput(dquot);
 			spin_lock(&dq_list_lock);
 		}
-- 
cgit v1.2.3


From 3c9cafe05ff002eb84d438a02f3c8d468720463b Mon Sep 17 00:00:00 2001
From: Sasha Levin <sasha.levin@oracle.com>
Date: Tue, 21 Oct 2014 16:43:55 -0400
Subject: fs, jbd: use a more generic hash function

While the hash function used by the revoke hashtable is good somewhere else,
it's not really good here.

The default hash shift (8) means that one third of the hashing function
gets lost (and is undefined anyways (8 - 12 = negative shift)):

	"(block << (hash_shift - 12))) & (table->hash_size - 1)"

Instead, just use the kernel's generic hash function that gets used everywhere
else.

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/jbd/revoke.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 8898bbd2b61e..dcead636c33b 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -93,6 +93,7 @@
 #include <linux/bio.h>
 #endif
 #include <linux/log2.h>
+#include <linux/hash.h>
 
 static struct kmem_cache *revoke_record_cache;
 static struct kmem_cache *revoke_table_cache;
@@ -129,15 +130,11 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
 
 /* Utility functions to maintain the revoke table */
 
-/* Borrowed from buffer.c: this is a tried and tested block hash function */
 static inline int hash(journal_t *journal, unsigned int block)
 {
 	struct jbd_revoke_table_s *table = journal->j_revoke;
-	int hash_shift = table->hash_shift;
 
-	return ((block << (hash_shift - 6)) ^
-		(block >> 13) ^
-		(block << (hash_shift - 12))) & (table->hash_size - 1);
+	return hash_32(block, table->hash_shift);
 }
 
 static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
-- 
cgit v1.2.3


From 51904b08072a8bf2b9ed74d1bd7a5300a614471d Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 22 Oct 2014 14:46:29 -0400
Subject: nfsd4: fix crash on unknown operation number

Unknown operation numbers are caught in nfsd4_decode_compound() which
sets op->opnum to OP_ILLEGAL and op->status to nfserr_op_illegal.  The
error causes the main loop in nfsd4_proc_compound() to skip most
processing.  But nfsd4_proc_compound also peeks ahead at the next
operation in one case and doesn't take similar precautions there.

Cc: stable@vger.kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index f4bd578bed55..0beb023f25ac 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1272,7 +1272,8 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp)
 	 */
 	if (argp->opcnt == resp->opcnt)
 		return false;
-
+	if (next->opnum == OP_ILLEGAL)
+		return false;
 	nextd = OPDESC(next);
 	/*
 	 * Rest of 2.6.3.1.1: certain operations will return WRONGSEC
-- 
cgit v1.2.3


From 21e7626b12f25770e2975bc7c7b2e1d5b1d58a57 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Mon, 27 Oct 2014 13:52:21 +0100
Subject: btrfs: use macro accessors in superblock validation checks

The initial patch c926093ec516f5d316 (btrfs: add more superblock checks)
did not properly use the macro accessors that wrap endianness and the
code would not work correctly on big endian machines.

Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/disk-io.c | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2409718e3f20..1ae1661ba14c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3817,19 +3817,19 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	struct btrfs_super_block *sb = fs_info->super_copy;
 	int ret = 0;
 
-	if (sb->root_level > BTRFS_MAX_LEVEL) {
-		printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n",
-				sb->root_level, BTRFS_MAX_LEVEL);
+	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",
+				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
 		ret = -EINVAL;
 	}
-	if (sb->chunk_root_level > BTRFS_MAX_LEVEL) {
-		printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n",
-				sb->chunk_root_level, BTRFS_MAX_LEVEL);
+	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: chunk_root level too big: %d >= %d\n",
+				btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
 		ret = -EINVAL;
 	}
-	if (sb->log_root_level > BTRFS_MAX_LEVEL) {
-		printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n",
-				sb->log_root_level, BTRFS_MAX_LEVEL);
+	if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: log_root level too big: %d >= %d\n",
+				btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
 		ret = -EINVAL;
 	}
 
@@ -3837,15 +3837,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	 * The common minimum, we don't know if we can trust the nodesize/sectorsize
 	 * items yet, they'll be verified later. Issue just a warning.
 	 */
-	if (!IS_ALIGNED(sb->root, 4096))
+	if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
 		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
 				sb->root);
-	if (!IS_ALIGNED(sb->chunk_root, 4096))
+	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
 		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
 				sb->chunk_root);
-	if (!IS_ALIGNED(sb->log_root, 4096))
+	if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
 		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
-				sb->log_root);
+				btrfs_super_log_root(sb));
 
 	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
 		printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
@@ -3857,13 +3857,13 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
 	 * done later
 	 */
-	if (sb->num_devices > (1UL << 31))
+	if (btrfs_super_num_devices(sb) > (1UL << 31))
 		printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
-				sb->num_devices);
+				btrfs_super_num_devices(sb));
 
-	if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) {
+	if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
 		printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
-				sb->bytenr, BTRFS_SUPER_INFO_OFFSET);
+				btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
 		ret = -EINVAL;
 	}
 
@@ -3871,14 +3871,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	 * The generation is a global counter, we'll trust it more than the others
 	 * but it's still possible that it's the one that's wrong.
 	 */
-	if (sb->generation < sb->chunk_root_generation)
+	if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
 		printk(KERN_WARNING
 			"BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
-			sb->generation, sb->chunk_root_generation);
-	if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1)
+			btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb));
+	if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
+	    && btrfs_super_cache_generation(sb) != (u64)-1)
 		printk(KERN_WARNING
 			"BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
-			sb->generation, sb->cache_generation);
+			btrfs_super_generation(sb), btrfs_super_cache_generation(sb));
 
 	return ret;
 }
-- 
cgit v1.2.3


From 1a4ed8fdca077d2489ec47d548451be69389e926 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 27 Oct 2014 10:44:24 +0000
Subject: Btrfs: fix invalid leaf slot access in btrfs_lookup_extent()

If we couldn't find our extent item, we accessed the current slot
(path->slots[0]) to check if it corresponds to an equivalent skinny
metadata item. However this slot could be beyond our last item in the
leaf (i.e. path->slots[0] >= btrfs_header_nritems(leaf)), in which case
we shouldn't process it.

Since btrfs_lookup_extent() is only used to find extent items for data
extents, fix this by removing completely the logic that looks up for an
equivalent skinny metadata item, since it can not exist.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ctree.h       |  2 +-
 fs/btrfs/extent-tree.c | 10 ++--------
 fs/btrfs/tree-log.c    |  2 +-
 3 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d557264ee974..fe69edda11fb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3276,7 +3276,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_async_run_delayed_refs(struct btrfs_root *root,
 				 unsigned long count, int wait);
-int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 bytenr,
 			     u64 offset, int metadata, u64 *refs, u64 *flags);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0d599ba1aaed..87c0b46f8a7e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -710,8 +710,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
 	rcu_read_unlock();
 }
 
-/* simple helper to search for an existing extent at a given offset */
-int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
+/* simple helper to search for an existing data extent at a given offset */
+int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
 {
 	int ret;
 	struct btrfs_key key;
@@ -726,12 +726,6 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 	key.type = BTRFS_EXTENT_ITEM_KEY;
 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 				0, 0);
-	if (ret > 0) {
-		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
-		if (key.objectid == start &&
-		    key.type == BTRFS_METADATA_ITEM_KEY)
-			ret = 0;
-	}
 	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2b26dad35d88..6d58d72705ae 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -672,7 +672,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 			 * is this extent already allocated in the extent
 			 * allocation tree?  If so, just add a reference
 			 */
-			ret = btrfs_lookup_extent(root, ins.objectid,
+			ret = btrfs_lookup_data_extent(root, ins.objectid,
 						ins.offset);
 			if (ret == 0) {
 				ret = btrfs_inc_extent_ref(trans, root,
-- 
cgit v1.2.3


From 5ed5f5884116e3841da626d201ef068f23232a3a Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Wed, 15 Oct 2014 17:19:59 -0400
Subject: Btrfs: properly clean up btrfs_end_io_wq_cache

In one of Dave's cleanup commits he forgot to call btrfs_end_io_wq_exit on
unload, which makes us unable to unload and then re-load the btrfs module.  This
fixes the problem.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: David Sterba <dsterba@suse.cz>
Reviewed-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index a2b97ef10317..54bd91ece35b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2151,6 +2151,7 @@ static void __exit exit_btrfs_fs(void)
 	extent_map_exit();
 	extent_io_exit();
 	btrfs_interface_exit();
+	btrfs_end_io_wq_exit();
 	unregister_filesystem(&btrfs_fs_type);
 	btrfs_exit_sysfs();
 	btrfs_cleanup_fs_uuids();
-- 
cgit v1.2.3


From d05a2b4cd97071462e77e6a7a8f109c36307182a Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 27 Oct 2014 09:19:52 +0000
Subject: Btrfs: fix race that makes btrfs_lookup_extent_info miss skinny
 extent items

We have a race that can lead us to miss skinny extent items in the function
btrfs_lookup_extent_info() when the skinny metadata feature is enabled.
So basically the sequence of steps is:

1) We search in the extent tree for the skinny extent, which returns > 0
   (not found);

2) We check the previous item in the returned leaf for a non-skinny extent,
   and we don't find it;

3) Because we didn't find the non-skinny extent in step 2), we release our
   path to search the extent tree again, but this time for a non-skinny
   extent key;

4) Right after we released our path in step 3), a skinny extent was inserted
   in the extent tree (delayed refs were run) - our second extent tree search
   will miss it, because it's not looking for a skinny extent;

5) After the second search returned (with ret > 0), we look for any delayed
   ref for our extent's bytenr (and we do it while holding a read lock on the
   leaf), but we won't find any, as such delayed ref had just run and completed
   after we released out path in step 3) before doing the second search.

Fix this by removing completely the path release and re-search logic. This is
safe, because if we seach for a metadata item and we don't find it, we have the
guarantee that the returned leaf is the one where the item would be inserted,
and so path->slots[0] > 0 and path->slots[0] - 1 must be the slot where the
non-skinny extent item is if it exists. The only case where path->slots[0] is
zero is when there are no smaller keys in the tree (i.e. no left siblings for
our leaf), in which case the re-search logic isn't needed as well.

This race has been present since the introduction of skinny metadata (change
3173a18f70554fe7880bb2d85c7da566e364eb3c).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/extent-tree.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 87c0b46f8a7e..a84e00da14f1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -780,7 +780,6 @@ search_again:
 	else
 		key.type = BTRFS_EXTENT_ITEM_KEY;
 
-again:
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
 				&key, path, 0, 0);
 	if (ret < 0)
@@ -796,13 +795,6 @@ again:
 			    key.offset == root->nodesize)
 				ret = 0;
 		}
-		if (ret) {
-			key.objectid = bytenr;
-			key.type = BTRFS_EXTENT_ITEM_KEY;
-			key.offset = root->nodesize;
-			btrfs_release_path(path);
-			goto again;
-		}
 	}
 
 	if (ret == 0) {
-- 
cgit v1.2.3


From a6bbce54efa9145dbcf3029c885549f7ebc40a3b Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 29 Oct 2014 08:22:18 +1100
Subject: xfs: bulkstat doesn't release AGI buffer on error

The recent refactoring of the bulkstat code left a small landmine in
the code. If a inobt read fails, then the tree walk is aborted and
returns without releasing the AGI buffer or freeing the cursor. This
can lead to a subsequent bulkstat call hanging trying to grab the
AGI buffer again.

cc: <stable@vger.kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_itable.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f1deb961a296..ef8ea0589780 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -427,7 +427,7 @@ xfs_bulkstat(
 
 			error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
 			if (error)
-				break;
+				goto del_cursor;
 			if (icount) {
 				irbp->ir_startino = r.ir_startino;
 				irbp->ir_freecount = r.ir_freecount;
@@ -442,7 +442,7 @@ xfs_bulkstat(
 			error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
 		}
 		if (error)
-			break;
+			goto del_cursor;
 
 		/*
 		 * Loop through inode btree records in this ag,
@@ -454,7 +454,7 @@ xfs_bulkstat(
 			error = xfs_inobt_get_rec(cur, &r, &i);
 			if (error || i == 0) {
 				end_of_ag = 1;
-				break;
+				goto del_cursor;
 			}
 
 			/*
@@ -476,13 +476,17 @@ xfs_bulkstat(
 			error = xfs_btree_increment(cur, 0, &tmp);
 			cond_resched();
 		}
+
 		/*
-		 * Drop the btree buffers and the agi buffer.
-		 * We can't hold any of the locks these represent
-		 * when calling iget.
+		 * Drop the btree buffers and the agi buffer as we can't hold any
+		 * of the locks these represent when calling iget. If there is a
+		 * pending error, then we are done.
 		 */
+del_cursor:
 		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 		xfs_buf_relse(agbp);
+		if (error)
+			break;
 		/*
 		 * Now format all the good inodes into the user's buffer.
 		 */
-- 
cgit v1.2.3


From 6424babfd68dd8a83d9c60a5242d27038856599f Mon Sep 17 00:00:00 2001
From: Jerry Hoemann <jerry.hoemann@hp.com>
Date: Wed, 29 Oct 2014 14:50:22 -0700
Subject: fsnotify: next_i is freed during fsnotify_unmount_inodes.

During file system stress testing on 3.10 and 3.12 based kernels, the
umount command occasionally hung in fsnotify_unmount_inodes in the
section of code:

                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

As this section of code holds the global inode_sb_list_lock, eventually
the system hangs trying to acquire the lock.

Multiple crash dumps showed:

The inode->i_state == 0x60 and i_count == 0 and i_sb_list would point
back at itself.  As this is not the value of list upon entry to the
function, the kernel never exits the loop.

To help narrow down problem, the call to list_del_init in
inode_sb_list_del was changed to list_del.  This poisons the pointers in
the i_sb_list and causes a kernel to panic if it transverse a freed
inode.

Subsequent stress testing paniced in fsnotify_unmount_inodes at the
bottom of the list_for_each_entry_safe loop showing next_i had become
free.

We believe the root cause of the problem is that next_i is being freed
during the window of time that the list_for_each_entry_safe loop
temporarily releases inode_sb_list_lock to call fsnotify and
fsnotify_inode_delete.

The code in fsnotify_unmount_inodes attempts to prevent the freeing of
inode and next_i by calling __iget.  However, the code doesn't do the
__iget call on next_i

	if i_count == 0 or
	if i_state & (I_FREEING | I_WILL_FREE)

The patch addresses this issue by advancing next_i in the above two cases
until we either find a next_i which we can __iget or we reach the end of
the list.  This makes the handling of next_i more closely match the
handling of the variable "inode."

The time to reproduce the hang is highly variable (from hours to days.) We
ran the stress test on a 3.10 kernel with the proposed patch for a week
without failure.

During list_for_each_entry_safe, next_i is becoming free causing
the loop to never terminate.  Advance next_i in those cases where
__iget is not done.

Signed-off-by: Jerry Hoemann <jerry.hoemann@hp.com>
Cc: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Cc: Ken Helias <kenhelias@firemail.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/inode_mark.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 9ce062218de9..e8497144b323 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -288,20 +288,25 @@ void fsnotify_unmount_inodes(struct list_head *list)
 		spin_unlock(&inode->i_lock);
 
 		/* In case the dropping of a reference would nuke next_i. */
-		if ((&next_i->i_sb_list != list) &&
-		    atomic_read(&next_i->i_count)) {
+		while (&next_i->i_sb_list != list) {
 			spin_lock(&next_i->i_lock);
-			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
+			if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
+						atomic_read(&next_i->i_count)) {
 				__iget(next_i);
 				need_iput = next_i;
+				spin_unlock(&next_i->i_lock);
+				break;
 			}
 			spin_unlock(&next_i->i_lock);
+			next_i = list_entry(next_i->i_sb_list.next,
+						struct inode, i_sb_list);
 		}
 
 		/*
-		 * We can safely drop inode_sb_list_lock here because we hold
-		 * references on both inode and next_i.  Also no new inodes
-		 * will be added since the umount has begun.
+		 * We can safely drop inode_sb_list_lock here because either
+		 * we actually hold references on both inode and next_i or
+		 * end of list.  Also no new inodes will be added since the
+		 * umount has begun.
 		 */
 		spin_unlock(&inode_sb_list_lock);
 
-- 
cgit v1.2.3


From d3556babd7facb8fbc596bada0d67139e3b22330 Mon Sep 17 00:00:00 2001
From: Richard Weinberger <richard@nod.at>
Date: Wed, 29 Oct 2014 14:50:53 -0700
Subject: ocfs2: fix d_splice_alias() return code checking

d_splice_alias() can return a valid dentry, NULL or an ERR_PTR.
Currently the code checks not for ERR_PTR and will cuase an oops in
ocfs2_dentry_attach_lock().  Fix this by using IS_ERR_OR_NULL().

Signed-off-by: Richard Weinberger <richard@nod.at>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/ocfs2/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8add6f1030d7..b931e04e3388 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -158,7 +158,7 @@ bail_add:
 		 * NOTE: This dentry already has ->d_op set from
 		 * ocfs2_get_parent() and ocfs2_get_dentry()
 		 */
-		if (ret)
+		if (!IS_ERR_OR_NULL(ret))
 			dentry = ret;
 
 		status = ocfs2_dentry_attach_lock(dentry, inode,
-- 
cgit v1.2.3


From 7a19dee116c8fae7ba7a778043c245194289f5a2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Oct 2014 10:34:52 +1100
Subject: xfs: Check error during inode btree iteration in xfs_bulkstat()

xfs_bulkstat() doesn't check error return from xfs_btree_increment(). In
case of specific fs corruption that could result in xfs_bulkstat()
entering an infinite loop because we would be looping over the same
chunk over and over again. Fix the problem by checking the return value
and terminating the loop properly.

Coverity-id: 1231338
cc: <stable@vger.kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jie Liu <jeff.u.liu@gmail.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_itable.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index ef8ea0589780..7765ff743e91 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -474,6 +474,10 @@ xfs_bulkstat(
 			 */
 			agino = r.ir_startino + XFS_INODES_PER_CHUNK;
 			error = xfs_btree_increment(cur, 0, &tmp);
+			if (error) {
+				end_of_ag = 1;
+				goto del_cursor;
+			}
 			cond_resched();
 		}
 
-- 
cgit v1.2.3


From 5d11fb4b9a1d90983452c029b5e1377af78fda49 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Thu, 30 Oct 2014 10:35:11 +1100
Subject: xfs: rework zero range to prevent invalid i_size updates

The zero range operation is analogous to fallocate with the exception of
converting the range to zeroes. E.g., it attempts to allocate zeroed
blocks over the range specified by the caller. The XFS implementation
kills all delalloc blocks currently over the aligned range, converts the
range to allocated zero blocks (unwritten extents) and handles the
partial pages at the ends of the range by sending writes through the
pagecache.

The current implementation suffers from several problems associated with
inode size. If the aligned range covers an extending I/O, said I/O is
discarded and an inode size update from a previous write never makes it
to disk. Further, if an unaligned zero range extends beyond eof, the
page write induced for the partial end page can itself increase the
inode size, even if the zero range request is not supposed to update
i_size (via KEEP_SIZE, similar to an fallocate beyond EOF).

The latter behavior not only incorrectly increases the inode size, but
can lead to stray delalloc blocks on the inode. Typically, post-eof
preallocation blocks are either truncated on release or inode eviction
or explicitly written to by xfs_zero_eof() on natural file size
extension. If the inode size increases due to zero range, however,
associated blocks leak into the address space having never been
converted or mapped to pagecache pages. A direct I/O to such an
uncovered range cannot convert the extent via writeback and will BUG().
For example:

$ xfs_io -fc "pwrite 0 128k" -c "fzero -k 1m 54321" <file>
...
$ xfs_io -d -c "pread 128k 128k" <file>
<BUG>

If the entire delalloc extent happens to not have page coverage
whatsoever (e.g., delalloc conversion couldn't find a large enough free
space extent), even a full file writeback won't convert what's left of
the extent and we'll assert on inode eviction.

Rework xfs_zero_file_space() to avoid buffered I/O for partial pages.
Use the existing hole punch and prealloc mechanisms as primitives for
zero range. This implementation is not efficient nor ideal as we
writeback dirty data over the range and remove existing extents rather
than convert to unwrittern. The former writeback, however, is currently
the only mechanism available to ensure consistency between pagecache and
extent state. Even a pagecache truncate/delalloc punch prior to hole
punch has lead to inconsistencies due to racing with writeback.

This provides a consistent, correct implementation of zero range that
survives fsstress/fsx testing without assert failures. The
implementation can be optimized from this point forward once the
fundamental issue of pagecache and delalloc extent state consistency is
addressed.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_bmap_util.c | 72 ++++++++++++++------------------------------------
 1 file changed, 20 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 92e8f99a5857..281002689d64 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1338,7 +1338,10 @@ xfs_free_file_space(
 	goto out;
 }
 
-
+/*
+ * Preallocate and zero a range of a file. This mechanism has the allocation
+ * semantics of fallocate and in addition converts data in the range to zeroes.
+ */
 int
 xfs_zero_file_space(
 	struct xfs_inode	*ip,
@@ -1346,65 +1349,30 @@ xfs_zero_file_space(
 	xfs_off_t		len)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	uint			granularity;
-	xfs_off_t		start_boundary;
-	xfs_off_t		end_boundary;
+	uint			blksize;
 	int			error;
 
 	trace_xfs_zero_file_space(ip);
 
-	granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+	blksize = 1 << mp->m_sb.sb_blocklog;
 
 	/*
-	 * Round the range of extents we are going to convert inwards.  If the
-	 * offset is aligned, then it doesn't get changed so we zero from the
-	 * start of the block offset points to.
+	 * Punch a hole and prealloc the range. We use hole punch rather than
+	 * unwritten extent conversion for two reasons:
+	 *
+	 * 1.) Hole punch handles partial block zeroing for us.
+	 *
+	 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued
+	 * by virtue of the hole punch.
 	 */
-	start_boundary = round_up(offset, granularity);
-	end_boundary = round_down(offset + len, granularity);
-
-	ASSERT(start_boundary >= offset);
-	ASSERT(end_boundary <= offset + len);
-
-	if (start_boundary < end_boundary - 1) {
-		/*
-		 * Writeback the range to ensure any inode size updates due to
-		 * appending writes make it to disk (otherwise we could just
-		 * punch out the delalloc blocks).
-		 */
-		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-				start_boundary, end_boundary - 1);
-		if (error)
-			goto out;
-		truncate_pagecache_range(VFS_I(ip), start_boundary,
-					 end_boundary - 1);
-
-		/* convert the blocks */
-		error = xfs_alloc_file_space(ip, start_boundary,
-					end_boundary - start_boundary - 1,
-					XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT);
-		if (error)
-			goto out;
-
-		/* We've handled the interior of the range, now for the edges */
-		if (start_boundary != offset) {
-			error = xfs_iozero(ip, offset, start_boundary - offset);
-			if (error)
-				goto out;
-		}
-
-		if (end_boundary != offset + len)
-			error = xfs_iozero(ip, end_boundary,
-					   offset + len - end_boundary);
-
-	} else {
-		/*
-		 * It's either a sub-granularity range or the range spanned lies
-		 * partially across two adjacent blocks.
-		 */
-		error = xfs_iozero(ip, offset, len);
-	}
+	error = xfs_free_file_space(ip, offset, len);
+	if (error)
+		goto out;
 
+	error = xfs_alloc_file_space(ip, round_down(offset, blksize),
+				     round_up(offset + len, blksize) -
+				     round_down(offset, blksize),
+				     XFS_BMAPI_PREALLOC);
 out:
 	return error;
 
-- 
cgit v1.2.3


From 9378c6768e4fca48971e7b6a9075bc006eda981d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Oct 2014 10:52:57 -0400
Subject: ext4: fix overflow when updating superblock backups after resize

When there are no meta block groups update_backups() will compute the
backup block in 32-bit arithmetics thus possibly overflowing the block
number and corrupting the filesystem. OTOH filesystems without meta
block groups larger than 16 TB should be rare. Fix the problem by doing
the counting in 64-bit arithmetics.

Coverity-id: 741252
CC: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/resize.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index f298c60f907d..ca4588388fc3 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1081,7 +1081,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
 			break;
 
 		if (meta_bg == 0)
-			backup_block = group * bpg + blk_off;
+			backup_block = ((ext4_fsblk_t)group) * bpg + blk_off;
 		else
 			backup_block = (ext4_group_first_block_no(sb, group) +
 					ext4_bg_has_super(sb, group));
-- 
cgit v1.2.3


From 599a9b77ab289d85c2d5c8607624efbe1f552b0f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Oct 2014 10:53:16 -0400
Subject: ext4: fix oops when loading block bitmap failed

When we fail to load block bitmap in __ext4_new_inode() we will
dereference NULL pointer in ext4_journal_get_write_access(). So check
for error from ext4_read_block_bitmap().

Coverity-id: 989065
Cc: stable@vger.kernel.org
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 8012a5daf401..ac644c31ca67 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -887,6 +887,10 @@ got:
 		struct buffer_head *block_bitmap_bh;
 
 		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+		if (!block_bitmap_bh) {
+			err = -EIO;
+			goto out;
+		}
 		BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
 		err = ext4_journal_get_write_access(handle, block_bitmap_bh);
 		if (err) {
-- 
cgit v1.2.3


From 98c1a7593fa355fda7f5a5940c8bf5326ca964ba Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 30 Oct 2014 10:53:16 -0400
Subject: ext4: enable journal checksum when metadata checksum feature enabled

If metadata checksumming is turned on for the FS, we need to tell the
journal to use checksumming too.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/super.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1eda6ab0ef9d..5c11e2153e36 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3526,6 +3526,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	set_opt(sb, POSIX_ACL);
 #endif
+	/* don't forget to enable journal_csum when metadata_csum is enabled. */
+	if (ext4_has_metadata_csum(sb))
+		set_opt(sb, JOURNAL_CHECKSUM);
+
 	if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
 		set_opt(sb, JOURNAL_DATA);
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
-- 
cgit v1.2.3


From 6b992ff25658367089db4a82666e232b65d55eae Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 30 Oct 2014 10:53:16 -0400
Subject: ext4: disallow changing journal_csum option during remount

ext4 does not permit changing the metadata or journal checksum feature
flag while mounted.  Until we decide to support that, don't allow a
remount to change the journal_csum flag (right now we silently fail to
change anything).

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5c11e2153e36..96059e0ef165 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4845,6 +4845,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		goto restore_opts;
 	}
 
+	if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
+	    test_opt(sb, JOURNAL_CHECKSUM)) {
+		ext4_msg(sb, KERN_ERR, "changing journal_checksum "
+			 "during remount not supported");
+		err = -EINVAL;
+		goto restore_opts;
+	}
+
 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
 		if (test_opt2(sb, EXPLICIT_DELALLOC)) {
 			ext4_msg(sb, KERN_ERR, "can't mount with "
-- 
cgit v1.2.3


From 50460fe8c6d1d95b16427936e351f277a1c72d43 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 30 Oct 2014 10:53:16 -0400
Subject: ext4: remove extent status procfs files if journal load fails

If we can't load the journal, remove the procfs files for the extent
status information file to avoid leaking resources.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/super.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 96059e0ef165..2c9e6864abd9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3947,7 +3947,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
 	    !(sb->s_flags & MS_RDONLY))
 		if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
-			goto failed_mount3;
+			goto failed_mount3a;
 
 	/*
 	 * The first inode we look at is the journal inode.  Don't try
@@ -3956,7 +3956,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (!test_opt(sb, NOLOAD) &&
 	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
 		if (ext4_load_journal(sb, es, journal_devnum))
-			goto failed_mount3;
+			goto failed_mount3a;
 	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
 	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 		ext4_msg(sb, KERN_ERR, "required journal recovery "
@@ -4244,6 +4244,7 @@ failed_mount_wq:
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
+failed_mount3a:
 	ext4_es_unregister_shrinker(sbi);
 failed_mount3:
 	del_timer_sync(&sbi->s_err_report);
-- 
cgit v1.2.3


From a41537e69b4aa43f0fea02498c2595a81267383b Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Thu, 30 Oct 2014 10:53:16 -0400
Subject: ext4: prevent bugon on race between write/fcntl

O_DIRECT flags can be toggeled via fcntl(F_SETFL). But this value checked
twice inside ext4_file_write_iter() and __generic_file_write() which
result in BUG_ON inside ext4_direct_IO.

Let's initialize iocb->private unconditionally.

TESTCASE: xfstest:generic/036  https://patchwork.ozlabs.org/patch/402445/

#TYPICAL STACK TRACE:
kernel BUG at fs/ext4/inode.c:2960!
invalid opcode: 0000 [#1] SMP
Modules linked in: brd iTCO_wdt lpc_ich mfd_core igb ptp dm_mirror dm_region_hash dm_log dm_mod
CPU: 6 PID: 5505 Comm: aio-dio-fcntl-r Not tainted 3.17.0-rc2-00176-gff5c017 #161
Hardware name: Intel Corporation W2600CR/W2600CR, BIOS SE5C600.86B.99.99.x028.061320111235 06/13/2011
task: ffff88080e95a7c0 ti: ffff88080f908000 task.ti: ffff88080f908000
RIP: 0010:[<ffffffff811fabf2>]  [<ffffffff811fabf2>] ext4_direct_IO+0x162/0x3d0
RSP: 0018:ffff88080f90bb58  EFLAGS: 00010246
RAX: 0000000000000400 RBX: ffff88080fdb2a28 RCX: 00000000a802c818
RDX: 0000040000080000 RSI: ffff88080d8aeb80 RDI: 0000000000000001
RBP: ffff88080f90bbc8 R08: 0000000000000000 R09: 0000000000001581
R10: 0000000000000000 R11: 0000000000000000 R12: ffff88080d8aeb80
R13: ffff88080f90bbf8 R14: ffff88080fdb28c8 R15: ffff88080fdb2a28
FS:  00007f23b2055700(0000) GS:ffff880818400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f23b2045000 CR3: 000000080cedf000 CR4: 00000000000407e0
Stack:
 ffff88080f90bb98 0000000000000000 7ffffffffffffffe ffff88080fdb2c30
 0000000000000200 0000000000000200 0000000000000001 0000000000000200
 ffff88080f90bbc8 ffff88080fdb2c30 ffff88080f90be08 0000000000000200
Call Trace:
 [<ffffffff8112ca9d>] generic_file_direct_write+0xed/0x180
 [<ffffffff8112f2b2>] __generic_file_write_iter+0x222/0x370
 [<ffffffff811f495b>] ext4_file_write_iter+0x34b/0x400
 [<ffffffff811bd709>] ? aio_run_iocb+0x239/0x410
 [<ffffffff811bd709>] ? aio_run_iocb+0x239/0x410
 [<ffffffff810990e5>] ? local_clock+0x25/0x30
 [<ffffffff810abd94>] ? __lock_acquire+0x274/0x700
 [<ffffffff811f4610>] ? ext4_unwritten_wait+0xb0/0xb0
 [<ffffffff811bd756>] aio_run_iocb+0x286/0x410
 [<ffffffff810990e5>] ? local_clock+0x25/0x30
 [<ffffffff810ac359>] ? lock_release_holdtime+0x29/0x190
 [<ffffffff811bc05b>] ? lookup_ioctx+0x4b/0xf0
 [<ffffffff811bde3b>] do_io_submit+0x55b/0x740
 [<ffffffff811bdcaa>] ? do_io_submit+0x3ca/0x740
 [<ffffffff811be030>] SyS_io_submit+0x10/0x20
 [<ffffffff815ce192>] system_call_fastpath+0x16/0x1b
Code: 01 48 8b 80 f0 01 00 00 48 8b 18 49 8b 45 10 0f 85 f1 01 00 00 48 03 45 c8 48 3b 43 48 0f 8f e3 01 00 00 49 83 7c
24 18 00 75 04 <0f> 0b eb fe f0 ff 83 ec 01 00 00 49 8b 44 24 18 8b 00 85 c0 89
RIP  [<ffffffff811fabf2>] ext4_direct_IO+0x162/0x3d0
 RSP <ffff88080f90bb58>

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Cc: stable@vger.kernel.org
---
 fs/ext4/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index aca7b24a4432..8131be8c0af3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -137,10 +137,10 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
 	}
 
+	iocb->private = &overwrite;
 	if (o_direct) {
 		blk_start_plug(&plug);
 
-		iocb->private = &overwrite;
 
 		/* check whether we do a DIO overwrite or not */
 		if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
-- 
cgit v1.2.3


From d48458d4a768cece43f80a081a26cf912877da9c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 30 Oct 2014 10:53:17 -0400
Subject: jbd2: use a better hash function for the revoke table

The old hash function didn't work well for 64-bit block numbers, and
used undefined (negative) shift right behavior.  Use the generic
64-bit hash function instead.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reported-by: Andrey Ryabinin <a.ryabinin@samsung.com>
---
 fs/jbd2/revoke.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index d5e95a175c92..c6cbaef2bda1 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -92,6 +92,7 @@
 #include <linux/init.h>
 #include <linux/bio.h>
 #include <linux/log2.h>
+#include <linux/hash.h>
 #endif
 
 static struct kmem_cache *jbd2_revoke_record_cache;
@@ -130,16 +131,9 @@ static void flush_descriptor(journal_t *, struct buffer_head *, int, int);
 
 /* Utility functions to maintain the revoke table */
 
-/* Borrowed from buffer.c: this is a tried and tested block hash function */
 static inline int hash(journal_t *journal, unsigned long long block)
 {
-	struct jbd2_revoke_table_s *table = journal->j_revoke;
-	int hash_shift = table->hash_shift;
-	int hash = (int)block ^ (int)((block >> 31) >> 1);
-
-	return ((hash << (hash_shift - 6)) ^
-		(hash >> 13) ^
-		(hash << (hash_shift - 12))) & (table->hash_size - 1);
+	return hash_64(block, journal->j_revoke->hash_shift);
 }
 
 static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr,
-- 
cgit v1.2.3


From 6050d47adcadbb53582434d919ed7f038d936712 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Oct 2014 10:53:17 -0400
Subject: ext4: bail out from make_indexed_dir() on first error

When ext4_handle_dirty_dx_node() or ext4_handle_dirty_dirent_node()
fail, there's really something wrong with the fs and there's no point in
continuing further. Just return error from make_indexed_dir() in that
case. Also initialize frames array so that if we return early due to
error, dx_release() doesn't try to dereference uninitialized memory
(which could happen also due to error in do_split()).

Coverity-id: 741300
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/namei.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 123798c5ac31..426211882f72 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1816,31 +1816,39 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 		hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
 	hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 	ext4fs_dirhash(name, namelen, &hinfo);
+	memset(frames, 0, sizeof(frames));
 	frame = frames;
 	frame->entries = entries;
 	frame->at = entries;
 	frame->bh = bh;
 	bh = bh2;
 
-	ext4_handle_dirty_dx_node(handle, dir, frame->bh);
-	ext4_handle_dirty_dirent_node(handle, dir, bh);
+	retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
+	if (retval)
+		goto out_frames;	
+	retval = ext4_handle_dirty_dirent_node(handle, dir, bh);
+	if (retval)
+		goto out_frames;	
 
 	de = do_split(handle,dir, &bh, frame, &hinfo);
 	if (IS_ERR(de)) {
-		/*
-		 * Even if the block split failed, we have to properly write
-		 * out all the changes we did so far. Otherwise we can end up
-		 * with corrupted filesystem.
-		 */
-		ext4_mark_inode_dirty(handle, dir);
-		dx_release(frames);
-		return PTR_ERR(de);
+		retval = PTR_ERR(de);
+		goto out_frames;
 	}
 	dx_release(frames);
 
 	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
 	brelse(bh);
 	return retval;
+out_frames:
+	/*
+	 * Even if the block split failed, we have to properly write
+	 * out all the changes we did so far. Otherwise we can end up
+	 * with corrupted filesystem.
+	 */
+	ext4_mark_inode_dirty(handle, dir);
+	dx_release(frames);
+	return retval;
 }
 
 /*
-- 
cgit v1.2.3


From 4f879ca687a5f2473b952937ce92c795a39019b4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Oct 2014 10:53:17 -0400
Subject: ext4: bail early when clearing inode journal flag fails

When clearing inode journal flag, we call jbd2_journal_flush() to force
all the journalled data to their final locations. Currently we ignore
when this fails and continue clearing inode journal flag. This isn't a
big problem because when jbd2_journal_flush() fails, journal is likely
aborted anyway. But it can still lead to somewhat confusing results so
rather bail out early.

Coverity-id: 989044
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inode.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e9777f93cf05..3356ab5395f4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4959,7 +4959,12 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	if (val)
 		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	else {
-		jbd2_journal_flush(journal);
+		err = jbd2_journal_flush(journal);
+		if (err < 0) {
+			jbd2_journal_unlock_updates(journal);
+			ext4_inode_resume_unlocked_dio(inode);
+			return err;
+		}
 		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	}
 	ext4_set_aops(inode);
-- 
cgit v1.2.3


From ae9e9c6aeea6f91ccb4fb369d7dd8f1a8b5f6a58 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 30 Oct 2014 10:53:17 -0400
Subject: ext4: make ext4_ext_convert_to_initialized() return proper number of
 blocks

ext4_ext_convert_to_initialized() can return more blocks than are
actually allocated from map->m_lblk in case where initial part of the
on-disk extent is zeroed out. Luckily this doesn't have serious
consequences because the caller currently uses the return value
only to unmap metadata buffers. Anyway this is a data
corruption/exposure problem waiting to happen so fix it.

Coverity-id: 1226848
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/extents.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 37043d0b2be8..0b16fb4c06d3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3603,11 +3603,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		}
 	}
 
-	allocated = ext4_split_extent(handle, inode, ppath,
-				      &split_map, split_flag, flags);
-	if (allocated < 0)
-		err = allocated;
-
+	err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag,
+				flags);
+	if (err > 0)
+		err = 0;
 out:
 	/* If we have gotten a failure, don't zero out status tree */
 	if (!err)
-- 
cgit v1.2.3


From 69a91c237ab0ebe4e9fdeaf6d0090c85275594ec Mon Sep 17 00:00:00 2001
From: Eric Rannaud <e@nanocritical.com>
Date: Thu, 30 Oct 2014 01:51:01 -0700
Subject: fs: allow open(dir, O_TMPFILE|..., 0) with mode 0

The man page for open(2) indicates that when O_CREAT is specified, the
'mode' argument applies only to future accesses to the file:

	Note that this mode applies only to future accesses of the newly
	created file; the open() call that creates a read-only file
	may well return a read/write file descriptor.

The man page for open(2) implies that 'mode' is treated identically by
O_CREAT and O_TMPFILE.

O_TMPFILE, however, behaves differently:

	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0);
	assert(fd == -1);
	assert(errno == EACCES);

	int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);
	assert(fd > 0);

For O_CREAT, do_last() sets acc_mode to MAY_OPEN only:

	if (*opened & FILE_CREATED) {
		/* Don't check for write permission, don't truncate */
		open_flag &= ~O_TRUNC;
		will_truncate = false;
		acc_mode = MAY_OPEN;
		path_to_nameidata(path, nd);
		goto finish_open_created;
	}

But for O_TMPFILE, do_tmpfile() passes the full op->acc_mode to
may_open().

This patch lines up the behavior of O_TMPFILE with O_CREAT. After the
inode is created, may_open() is called with acc_mode = MAY_OPEN, in
do_tmpfile().

A different, but related glibc bug revealed the discrepancy:
https://sourceware.org/bugzilla/show_bug.cgi?id=17523

The glibc lazily loads the 'mode' argument of open() and openat() using
va_arg() only if O_CREAT is present in 'flags' (to support both the 2
argument and the 3 argument forms of open; same idea for openat()).
However, the glibc ignores the 'mode' argument if O_TMPFILE is in
'flags'.

On x86_64, for open(), it magically works anyway, as 'mode' is in
RDX when entering open(), and is still in RDX on SYSCALL, which is where
the kernel looks for the 3rd argument of a syscall.

But openat() is not quite so lucky: 'mode' is in RCX when entering the
glibc wrapper for openat(), while the kernel looks for the 4th argument
of a syscall in R10. Indeed, the syscall calling convention differs from
the regular calling convention in this respect on x86_64. So the kernel
sees mode = 0 when trying to use glibc openat() with O_TMPFILE, and
fails with EACCES.

Signed-off-by: Eric Rannaud <e@nanocritical.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 42df664e95e5..78512898d3ba 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3154,7 +3154,8 @@ static int do_tmpfile(int dfd, struct filename *pathname,
 	if (error)
 		goto out2;
 	audit_inode(pathname, nd->path.dentry, 0);
-	error = may_open(&nd->path, op->acc_mode, op->open_flag);
+	/* Don't check for other permissions, the inode was just created */
+	error = may_open(&nd->path, MAY_OPEN, op->open_flag);
 	if (error)
 		goto out2;
 	file->f_path.mnt = nd->path.mnt;
-- 
cgit v1.2.3


From 6e5aafb27419f32575b27ef9d6a31e5d54661aca Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Tue, 4 Nov 2014 06:59:04 -0800
Subject: Btrfs: fix kfree on list_head in btrfs_lookup_csums_range error
 cleanup

If we hit any errors in btrfs_lookup_csums_range, we'll loop through all
the csums we allocate and free them.  But the code was using list_entry
incorrectly, and ended up trying to free the on-stack list_head instead.

This bug came from commit 0678b6185

btrfs: Don't BUG_ON kzalloc error in btrfs_lookup_csums_range()

Signed-off-by: Chris Mason <clm@fb.com>
Reported-by: Erik Berg <btrfs@slipsprogrammoer.no>
cc: stable@vger.kernel.org # 3.3 or newer
---
 fs/btrfs/file-item.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 783a94355efd..84a2d1868271 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -413,7 +413,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	ret = 0;
 fail:
 	while (ret < 0 && !list_empty(&tmplist)) {
-		sums = list_entry(&tmplist, struct btrfs_ordered_sum, list);
+		sums = list_entry(tmplist.next, struct btrfs_ordered_sum, list);
 		list_del(&sums->list);
 		kfree(sums);
 	}
-- 
cgit v1.2.3


From 809fd143de8805970eec02c27c0bc2622a6ecbda Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 23 Oct 2014 19:33:14 +0300
Subject: NFSv4: Ensure nfs_atomic_open set the dentry verifier on ENOENT

If the OPEN rpc call to the server fails with an ENOENT call, nfs_atomic_open
will create a negative dentry for that file, however it currently fails
to call nfs_set_verifier(), thus causing the dentry to be immediately
revalidated on the next call to nfs_lookup_revalidate() instead of following
the usual lookup caching rules.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/dir.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 06e8cfcbb670..6e62155abf26 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1527,6 +1527,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 		case -ENOENT:
 			d_drop(dentry);
 			d_add(dentry, NULL);
+			nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 			break;
 		case -EISDIR:
 		case -ENOTDIR:
-- 
cgit v1.2.3


From 7488cbc2568391d5e0b2bda8902a96b5dd7b1ea7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 23 Oct 2014 19:22:31 +0300
Subject: Revert "NFS: remove BUG possibility in nfs4_open_and_get_state"

This reverts commit f39c01047994e66e7f3d89ddb4c6141f23349d8d.
---
 fs/nfs/nfs4proc.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 405bd95c1f58..8026197e2b9f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2233,13 +2233,9 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 	ret = _nfs4_proc_open(opendata);
 	if (ret != 0) {
 		if (ret == -ENOENT) {
-			dentry = opendata->dentry;
-			if (dentry->d_inode)
-				d_delete(dentry);
-			else if (d_unhashed(dentry))
-				d_add(dentry, NULL);
-
-			nfs_set_verifier(dentry,
+			d_drop(opendata->dentry);
+			d_add(opendata->dentry, NULL);
+			nfs_set_verifier(opendata->dentry,
 					 nfs_save_change_attribute(opendata->dir->d_inode));
 		}
 		goto out;
-- 
cgit v1.2.3


From dca780016dab84d6ac500b1d84fdfe1628802a59 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 23 Oct 2014 19:23:03 +0300
Subject: Revert "NFS: nfs4_do_open should add negative results to the dcache."

This reverts commit 4fa2c54b5198d09607a534e2fd436581064587ed.
---
 fs/nfs/nfs4proc.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8026197e2b9f..41b8fcbfdadd 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2231,15 +2231,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 	seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
 
 	ret = _nfs4_proc_open(opendata);
-	if (ret != 0) {
-		if (ret == -ENOENT) {
-			d_drop(opendata->dentry);
-			d_add(opendata->dentry, NULL);
-			nfs_set_verifier(opendata->dentry,
-					 nfs_save_change_attribute(opendata->dir->d_inode));
-		}
+	if (ret != 0)
 		goto out;
-	}
 
 	state = nfs4_opendata_to_nfs4_state(opendata);
 	ret = PTR_ERR(state);
-- 
cgit v1.2.3


From 3f822c6264954660babce757fb45792fd3af273e Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Tue, 4 Nov 2014 16:11:03 +0100
Subject: ovl: don't poison cursor

ovl_cache_put() can be called from ovl_dir_reset() if the cache needs to be
rebuilt.  We did list_del() on the cursor, which results in an Oops on the
poisoned pointer in ovl_seek_cursor().

Reported-by: Jordi Pujol Palomer <jordipujolp@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Tested-by: Jordi Pujol Palomer <jordipujolp@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/overlayfs/readdir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 4e9d7c1fea52..2a7ef4f8e2a6 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -168,7 +168,7 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
 {
 	struct ovl_dir_cache *cache = od->cache;
 
-	list_del(&od->cursor.l_node);
+	list_del_init(&od->cursor.l_node);
 	WARN_ON(cache->refcount <= 0);
 	cache->refcount--;
 	if (!cache->refcount) {
-- 
cgit v1.2.3


From 7e8631e8b9d4e9f698c09c7e7309c96249180ff9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 5 Nov 2014 15:18:29 -0500
Subject: fix breakage in o2net_send_tcp_msg()

uninitialized msghdr.  Broken in "ocfs2: don't open-code kernel_recvmsg()"
by me ;-/

Cc: stable@vger.kernel.org # 3.15+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ocfs2/cluster/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 97de0fbd9f78..a96044004064 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -925,7 +925,7 @@ static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
 			      size_t veclen, size_t total)
 {
 	int ret;
-	struct msghdr msg;
+	struct msghdr msg = {.msg_flags = 0,};
 
 	if (sock == NULL) {
 		ret = -EINVAL;
-- 
cgit v1.2.3


From afa947cb52a8e73fe71915a0b0af6fcf98dfbe1a Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 7 Nov 2014 08:29:57 +1100
Subject: xfs: bulkstat btree walk doesn't terminate

The bulkstat code has several different ways of detecting the end of
an AG when doing a walk. They are not consistently detected, and the
code that checks for the end of AG conditions is not consistently
coded. Hence the are conditions where the walk code can get stuck in
an endless loop making no progress and not triggering any
termination conditions.

Convert all the "tmp/i" status return codes from btree operations
to a common name (stat) and apply end-of-ag detection to these
operations consistently.

cc: <stable@vger.kernel.org> # 3.17
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_itable.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 7765ff743e91..16737cbbee17 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -356,7 +356,6 @@ xfs_bulkstat(
 	int			end_of_ag; /* set if we've seen the ag end */
 	int			error;	/* error code */
 	int                     fmterror;/* bulkstat formatter result */
-	int			i;	/* loop index */
 	int			icount;	/* count of inodes good in irbuf */
 	size_t			irbsize; /* size of irec buffer in bytes */
 	xfs_ino_t		ino;	/* inode number (filesystem) */
@@ -366,11 +365,11 @@ xfs_bulkstat(
 	xfs_ino_t		lastino; /* last inode number returned */
 	int			nirbuf;	/* size of irbuf */
 	int			rval;	/* return value error code */
-	int			tmp;	/* result value from btree calls */
 	int			ubcount; /* size of user's buffer */
 	int			ubleft;	/* bytes left in user's buffer */
 	char			__user *ubufp;	/* pointer into user's buffer */
 	int			ubelem;	/* spaces used in user's buffer */
+	int			stat;
 
 	/*
 	 * Get the last inode value, see if there's nothing to do.
@@ -436,13 +435,15 @@ xfs_bulkstat(
 				agino = r.ir_startino + XFS_INODES_PER_CHUNK;
 			}
 			/* Increment to the next record */
-			error = xfs_btree_increment(cur, 0, &tmp);
+			error = xfs_btree_increment(cur, 0, &stat);
 		} else {
 			/* Start of ag.  Lookup the first inode chunk */
-			error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
+			error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
 		}
-		if (error)
+		if (error || stat == 0) {
+			end_of_ag = 1;
 			goto del_cursor;
+		}
 
 		/*
 		 * Loop through inode btree records in this ag,
@@ -451,8 +452,8 @@ xfs_bulkstat(
 		while (irbp < irbufend && icount < ubcount) {
 			struct xfs_inobt_rec_incore	r;
 
-			error = xfs_inobt_get_rec(cur, &r, &i);
-			if (error || i == 0) {
+			error = xfs_inobt_get_rec(cur, &r, &stat);
+			if (error || stat == 0) {
 				end_of_ag = 1;
 				goto del_cursor;
 			}
@@ -473,8 +474,8 @@ xfs_bulkstat(
 			 * Set agino to after this chunk and bump the cursor.
 			 */
 			agino = r.ir_startino + XFS_INODES_PER_CHUNK;
-			error = xfs_btree_increment(cur, 0, &tmp);
-			if (error) {
+			error = xfs_btree_increment(cur, 0, &stat);
+			if (error || stat == 0) {
 				end_of_ag = 1;
 				goto del_cursor;
 			}
-- 
cgit v1.2.3


From bf4a5af20d25ecc8876978ad34b8db83b4235f3c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 7 Nov 2014 08:30:30 +1100
Subject: xfs: bulkstat chunk formatting cursor is broken

The xfs_bulkstat_agichunk formatting cursor takes buffer values from
the main loop and passes them via the structure to the chunk
formatter, and the writes the changed values back into the main loop
local variables. Unfortunately, this complex dance is full of corner
cases that aren't handled correctly.

The biggest problem is that it is double handling the information in
both the main loop and the chunk formatting function, leading to
inconsistent updates and endless loops where progress is not made.

To fix this, push the struct xfs_bulkstat_agichunk outwards to be
the primary holder of user buffer information. this removes the
double handling in the main loop.

Also, pass the last inode processed by the chunk formatter as a
separate parameter as it purely an output variable and is not
related to the user buffer consumption cursor.

Finally, the chunk formatting code is not shared by anyone, so make
it local to xfs_itable.c.

cc: <stable@vger.kernel.org> # 3.17
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_itable.c | 59 +++++++++++++++++++++++++----------------------------
 fs/xfs/xfs_itable.h | 16 ---------------
 2 files changed, 28 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 16737cbbee17..50a3e5995dd9 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -262,20 +262,26 @@ xfs_bulkstat_grab_ichunk(
 
 #define XFS_BULKSTAT_UBLEFT(ubleft)	((ubleft) >= statstruct_size)
 
+struct xfs_bulkstat_agichunk {
+	char		__user **ac_ubuffer;/* pointer into user's buffer */
+	int		ac_ubleft;	/* bytes left in user's buffer */
+	int		ac_ubelem;	/* spaces used in user's buffer */
+};
+
 /*
  * Process inodes in chunk with a pointer to a formatter function
  * that will iget the inode and fill in the appropriate structure.
  */
-int
+static int
 xfs_bulkstat_ag_ichunk(
 	struct xfs_mount		*mp,
 	xfs_agnumber_t			agno,
 	struct xfs_inobt_rec_incore	*irbp,
 	bulkstat_one_pf			formatter,
 	size_t				statstruct_size,
-	struct xfs_bulkstat_agichunk	*acp)
+	struct xfs_bulkstat_agichunk	*acp,
+	xfs_ino_t			*lastino)
 {
-	xfs_ino_t			lastino = acp->ac_lastino;
 	char				__user **ubufp = acp->ac_ubuffer;
 	int				ubleft = acp->ac_ubleft;
 	int				ubelem = acp->ac_ubelem;
@@ -295,7 +301,7 @@ xfs_bulkstat_ag_ichunk(
 
 		/* Skip if this inode is free */
 		if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
-			lastino = ino;
+			*lastino = ino;
 			continue;
 		}
 
@@ -313,7 +319,7 @@ xfs_bulkstat_ag_ichunk(
 				ubleft = 0;
 				break;
 			}
-			lastino = ino;
+			*lastino = ino;
 			continue;
 		}
 		if (fmterror == BULKSTAT_RV_GIVEUP) {
@@ -325,10 +331,9 @@ xfs_bulkstat_ag_ichunk(
 			*ubufp += ubused;
 		ubleft -= ubused;
 		ubelem++;
-		lastino = ino;
+		*lastino = ino;
 	}
 
-	acp->ac_lastino = lastino;
 	acp->ac_ubleft = ubleft;
 	acp->ac_ubelem = ubelem;
 
@@ -355,7 +360,6 @@ xfs_bulkstat(
 	xfs_btree_cur_t		*cur;	/* btree cursor for ialloc btree */
 	int			end_of_ag; /* set if we've seen the ag end */
 	int			error;	/* error code */
-	int                     fmterror;/* bulkstat formatter result */
 	int			icount;	/* count of inodes good in irbuf */
 	size_t			irbsize; /* size of irec buffer in bytes */
 	xfs_ino_t		ino;	/* inode number (filesystem) */
@@ -366,10 +370,8 @@ xfs_bulkstat(
 	int			nirbuf;	/* size of irbuf */
 	int			rval;	/* return value error code */
 	int			ubcount; /* size of user's buffer */
-	int			ubleft;	/* bytes left in user's buffer */
-	char			__user *ubufp;	/* pointer into user's buffer */
-	int			ubelem;	/* spaces used in user's buffer */
 	int			stat;
+	struct xfs_bulkstat_agichunk ac;
 
 	/*
 	 * Get the last inode value, see if there's nothing to do.
@@ -386,11 +388,13 @@ xfs_bulkstat(
 	}
 
 	ubcount = *ubcountp; /* statstruct's */
-	ubleft = ubcount * statstruct_size; /* bytes */
-	*ubcountp = ubelem = 0;
+	ac.ac_ubuffer = &ubuffer;
+	ac.ac_ubleft = ubcount * statstruct_size; /* bytes */;
+	ac.ac_ubelem = 0;
+
+	*ubcountp = 0;
 	*done = 0;
-	fmterror = 0;
-	ubufp = ubuffer;
+
 	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
 	if (!irbuf)
 		return -ENOMEM;
@@ -402,7 +406,7 @@ xfs_bulkstat(
 	 * inode returned; 0 means start of the allocation group.
 	 */
 	rval = 0;
-	while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
+	while (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft) && agno < mp->m_sb.sb_agcount) {
 		cond_resched();
 		error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
 		if (error)
@@ -497,28 +501,21 @@ del_cursor:
 		 */
 		irbufend = irbp;
 		for (irbp = irbuf;
-		     irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) {
-			struct xfs_bulkstat_agichunk ac;
-
-			ac.ac_lastino = lastino;
-			ac.ac_ubuffer = &ubuffer;
-			ac.ac_ubleft = ubleft;
-			ac.ac_ubelem = ubelem;
+		     irbp < irbufend && XFS_BULKSTAT_UBLEFT(ac.ac_ubleft);
+		     irbp++) {
 			error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
-					formatter, statstruct_size, &ac);
+					formatter, statstruct_size, &ac,
+					&lastino);
 			if (error)
 				rval = error;
 
-			lastino = ac.ac_lastino;
-			ubleft = ac.ac_ubleft;
-			ubelem = ac.ac_ubelem;
-
 			cond_resched();
 		}
+
 		/*
 		 * Set up for the next loop iteration.
 		 */
-		if (XFS_BULKSTAT_UBLEFT(ubleft)) {
+		if (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft)) {
 			if (end_of_ag) {
 				agno++;
 				agino = 0;
@@ -531,11 +528,11 @@ del_cursor:
 	 * Done, we're either out of filesystem or space to put the data.
 	 */
 	kmem_free(irbuf);
-	*ubcountp = ubelem;
+	*ubcountp = ac.ac_ubelem;
 	/*
 	 * Found some inodes, return them now and return the error next time.
 	 */
-	if (ubelem)
+	if (ac.ac_ubelem)
 		rval = 0;
 	if (agno >= mp->m_sb.sb_agcount) {
 		/*
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index aaed08022eb9..6ea8b3912fa4 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -30,22 +30,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount	*mp,
 			       int		*ubused,
 			       int		*stat);
 
-struct xfs_bulkstat_agichunk {
-	xfs_ino_t	ac_lastino;	/* last inode returned */
-	char		__user **ac_ubuffer;/* pointer into user's buffer */
-	int		ac_ubleft;	/* bytes left in user's buffer */
-	int		ac_ubelem;	/* spaces used in user's buffer */
-};
-
-int
-xfs_bulkstat_ag_ichunk(
-	struct xfs_mount		*mp,
-	xfs_agnumber_t			agno,
-	struct xfs_inobt_rec_incore	*irbp,
-	bulkstat_one_pf			formatter,
-	size_t				statstruct_size,
-	struct xfs_bulkstat_agichunk	*acp);
-
 /*
  * Values for stat return value.
  */
-- 
cgit v1.2.3


From 2b831ac6bc87d3cbcbb1a8816827b6923403e461 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 7 Nov 2014 08:30:58 +1100
Subject: xfs: bulkstat chunk-formatter has issues

The loop construct has issues:
	- clustidx is completely unused, so remove it.
	- the loop tries to be smart by terminating when the
	  "freecount" tells it that all inodes are free. Just drop
	  it as in most cases we have to scan all inodes in the
	  chunk anyway.
	- move the "user buffer left" condition check to the only
	  point where we consume space int eh user buffer.
	- move the initialisation of agino out of the loop, leaving
	  just a simple loop control logic using the clusteridx.

Also, double handling of the user buffer variables leads to problems
tracking the current state - use the cursor variables directly
rather than keeping local copies and then having to update the
cursor before returning.

cc: <stable@vger.kernel.org> # 3.17
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_itable.c | 58 ++++++++++++++++++++++-------------------------------
 1 file changed, 24 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 50a3e5995dd9..7ea2b113db1b 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -283,59 +283,49 @@ xfs_bulkstat_ag_ichunk(
 	xfs_ino_t			*lastino)
 {
 	char				__user **ubufp = acp->ac_ubuffer;
-	int				ubleft = acp->ac_ubleft;
-	int				ubelem = acp->ac_ubelem;
-	int				chunkidx, clustidx;
+	int				chunkidx;
 	int				error = 0;
 	xfs_agino_t			agino;
 
-	for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
-	     XFS_BULKSTAT_UBLEFT(ubleft) &&
-	     irbp->ir_freecount < XFS_INODES_PER_CHUNK;
-	     chunkidx++, clustidx++, agino++) {
-		int		fmterror;	/* bulkstat formatter result */
+	agino = irbp->ir_startino;
+	for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK;
+	     chunkidx++, agino++) {
+		int		fmterror;
 		int		ubused;
 		xfs_ino_t	ino = XFS_AGINO_TO_INO(mp, agno, agino);
 
-		ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
-
 		/* Skip if this inode is free */
 		if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
 			*lastino = ino;
 			continue;
 		}
 
-		/*
-		 * Count used inodes as free so we can tell when the
-		 * chunk is used up.
-		 */
-		irbp->ir_freecount++;
-
 		/* Get the inode and fill in a single buffer */
 		ubused = statstruct_size;
-		error = formatter(mp, ino, *ubufp, ubleft, &ubused, &fmterror);
-		if (fmterror == BULKSTAT_RV_NOTHING) {
-			if (error && error != -ENOENT && error != -EINVAL) {
-				ubleft = 0;
-				break;
-			}
-			*lastino = ino;
-			continue;
-		}
-		if (fmterror == BULKSTAT_RV_GIVEUP) {
-			ubleft = 0;
+		error = formatter(mp, ino, *ubufp, acp->ac_ubleft,
+				  &ubused, &fmterror);
+		if (fmterror == BULKSTAT_RV_GIVEUP ||
+		    (error && error != -ENOENT && error != -EINVAL)) {
+			acp->ac_ubleft = 0;
 			ASSERT(error);
 			break;
 		}
-		if (*ubufp)
-			*ubufp += ubused;
-		ubleft -= ubused;
-		ubelem++;
+
+		/* be careful not to leak error if at end of chunk */
+		if (fmterror == BULKSTAT_RV_NOTHING || error) {
+			*lastino = ino;
+			error = 0;
+			continue;
+		}
+
+		*ubufp += ubused;
+		acp->ac_ubleft -= ubused;
+		acp->ac_ubelem++;
 		*lastino = ino;
-	}
 
-	acp->ac_ubleft = ubleft;
-	acp->ac_ubelem = ubelem;
+		if (acp->ac_ubleft < statstruct_size)
+			break;
+	}
 
 	return error;
 }
-- 
cgit v1.2.3


From 6e57c542cb7e0e580eb53ae76a77875c7d92b4b1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 7 Nov 2014 08:31:13 +1100
Subject: xfs: bulkstat main loop logic is a mess

There are a bunch of variables tha tare more wildy scoped than they
need to be, obfuscated user buffer checks and tortured "next inode"
tracking. This all needs cleaning up to expose the real issues that
need fixing.

cc: <stable@vger.kernel.org> # 3.17
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_itable.c | 56 +++++++++++++++++++++++------------------------------
 1 file changed, 24 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 7ea2b113db1b..acae3355ab22 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -348,30 +348,23 @@ xfs_bulkstat(
 	xfs_agino_t		agino;	/* inode # in allocation group */
 	xfs_agnumber_t		agno;	/* allocation group number */
 	xfs_btree_cur_t		*cur;	/* btree cursor for ialloc btree */
-	int			end_of_ag; /* set if we've seen the ag end */
-	int			error;	/* error code */
-	int			icount;	/* count of inodes good in irbuf */
 	size_t			irbsize; /* size of irec buffer in bytes */
-	xfs_ino_t		ino;	/* inode number (filesystem) */
-	xfs_inobt_rec_incore_t	*irbp;	/* current irec buffer pointer */
 	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
-	xfs_inobt_rec_incore_t	*irbufend; /* end of good irec buffer entries */
 	xfs_ino_t		lastino; /* last inode number returned */
 	int			nirbuf;	/* size of irbuf */
 	int			rval;	/* return value error code */
 	int			ubcount; /* size of user's buffer */
-	int			stat;
 	struct xfs_bulkstat_agichunk ac;
+	int			error = 0;
 
 	/*
 	 * Get the last inode value, see if there's nothing to do.
 	 */
-	ino = (xfs_ino_t)*lastinop;
-	lastino = ino;
-	agno = XFS_INO_TO_AGNO(mp, ino);
-	agino = XFS_INO_TO_AGINO(mp, ino);
+	lastino = *lastinop;
+	agno = XFS_INO_TO_AGNO(mp, lastino);
+	agino = XFS_INO_TO_AGINO(mp, lastino);
 	if (agno >= mp->m_sb.sb_agcount ||
-	    ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+	    lastino != XFS_AGINO_TO_INO(mp, agno, agino)) {
 		*done = 1;
 		*ubcountp = 0;
 		return 0;
@@ -396,8 +389,13 @@ xfs_bulkstat(
 	 * inode returned; 0 means start of the allocation group.
 	 */
 	rval = 0;
-	while (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft) && agno < mp->m_sb.sb_agcount) {
-		cond_resched();
+	while (agno < mp->m_sb.sb_agcount) {
+		struct xfs_inobt_rec_incore	*irbp = irbuf;
+		struct xfs_inobt_rec_incore	*irbufend = irbuf + nirbuf;
+		bool				end_of_ag = false;
+		int				icount = 0;
+		int				stat;
+
 		error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
 		if (error)
 			break;
@@ -407,10 +405,6 @@ xfs_bulkstat(
 		 */
 		cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
 					    XFS_BTNUM_INO);
-		irbp = irbuf;
-		irbufend = irbuf + nirbuf;
-		end_of_ag = 0;
-		icount = 0;
 		if (agino > 0) {
 			/*
 			 * In the middle of an allocation group, we need to get
@@ -435,7 +429,7 @@ xfs_bulkstat(
 			error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
 		}
 		if (error || stat == 0) {
-			end_of_ag = 1;
+			end_of_ag = true;
 			goto del_cursor;
 		}
 
@@ -448,7 +442,7 @@ xfs_bulkstat(
 
 			error = xfs_inobt_get_rec(cur, &r, &stat);
 			if (error || stat == 0) {
-				end_of_ag = 1;
+				end_of_ag = true;
 				goto del_cursor;
 			}
 
@@ -470,7 +464,7 @@ xfs_bulkstat(
 			agino = r.ir_startino + XFS_INODES_PER_CHUNK;
 			error = xfs_btree_increment(cur, 0, &stat);
 			if (error || stat == 0) {
-				end_of_ag = 1;
+				end_of_ag = true;
 				goto del_cursor;
 			}
 			cond_resched();
@@ -491,7 +485,7 @@ del_cursor:
 		 */
 		irbufend = irbp;
 		for (irbp = irbuf;
-		     irbp < irbufend && XFS_BULKSTAT_UBLEFT(ac.ac_ubleft);
+		     irbp < irbufend && ac.ac_ubleft >= statstruct_size;
 		     irbp++) {
 			error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
 					formatter, statstruct_size, &ac,
@@ -502,17 +496,15 @@ del_cursor:
 			cond_resched();
 		}
 
-		/*
-		 * Set up for the next loop iteration.
-		 */
-		if (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft)) {
-			if (end_of_ag) {
-				agno++;
-				agino = 0;
-			} else
-				agino = XFS_INO_TO_AGINO(mp, lastino);
-		} else
+		/* If we've run out of space, we are done */
+		if (ac.ac_ubleft < statstruct_size)
 			break;
+
+		if (end_of_ag) {
+			agno++;
+			agino = 0;
+		} else
+			agino = XFS_INO_TO_AGINO(mp, lastino);
 	}
 	/*
 	 * Done, we're either out of filesystem or space to put the data.
-- 
cgit v1.2.3


From febe3cbe38b0bc0a925906dc90e8d59048851f87 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 7 Nov 2014 08:31:15 +1100
Subject: xfs: bulkstat error handling is broken

The error propagation is a horror - xfs_bulkstat() returns
a rval variable which is only set if there are formatter errors. Any
sort of btree walk error or corruption will cause the bulkstat walk
to terminate but will not pass an error back to userspace. Worse
is the fact that formatter errors will also be ignored if any inodes
were correctly formatted into the user buffer.

Hence bulkstat can fail badly yet still report success to userspace.
This causes significant issues with xfsdump not dumping everything
in the filesystem yet reporting success. It's not until a restore
fails that there is any indication that the dump was bad and tha
bulkstat failed. This patch now triggers xfsdump to fail with
bulkstat errors rather than silently missing files in the dump.

This now causes bulkstat to fail when the lastino cookie does not
fall inside an existing inode chunk. The pre-3.17 code tolerated
that error by allowing the code to move to the next inode chunk
as the agino target is guaranteed to fall into the next btree
record.

With the fixes up to this point in the series, xfsdump now passes on
the troublesome filesystem image that exposes all these bugs.

cc: <stable@vger.kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/xfs_itable.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index acae3355ab22..ff3f431671b9 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -236,8 +236,10 @@ xfs_bulkstat_grab_ichunk(
 	XFS_WANT_CORRUPTED_RETURN(stat == 1);
 
 	/* Check if the record contains the inode in request */
-	if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
-		return -EINVAL;
+	if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
+		*icount = 0;
+		return 0;
+	}
 
 	idx = agino - irec->ir_startino + 1;
 	if (idx < XFS_INODES_PER_CHUNK &&
@@ -352,7 +354,6 @@ xfs_bulkstat(
 	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
 	xfs_ino_t		lastino; /* last inode number returned */
 	int			nirbuf;	/* size of irbuf */
-	int			rval;	/* return value error code */
 	int			ubcount; /* size of user's buffer */
 	struct xfs_bulkstat_agichunk ac;
 	int			error = 0;
@@ -388,7 +389,6 @@ xfs_bulkstat(
 	 * Loop over the allocation groups, starting from the last
 	 * inode returned; 0 means start of the allocation group.
 	 */
-	rval = 0;
 	while (agno < mp->m_sb.sb_agcount) {
 		struct xfs_inobt_rec_incore	*irbp = irbuf;
 		struct xfs_inobt_rec_incore	*irbufend = irbuf + nirbuf;
@@ -491,13 +491,16 @@ del_cursor:
 					formatter, statstruct_size, &ac,
 					&lastino);
 			if (error)
-				rval = error;
+				break;
 
 			cond_resched();
 		}
 
-		/* If we've run out of space, we are done */
-		if (ac.ac_ubleft < statstruct_size)
+		/*
+		 * If we've run out of space or had a formatting error, we
+		 * are now done
+		 */
+		if (ac.ac_ubleft < statstruct_size || error)
 			break;
 
 		if (end_of_ag) {
@@ -511,11 +514,17 @@ del_cursor:
 	 */
 	kmem_free(irbuf);
 	*ubcountp = ac.ac_ubelem;
+
 	/*
-	 * Found some inodes, return them now and return the error next time.
+	 * We found some inodes, so clear the error status and return them.
+	 * The lastino pointer will point directly at the inode that triggered
+	 * any error that occurred, so on the next call the error will be
+	 * triggered again and propagated to userspace as there will be no
+	 * formatted inodes in the buffer.
 	 */
 	if (ac.ac_ubelem)
-		rval = 0;
+		error = 0;
+
 	if (agno >= mp->m_sb.sb_agcount) {
 		/*
 		 * If we ran out of filesystem, mark lastino as off
@@ -527,7 +536,7 @@ del_cursor:
 	} else
 		*lastinop = (xfs_ino_t)lastino;
 
-	return rval;
+	return error;
 }
 
 int
-- 
cgit v1.2.3


From 002758992693ae63c04122603ea9261a0a58d728 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 7 Nov 2014 08:33:52 +1100
Subject: xfs: track bulkstat progress by agino

The bulkstat main loop progress is tracked by the "lastino"
variable, which is a full 64 bit inode. However, the loop actually
works on agno/agino pairs, and so there's a significant disconnect
between the rest of the loop and the main cursor. Convert this to
use the agino, and pass the agino into the chunk formatting function
and convert it too.

This gets rid of the inconsistency in the loop processing, and
finally makes it simple for us to skip inodes at any point in the
loop simply by incrementing the agino cursor.

cc: <stable@vger.kernel.org> # 3.17
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_itable.c | 71 +++++++++++++++++++++++++----------------------------
 1 file changed, 34 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index ff3f431671b9..894924a5129b 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -282,30 +282,31 @@ xfs_bulkstat_ag_ichunk(
 	bulkstat_one_pf			formatter,
 	size_t				statstruct_size,
 	struct xfs_bulkstat_agichunk	*acp,
-	xfs_ino_t			*lastino)
+	xfs_agino_t			*last_agino)
 {
 	char				__user **ubufp = acp->ac_ubuffer;
 	int				chunkidx;
 	int				error = 0;
-	xfs_agino_t			agino;
+	xfs_agino_t			agino = irbp->ir_startino;
 
-	agino = irbp->ir_startino;
 	for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK;
 	     chunkidx++, agino++) {
 		int		fmterror;
 		int		ubused;
-		xfs_ino_t	ino = XFS_AGINO_TO_INO(mp, agno, agino);
+
+		/* inode won't fit in buffer, we are done */
+		if (acp->ac_ubleft < statstruct_size)
+			break;
 
 		/* Skip if this inode is free */
-		if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
-			*lastino = ino;
+		if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free)
 			continue;
-		}
 
 		/* Get the inode and fill in a single buffer */
 		ubused = statstruct_size;
-		error = formatter(mp, ino, *ubufp, acp->ac_ubleft,
-				  &ubused, &fmterror);
+		error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino),
+				  *ubufp, acp->ac_ubleft, &ubused, &fmterror);
+
 		if (fmterror == BULKSTAT_RV_GIVEUP ||
 		    (error && error != -ENOENT && error != -EINVAL)) {
 			acp->ac_ubleft = 0;
@@ -315,7 +316,6 @@ xfs_bulkstat_ag_ichunk(
 
 		/* be careful not to leak error if at end of chunk */
 		if (fmterror == BULKSTAT_RV_NOTHING || error) {
-			*lastino = ino;
 			error = 0;
 			continue;
 		}
@@ -323,12 +323,18 @@ xfs_bulkstat_ag_ichunk(
 		*ubufp += ubused;
 		acp->ac_ubleft -= ubused;
 		acp->ac_ubelem++;
-		*lastino = ino;
-
-		if (acp->ac_ubleft < statstruct_size)
-			break;
 	}
 
+	/*
+	 * Post-update *last_agino. At this point, agino will always point one
+	 * inode past the last inode we processed successfully. Hence we
+	 * substract that inode when setting the *last_agino cursor so that we
+	 * return the correct cookie to userspace. On the next bulkstat call,
+	 * the inode under the lastino cookie will be skipped as we have already
+	 * processed it here.
+	 */
+	*last_agino = agino - 1;
+
 	return error;
 }
 
@@ -352,7 +358,6 @@ xfs_bulkstat(
 	xfs_btree_cur_t		*cur;	/* btree cursor for ialloc btree */
 	size_t			irbsize; /* size of irec buffer in bytes */
 	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
-	xfs_ino_t		lastino; /* last inode number returned */
 	int			nirbuf;	/* size of irbuf */
 	int			ubcount; /* size of user's buffer */
 	struct xfs_bulkstat_agichunk ac;
@@ -361,11 +366,10 @@ xfs_bulkstat(
 	/*
 	 * Get the last inode value, see if there's nothing to do.
 	 */
-	lastino = *lastinop;
-	agno = XFS_INO_TO_AGNO(mp, lastino);
-	agino = XFS_INO_TO_AGINO(mp, lastino);
+	agno = XFS_INO_TO_AGNO(mp, *lastinop);
+	agino = XFS_INO_TO_AGINO(mp, *lastinop);
 	if (agno >= mp->m_sb.sb_agcount ||
-	    lastino != XFS_AGINO_TO_INO(mp, agno, agino)) {
+	    *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) {
 		*done = 1;
 		*ubcountp = 0;
 		return 0;
@@ -420,7 +424,6 @@ xfs_bulkstat(
 				irbp->ir_freecount = r.ir_freecount;
 				irbp->ir_free = r.ir_free;
 				irbp++;
-				agino = r.ir_startino + XFS_INODES_PER_CHUNK;
 			}
 			/* Increment to the next record */
 			error = xfs_btree_increment(cur, 0, &stat);
@@ -458,10 +461,6 @@ xfs_bulkstat(
 				irbp++;
 				icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
 			}
-			/*
-			 * Set agino to after this chunk and bump the cursor.
-			 */
-			agino = r.ir_startino + XFS_INODES_PER_CHUNK;
 			error = xfs_btree_increment(cur, 0, &stat);
 			if (error || stat == 0) {
 				end_of_ag = true;
@@ -481,7 +480,9 @@ del_cursor:
 		if (error)
 			break;
 		/*
-		 * Now format all the good inodes into the user's buffer.
+		 * Now format all the good inodes into the user's buffer. The
+		 * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer
+		 * for the next loop iteration.
 		 */
 		irbufend = irbp;
 		for (irbp = irbuf;
@@ -489,7 +490,7 @@ del_cursor:
 		     irbp++) {
 			error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
 					formatter, statstruct_size, &ac,
-					&lastino);
+					&agino);
 			if (error)
 				break;
 
@@ -506,8 +507,7 @@ del_cursor:
 		if (end_of_ag) {
 			agno++;
 			agino = 0;
-		} else
-			agino = XFS_INO_TO_AGINO(mp, lastino);
+		}
 	}
 	/*
 	 * Done, we're either out of filesystem or space to put the data.
@@ -525,16 +525,13 @@ del_cursor:
 	if (ac.ac_ubelem)
 		error = 0;
 
-	if (agno >= mp->m_sb.sb_agcount) {
-		/*
-		 * If we ran out of filesystem, mark lastino as off
-		 * the end of the filesystem, so the next call
-		 * will return immediately.
-		 */
-		*lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0);
+	/*
+	 * If we ran out of filesystem, lastino will point off the end of
+	 * the filesystem so the next call will return immediately.
+	 */
+	*lastinop = XFS_AGINO_TO_INO(mp, agno, agino);
+	if (agno >= mp->m_sb.sb_agcount)
 		*done = 1;
-	} else
-		*lastinop = (xfs_ino_t)lastino;
 
 	return error;
 }
-- 
cgit v1.2.3


From 8c393f9a721c30a030049a680e1bf896669bb279 Mon Sep 17 00:00:00 2001
From: Peng Tao <tao.peng@primarydata.com>
Date: Wed, 5 Nov 2014 22:36:50 +0800
Subject: nfs: fix pnfs direct write memory leak

For pNFS direct writes, layout driver may dynamically allocate ds_cinfo.buckets.
So we need to take care to free them when freeing dreq.

Ideally this needs to be done inside layout driver where ds_cinfo.buckets
are allocated. But buckets are attached to dreq and reused across LD IO iterations.
So I feel it's OK to free them in the generic layer.

Cc: stable@vger.kernel.org [v3.4+]
Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/direct.c         |  1 +
 include/linux/nfs_xdr.h | 11 +++++++++++
 2 files changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 20cffc830468..10bf07280f4a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -266,6 +266,7 @@ static void nfs_direct_req_free(struct kref *kref)
 {
 	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
 
+	nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
 	if (dreq->l_ctx != NULL)
 		nfs_put_lock_context(dreq->l_ctx);
 	if (dreq->ctx != NULL)
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 983876f24aed..47ebb4fafd87 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1224,11 +1224,22 @@ struct nfs41_free_stateid_res {
 	unsigned int			status;
 };
 
+static inline void
+nfs_free_pnfs_ds_cinfo(struct pnfs_ds_commit_info *cinfo)
+{
+	kfree(cinfo->buckets);
+}
+
 #else
 
 struct pnfs_ds_commit_info {
 };
 
+static inline void
+nfs_free_pnfs_ds_cinfo(struct pnfs_ds_commit_info *cinfo)
+{
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
 #ifdef CONFIG_NFS_V4_2
-- 
cgit v1.2.3


From e0d4ed71ca0344494722a041780f004d2bcf0f11 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 26 Sep 2014 16:02:50 +0200
Subject: pnfs/blocklayout: serialize GETDEVICEINFO calls

The rpc_pipefs code isn't thread safe, leading to occasional use after
frees when running xfstests generic/241 (dbench).

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: http://lkml.kernel.org/r/1411740170-18611-2-git-send-email-hch@lst.de
Cc: stable@vger.kernel.org # 3.17.x
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/rpc_pipefs.c | 14 +++++++++-----
 fs/nfs/netns.h                  |  1 +
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index e966c023b1b7..acbf9ca4018c 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -65,17 +65,18 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
 
 	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
 
+	mutex_lock(&nn->bl_mutex);
 	bl_pipe_msg.bl_wq = &nn->bl_wq;
 
 	b->simple.len += 4;	/* single volume */
 	if (b->simple.len > PAGE_SIZE)
-		return -EIO;
+		goto out_unlock;
 
 	memset(msg, 0, sizeof(*msg));
 	msg->len = sizeof(*bl_msg) + b->simple.len;
 	msg->data = kzalloc(msg->len, gfp_mask);
 	if (!msg->data)
-		goto out;
+		goto out_free_data;
 
 	bl_msg = msg->data;
 	bl_msg->type = BL_DEVICE_MOUNT,
@@ -87,7 +88,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
 	rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
 	if (rc < 0) {
 		remove_wait_queue(&nn->bl_wq, &wq);
-		goto out;
+		goto out_free_data;
 	}
 
 	set_current_state(TASK_UNINTERRUPTIBLE);
@@ -97,12 +98,14 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
 	if (reply->status != BL_DEVICE_REQUEST_PROC) {
 		printk(KERN_WARNING "%s failed to decode device: %d\n",
 			__func__, reply->status);
-		goto out;
+		goto out_free_data;
 	}
 
 	dev = MKDEV(reply->major, reply->minor);
-out:
+out_free_data:
 	kfree(msg->data);
+out_unlock:
+	mutex_unlock(&nn->bl_mutex);
 	return dev;
 }
 
@@ -232,6 +235,7 @@ static int nfs4blocklayout_net_init(struct net *net)
 	struct nfs_net *nn = net_generic(net, nfs_net_id);
 	struct dentry *dentry;
 
+	mutex_init(&nn->bl_mutex);
 	init_waitqueue_head(&nn->bl_wq);
 	nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
 	if (IS_ERR(nn->bl_device_pipe))
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index ef221fb8a183..f0e06e4acbef 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -19,6 +19,7 @@ struct nfs_net {
 	struct rpc_pipe *bl_device_pipe;
 	struct bl_dev_msg bl_mount_reply;
 	wait_queue_head_t bl_wq;
+	struct mutex bl_mutex;
 	struct list_head nfs_client_list;
 	struct list_head nfs_volume_list;
 #if IS_ENABLED(CONFIG_NFS_V4)
-- 
cgit v1.2.3


From 16c9914069536c77ed358d94b6e247bdc464b7f0 Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@primarydata.com>
Date: Mon, 3 Nov 2014 15:19:45 -0500
Subject: nfs: remove spurious WARN_ON_ONCE in write path

This WARN_ON_ONCE was supposed to catch reference counting bugs, but can
trigger in inappropriate situations.

This was reproducible using NFSv2 on an architecture with 64K pages -- we
verified that it was not a reference counting bug and the warning was
safe to ignore.

Reported-by: Will Deacon <will.deacon@arm.com>
Tested-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/write.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 12493846a2d3..f83b02dc9166 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -715,8 +715,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 
 	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
 		nfs_release_request(req);
-	else
-		WARN_ON_ONCE(1);
 }
 
 static void
-- 
cgit v1.2.3


From b283f9445214d4d573906f919c70caccd27b74ea Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 21 Oct 2014 13:32:10 +0200
Subject: nfs: Remove bogus assignment

Commit 3a6fd1f004fc (pnfs/blocklayout: remove read-modify-write handling
in bl_write_pagelist) introduced a bogus assignment pg_index = pg_index
in variable initialization. AFAICS it's just a typo so remove it.
Spotted by Coverity (id 1248711).

CC: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/blocklayout/blocklayout.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 5228f201d3d5..4f46f7a05289 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -378,7 +378,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	loff_t offset = header->args.offset;
 	size_t count = header->args.count;
 	struct page **pages = header->args.pages;
-	int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+	int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
 	unsigned int pg_len;
 	struct blk_plug plug;
 	int i;
-- 
cgit v1.2.3


From 16caf5b6101d03335b386e77e9e14136f989be87 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 Oct 2014 14:02:47 +0200
Subject: nfs: Fix use of uninitialized variable in nfs_getattr()

Variable 'err' needn't be initialized when nfs_getattr() uses it to
check whether it should call generic_fillattr() or not. That can result
in spurious error returns. Initialize 'err' properly.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6388a59f2add..00689a8a85e4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -626,7 +626,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
 	int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
-	int err;
+	int err = 0;
 
 	trace_nfs_getattr_enter(inode);
 	/* Flush out writes to the server in order to update c/mtime.  */
-- 
cgit v1.2.3


From e983120e923aa1c5d2aaf528331c298c88f3ab85 Mon Sep 17 00:00:00 2001
From: Anna Schumaker <Anna.Schumaker@netapp.com>
Date: Wed, 22 Oct 2014 15:53:10 -0400
Subject: NFS: SEEK is an NFS v4.2 feature

Somehow the nfs_v4_1_minor_ops had the NFS_CAP_SEEK flag set, enabling
SEEK over v4.1.  This is wrong, and can make servers crash.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
Tested-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 41b8fcbfdadd..dca174ce8309 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -8397,8 +8397,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 		| NFS_CAP_CHANGE_ATTR
 		| NFS_CAP_POSIX_LOCK
 		| NFS_CAP_STATEID_NFSV41
-		| NFS_CAP_ATOMIC_OPEN_V1
-		| NFS_CAP_SEEK,
+		| NFS_CAP_ATOMIC_OPEN_V1,
 	.init_client = nfs41_init_client,
 	.shutdown_client = nfs41_shutdown_client,
 	.match_stateid = nfs41_match_stateid,
@@ -8420,7 +8419,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 		| NFS_CAP_CHANGE_ATTR
 		| NFS_CAP_POSIX_LOCK
 		| NFS_CAP_STATEID_NFSV41
-		| NFS_CAP_ATOMIC_OPEN_V1,
+		| NFS_CAP_ATOMIC_OPEN_V1
+		| NFS_CAP_SEEK,
 	.init_client = nfs41_init_client,
 	.shutdown_client = nfs41_shutdown_client,
 	.match_stateid = nfs41_match_stateid,
-- 
cgit v1.2.3


From 4dfd4f7af0afd201706ad186352ca423b0f17d4b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 17 Oct 2014 15:10:25 +0300
Subject: NFSv4: Ensure that we remove NFSv4.0 delegations when state has
 expired

NFSv4.0 does not have TEST_STATEID/FREE_STATEID functionality, so
unlike NFSv4.1, the recovery procedure when stateids have expired or
have been revoked requires us to just forget the delegation.

http://lkml.kernel.org/r/CAN-5tyHwG=Cn2Q9KsHWadewjpTTy_K26ee+UnSvHvG4192p-Xw@mail.gmail.com
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index dca174ce8309..bdd880bddba4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2109,6 +2109,28 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
 	return ret;
 }
 
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
+{
+	nfs_remove_bad_delegation(state->inode);
+	write_seqlock(&state->seqlock);
+	nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+	write_sequnlock(&state->seqlock);
+	clear_bit(NFS_DELEGATED_STATE, &state->flags);
+}
+
+static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
+{
+	if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
+		nfs_finish_clear_delegation_stateid(state);
+}
+
+static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
+{
+	/* NFSv4.0 doesn't allow for delegation recovery on open expire */
+	nfs40_clear_delegation_stateid(state);
+	return nfs4_open_expired(sp, state);
+}
+
 #if defined(CONFIG_NFS_V4_1)
 static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
 {
@@ -8330,7 +8352,7 @@ static const struct nfs4_state_recovery_ops nfs41_reboot_recovery_ops = {
 static const struct nfs4_state_recovery_ops nfs40_nograce_recovery_ops = {
 	.owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
 	.state_flag_bit	= NFS_STATE_RECLAIM_NOGRACE,
-	.recover_open	= nfs4_open_expired,
+	.recover_open	= nfs40_open_expired,
 	.recover_lock	= nfs4_lock_expired,
 	.establish_clid = nfs4_init_clientid,
 };
-- 
cgit v1.2.3


From 0c116cadd94b16b30b1dd90d38b2784d9b39b01a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Wed, 12 Nov 2014 14:44:49 -0500
Subject: NFSv4.1: nfs41_clear_delegation_stateid shouldn't trust
 NFS_DELEGATED_STATE

This patch removes the assumption made previously, that we only need to
check the delegation stateid when it matches the stateid on a cached
open.

If we believe that we hold a delegation for this file, then we must assume
that its stateid may have been revoked or expired too. If we don't test it
then our state recovery process may end up caching open/lock state in a
situation where it should not.
We therefore rename the function nfs41_clear_delegation_stateid as
nfs41_check_delegation_stateid, and change it to always run through the
delegation stateid test and recovery process as outlined in RFC5661.

http://lkml.kernel.org/r/CAN-5tyHwG=Cn2Q9KsHWadewjpTTy_K26ee+UnSvHvG4192p-Xw@mail.gmail.com
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 42 +++++++++++++++++-------------------------
 1 file changed, 17 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index bdd880bddba4..3b98fe752ef8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2132,45 +2132,37 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
 }
 
 #if defined(CONFIG_NFS_V4_1)
-static void nfs41_clear_delegation_stateid(struct nfs4_state *state)
+static void nfs41_check_delegation_stateid(struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
-	nfs4_stateid *stateid = &state->stateid;
+	nfs4_stateid stateid;
 	struct nfs_delegation *delegation;
-	struct rpc_cred *cred = NULL;
-	int status = -NFS4ERR_BAD_STATEID;
-
-	/* If a state reset has been done, test_stateid is unneeded */
-	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
-		return;
+	struct rpc_cred *cred;
+	int status;
 
 	/* Get the delegation credential for use by test/free_stateid */
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(state->inode)->delegation);
-	if (delegation != NULL &&
-	    nfs4_stateid_match(&delegation->stateid, stateid)) {
-		cred = get_rpccred(delegation->cred);
-		rcu_read_unlock();
-		status = nfs41_test_stateid(server, stateid, cred);
-		trace_nfs4_test_delegation_stateid(state, NULL, status);
-	} else
+	if (delegation == NULL) {
 		rcu_read_unlock();
+		return;
+	}
+
+	nfs4_stateid_copy(&stateid, &delegation->stateid);
+	cred = get_rpccred(delegation->cred);
+	rcu_read_unlock();
+	status = nfs41_test_stateid(server, &stateid, cred);
+	trace_nfs4_test_delegation_stateid(state, NULL, status);
 
 	if (status != NFS_OK) {
 		/* Free the stateid unless the server explicitly
 		 * informs us the stateid is unrecognized. */
 		if (status != -NFS4ERR_BAD_STATEID)
-			nfs41_free_stateid(server, stateid, cred);
-		nfs_remove_bad_delegation(state->inode);
-
-		write_seqlock(&state->seqlock);
-		nfs4_stateid_copy(&state->stateid, &state->open_stateid);
-		write_sequnlock(&state->seqlock);
-		clear_bit(NFS_DELEGATED_STATE, &state->flags);
+			nfs41_free_stateid(server, &stateid, cred);
+		nfs_finish_clear_delegation_stateid(state);
 	}
 
-	if (cred != NULL)
-		put_rpccred(cred);
+	put_rpccred(cred);
 }
 
 /**
@@ -2214,7 +2206,7 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
 {
 	int status;
 
-	nfs41_clear_delegation_stateid(state);
+	nfs41_check_delegation_stateid(state);
 	status = nfs41_check_open_stateid(state);
 	if (status != NFS_OK)
 		status = nfs4_open_expired(sp, state);
-- 
cgit v1.2.3


From 869f9dfa4d6d57b79e0afc3af14772c2a023eeb1 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Mon, 10 Nov 2014 18:43:56 -0500
Subject: NFSv4: Fix races between nfs_remove_bad_delegation() and delegation
 return

Any attempt to call nfs_remove_bad_delegation() while a delegation is being
returned is currently a no-op. This means that we can end up looping
forever in nfs_end_delegation_return() if something causes the delegation
to be revoked.
This patch adds a mechanism whereby the state recovery code can communicate
to the delegation return code that the delegation is no longer valid and
that it should not be used when reclaiming state.
It also changes the return value for nfs4_handle_delegation_recall_error()
to ensure that nfs_end_delegation_return() does not reattempt the lock
reclaim before state recovery is done.

http://lkml.kernel.org/r/CAN-5tyHwG=Cn2Q9KsHWadewjpTTy_K26ee+UnSvHvG4192p-Xw@mail.gmail.com
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/delegation.c | 23 +++++++++++++++++++++--
 fs/nfs/delegation.h |  1 +
 fs/nfs/nfs4proc.c   |  2 +-
 3 files changed, 23 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5853f53db732..e5f473d13e24 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -193,7 +193,11 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
 {
 	int res = 0;
 
-	res = nfs4_proc_delegreturn(inode, delegation->cred, &delegation->stateid, issync);
+	if (!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+		res = nfs4_proc_delegreturn(inode,
+				delegation->cred,
+				&delegation->stateid,
+				issync);
 	nfs_free_delegation(delegation);
 	return res;
 }
@@ -380,11 +384,13 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
 {
 	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(inode);
-	int err;
+	int err = 0;
 
 	if (delegation == NULL)
 		return 0;
 	do {
+		if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+			break;
 		err = nfs_delegation_claim_opens(inode, &delegation->stateid);
 		if (!issync || err != -EAGAIN)
 			break;
@@ -605,10 +611,23 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl
 	rcu_read_unlock();
 }
 
+static void nfs_revoke_delegation(struct inode *inode)
+{
+	struct nfs_delegation *delegation;
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation != NULL) {
+		set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+		nfs_mark_return_delegation(NFS_SERVER(inode), delegation);
+	}
+	rcu_read_unlock();
+}
+
 void nfs_remove_bad_delegation(struct inode *inode)
 {
 	struct nfs_delegation *delegation;
 
+	nfs_revoke_delegation(inode);
 	delegation = nfs_inode_detach_delegation(inode);
 	if (delegation) {
 		nfs_inode_find_state_and_recover(inode, &delegation->stateid);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 5c1cce39297f..e3c20a3ccc93 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -31,6 +31,7 @@ enum {
 	NFS_DELEGATION_RETURN_IF_CLOSED,
 	NFS_DELEGATION_REFERENCED,
 	NFS_DELEGATION_RETURNING,
+	NFS_DELEGATION_REVOKED,
 };
 
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3b98fe752ef8..4b7166f4e1cf 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1654,7 +1654,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
 			nfs_inode_find_state_and_recover(state->inode,
 					stateid);
 			nfs4_schedule_stateid_recovery(server, state);
-			return 0;
+			return -EAGAIN;
 		case -NFS4ERR_DELAY:
 		case -NFS4ERR_GRACE:
 			set_bit(NFS_DELEGATED_STATE, &state->flags);
-- 
cgit v1.2.3


From c606bb8857921d3ecf4d353942d6cc7e116cc75a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 17 Oct 2014 15:15:13 +0300
Subject: NFSv4: Ensure that we call FREE_STATEID when NFSv4.x stateids are
 revoked

NFSv4.x (x>0) requires us to call TEST_STATEID+FREE_STATEID if a stateid is
revoked. We will currently fail to do this if the stateid is a delegation.

http://lkml.kernel.org/r/CAN-5tyHwG=Cn2Q9KsHWadewjpTTy_K26ee+UnSvHvG4192p-Xw@mail.gmail.com
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/filelayout/filelayout.c | 3 ---
 fs/nfs/nfs4proc.c              | 8 --------
 2 files changed, 11 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 46fab1cb455a..7afb52f6a25a 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -145,9 +145,6 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 	case -NFS4ERR_DELEG_REVOKED:
 	case -NFS4ERR_ADMIN_REVOKED:
 	case -NFS4ERR_BAD_STATEID:
-		if (state == NULL)
-			break;
-		nfs_remove_bad_delegation(state->inode);
 	case -NFS4ERR_OPENMODE:
 		if (state == NULL)
 			break;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4b7166f4e1cf..69dc20a743f9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -370,11 +370,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
-			if (inode != NULL && nfs4_have_delegation(inode, FMODE_READ)) {
-				nfs_remove_bad_delegation(inode);
-				exception->retry = 1;
-				break;
-			}
 			if (state == NULL)
 				break;
 			ret = nfs4_schedule_stateid_recovery(server, state);
@@ -4844,9 +4839,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_BAD_STATEID:
-			if (state == NULL)
-				break;
-			nfs_remove_bad_delegation(state->inode);
 		case -NFS4ERR_OPENMODE:
 			if (state == NULL)
 				break;
-- 
cgit v1.2.3


From f8ebf7a8ca35dde321f0cd385fee6f1950609367 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Fri, 17 Oct 2014 23:02:52 +0300
Subject: NFS: Don't try to reclaim delegation open state if recovery failed

If state recovery failed, then we should not attempt to reclaim delegated
state.

http://lkml.kernel.org/r/CAN-5tyHwG=Cn2Q9KsHWadewjpTTy_K26ee+UnSvHvG4192p-Xw@mail.gmail.com
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/delegation.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index e5f473d13e24..7f3f60641344 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -125,6 +125,8 @@ again:
 			continue;
 		if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
 			continue;
+		if (!nfs4_valid_open_stateid(state))
+			continue;
 		if (!nfs4_stateid_match(&state->stateid, stateid))
 			continue;
 		get_nfs_open_context(ctx);
-- 
cgit v1.2.3


From 3231300bb986947a6b74e7075d84a2f434e4d788 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Wed, 22 Oct 2014 17:13:26 -0700
Subject: ceph: fix flush tid comparision

TID of cap flush ack is 64 bits, but ceph_inode_info::flushing_cap_tid
is only 16 bits. 16 bits should be plenty to let the cap flush updates
pipeline appropriately, but we need to cast in the proper direction when
comparing these differently-sized versions. So downcast the 64-bits one
to 16 bits.

Reflects ceph.git commit a5184cf46a6e867287e24aeb731634828467cd98.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
Reviewed-by: Ilya Dryomov <idryomov@redhat.com>
---
 fs/ceph/caps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 659f2ea9e6f7..cefca661464b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2638,7 +2638,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 
 	for (i = 0; i < CEPH_CAP_BITS; i++)
 		if ((dirty & (1 << i)) &&
-		    flush_tid == ci->i_cap_flush_tid[i])
+		    (u16)flush_tid == ci->i_cap_flush_tid[i])
 			cleaned |= 1 << i;
 
 	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
-- 
cgit v1.2.3


From 8edc6e1688fc8f02c8c1f53a2ec4928cb1055f4d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 13 Nov 2014 15:19:33 -0800
Subject: fanotify: fix notification of groups with inode & mount marks

fsnotify() needs to merge inode and mount marks lists when notifying
groups about events so that ignore masks from inode marks are reflected
in mount mark notifications and groups are notified in proper order
(according to priorities).

Currently the sorting of the lists done by fsnotify_add_inode_mark() /
fsnotify_add_vfsmount_mark() and fsnotify() differed which resulted
ignore masks not being used in some cases.

Fix the problem by always using the same comparison function when
sorting / merging the mark lists.

Thanks to Heinrich Schuchardt for improvements of my patch.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=87721
Signed-off-by: Jan Kara <jack@suse.cz>
Reported-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
Tested-by: Heinrich Schuchardt <xypron.glpk@gmx.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/fsnotify.c      | 36 +++++++++++++++++++++---------------
 fs/notify/fsnotify.h      |  4 ++++
 fs/notify/inode_mark.c    |  8 +++-----
 fs/notify/mark.c          | 36 ++++++++++++++++++++++++++++++++++++
 fs/notify/vfsmount_mark.c |  8 +++-----
 5 files changed, 67 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 9d3e9c50066a..89326acd4561 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -229,8 +229,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 					      &fsnotify_mark_srcu);
 	}
 
+	/*
+	 * We need to merge inode & vfsmount mark lists so that inode mark
+	 * ignore masks are properly reflected for mount mark notifications.
+	 * That's why this traversal is so complicated...
+	 */
 	while (inode_node || vfsmount_node) {
-		inode_group = vfsmount_group = NULL;
+		inode_group = NULL;
+		inode_mark = NULL;
+		vfsmount_group = NULL;
+		vfsmount_mark = NULL;
 
 		if (inode_node) {
 			inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
@@ -244,21 +252,19 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 			vfsmount_group = vfsmount_mark->group;
 		}
 
-		if (inode_group > vfsmount_group) {
-			/* handle inode */
-			ret = send_to_group(to_tell, inode_mark, NULL, mask,
-					    data, data_is, cookie, file_name);
-			/* we didn't use the vfsmount_mark */
-			vfsmount_group = NULL;
-		} else if (vfsmount_group > inode_group) {
-			ret = send_to_group(to_tell, NULL, vfsmount_mark, mask,
-					    data, data_is, cookie, file_name);
-			inode_group = NULL;
-		} else {
-			ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
-					    mask, data, data_is, cookie,
-					    file_name);
+		if (inode_group && vfsmount_group) {
+			int cmp = fsnotify_compare_groups(inode_group,
+							  vfsmount_group);
+			if (cmp > 0) {
+				inode_group = NULL;
+				inode_mark = NULL;
+			} else if (cmp < 0) {
+				vfsmount_group = NULL;
+				vfsmount_mark = NULL;
+			}
 		}
+		ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask,
+				    data, data_is, cookie, file_name);
 
 		if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
 			goto out;
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 9c0898c4cfe1..3b68b0ae0a97 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -12,6 +12,10 @@ extern void fsnotify_flush_notify(struct fsnotify_group *group);
 /* protects reads of inode and vfsmount marks list */
 extern struct srcu_struct fsnotify_mark_srcu;
 
+/* compare two groups for sorting of marks lists */
+extern int fsnotify_compare_groups(struct fsnotify_group *a,
+				   struct fsnotify_group *b);
+
 extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
 						__u32 mask);
 /* add a mark to an inode */
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index e8497144b323..dfbf5447eea4 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -194,6 +194,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 {
 	struct fsnotify_mark *lmark, *last = NULL;
 	int ret = 0;
+	int cmp;
 
 	mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
 
@@ -219,11 +220,8 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 			goto out;
 		}
 
-		if (mark->group->priority < lmark->group->priority)
-			continue;
-
-		if ((mark->group->priority == lmark->group->priority) &&
-		    (mark->group < lmark->group))
+		cmp = fsnotify_compare_groups(lmark->group, mark->group);
+		if (cmp < 0)
 			continue;
 
 		hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d90deaa08e78..34c38fabf514 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -209,6 +209,42 @@ void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mas
 	mark->ignored_mask = mask;
 }
 
+/*
+ * Sorting function for lists of fsnotify marks.
+ *
+ * Fanotify supports different notification classes (reflected as priority of
+ * notification group). Events shall be passed to notification groups in
+ * decreasing priority order. To achieve this marks in notification lists for
+ * inodes and vfsmounts are sorted so that priorities of corresponding groups
+ * are descending.
+ *
+ * Furthermore correct handling of the ignore mask requires processing inode
+ * and vfsmount marks of each group together. Using the group address as
+ * further sort criterion provides a unique sorting order and thus we can
+ * merge inode and vfsmount lists of marks in linear time and find groups
+ * present in both lists.
+ *
+ * A return value of 1 signifies that b has priority over a.
+ * A return value of 0 signifies that the two marks have to be handled together.
+ * A return value of -1 signifies that a has priority over b.
+ */
+int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
+{
+	if (a == b)
+		return 0;
+	if (!a)
+		return 1;
+	if (!b)
+		return -1;
+	if (a->priority < b->priority)
+		return 1;
+	if (a->priority > b->priority)
+		return -1;
+	if (a < b)
+		return 1;
+	return -1;
+}
+
 /*
  * Attach an initialized mark to a given group and fs object.
  * These marks may be used for the fsnotify backend to determine which
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index ac851e8376b1..faefa72a11eb 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -153,6 +153,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 	struct mount *m = real_mount(mnt);
 	struct fsnotify_mark *lmark, *last = NULL;
 	int ret = 0;
+	int cmp;
 
 	mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
 
@@ -178,11 +179,8 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 			goto out;
 		}
 
-		if (mark->group->priority < lmark->group->priority)
-			continue;
-
-		if ((mark->group->priority == lmark->group->priority) &&
-		    (mark->group < lmark->group))
+		cmp = fsnotify_compare_groups(lmark->group, mark->group);
+		if (cmp < 0)
 			continue;
 
 		hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list);
-- 
cgit v1.2.3


From 4a7795d35e252f38298980530e01e21867f8f856 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zyan@redhat.com>
Date: Wed, 19 Nov 2014 15:50:34 +0800
Subject: vfs: fix reference leak in d_prune_aliases()

In "d_prune_alias(): just lock the parent and call __dentry_kill()" the old
dget + d_drop + dput has been replaced with lock_parent + __dentry_kill;
unfortunately, dput() does more than just killing dentry - it also drops the
reference to parent.  New variant leaks that reference and needs dput(parent)
after killing the child off.

Signed-off-by: Yan, Zheng <zyan@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 3ffef7f4e5cd..5bc72b07fde2 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -778,6 +778,7 @@ restart:
 			struct dentry *parent = lock_parent(dentry);
 			if (likely(!dentry->d_lockref.count)) {
 				__dentry_kill(dentry);
+				dput(parent);
 				goto restart;
 			}
 			if (parent)
-- 
cgit v1.2.3


From 7ca2f234404e738cc807ed87c43f9932513bc8c6 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 19 Nov 2014 15:11:24 +0100
Subject: isofs: avoid unused function warning

With the isofs_hash() function removed, isofs_hash_ms() is the only user
of isofs_hash_common(), but it's defined inside of an #ifdef, which triggers
this gcc warning in ARM axm55xx_defconfig starting with v3.18-rc3:

fs/isofs/inode.c:177:1: warning: 'isofs_hash_common' defined but not used [-Wunused-function]

This patch moves the function inside of the same #ifdef section to avoid that
warning, which seems the best compromise of a relatively harmless patch for
a late -rc.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: b0afd8e5db7b ("isofs: don't bother with ->d_op for normal case")
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/isofs/inode.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index fe839b915116..d67a16f2a45d 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -170,27 +170,6 @@ struct iso9660_options{
 	s32 sbsector;
 };
 
-/*
- * Compute the hash for the isofs name corresponding to the dentry.
- */
-static int
-isofs_hash_common(struct qstr *qstr, int ms)
-{
-	const char *name;
-	int len;
-
-	len = qstr->len;
-	name = qstr->name;
-	if (ms) {
-		while (len && name[len-1] == '.')
-			len--;
-	}
-
-	qstr->hash = full_name_hash(name, len);
-
-	return 0;
-}
-
 /*
  * Compute the hash for the isofs name corresponding to the dentry.
  */
@@ -263,6 +242,27 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
 }
 
 #ifdef CONFIG_JOLIET
+/*
+ * Compute the hash for the isofs name corresponding to the dentry.
+ */
+static int
+isofs_hash_common(struct qstr *qstr, int ms)
+{
+	const char *name;
+	int len;
+
+	len = qstr->len;
+	name = qstr->name;
+	if (ms) {
+		while (len && name[len-1] == '.')
+			len--;
+	}
+
+	qstr->hash = full_name_hash(name, len);
+
+	return 0;
+}
+
 static int
 isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
 {
-- 
cgit v1.2.3


From f82c458a2c3ffb94b431fc6ad791a79df1b3713e Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Wed, 19 Nov 2014 10:25:09 -0800
Subject: btrfs: fix lockups from btrfs_clear_path_blocking

The fair reader/writer locks mean that btrfs_clear_path_blocking needs
to strictly follow lock ordering rules even when we already have
blocking locks on a given path.

Before we can clear a blocking lock on the path, we need to make sure
all of the locks have been converted to blocking.  This will remove lock
inversions against anyone spinning in write_lock() against the buffers
we're trying to get read locks on.  These inversions didn't exist before
the fair read/writer locks, but now we need to be more careful.

We papered over this deadlock in the past by changing
btrfs_try_read_lock() to be a true trylock against both the spinlock and
the blocking lock.  This was slower, and not sufficient to fix all the
deadlocks.  This patch adds a btrfs_tree_read_lock_atomic(), which
basically means get the spinlock but trylock on the blocking lock.

Signed-off-by: Chris Mason <clm@fb.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Reported-by: Patrick Schmid <schmid@phys.ethz.ch>
cc: stable@vger.kernel.org #v3.15+
---
 fs/btrfs/ctree.c   | 14 ++------------
 fs/btrfs/locking.c | 24 +++++++++++++++++++++---
 fs/btrfs/locking.h |  2 ++
 3 files changed, 25 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 19bc6162fb8e..150822ee0a0b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -80,13 +80,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 {
 	int i;
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	/* lockdep really cares that we take all of these spinlocks
-	 * in the right order.  If any of the locks in the path are not
-	 * currently blocking, it is going to complain.  So, make really
-	 * really sure by forcing the path to blocking before we clear
-	 * the path blocking.
-	 */
 	if (held) {
 		btrfs_set_lock_blocking_rw(held, held_rw);
 		if (held_rw == BTRFS_WRITE_LOCK)
@@ -95,7 +88,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 			held_rw = BTRFS_READ_LOCK_BLOCKING;
 	}
 	btrfs_set_path_blocking(p);
-#endif
 
 	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
 		if (p->nodes[i] && p->locks[i]) {
@@ -107,10 +99,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 		}
 	}
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
 	if (held)
 		btrfs_clear_lock_blocking_rw(held, held_rw);
-#endif
 }
 
 /* this also releases the path */
@@ -2893,7 +2883,7 @@ cow_done:
 					}
 					p->locks[level] = BTRFS_WRITE_LOCK;
 				} else {
-					err = btrfs_try_tree_read_lock(b);
+					err = btrfs_tree_read_lock_atomic(b);
 					if (!err) {
 						btrfs_set_path_blocking(p);
 						btrfs_tree_read_lock(b);
@@ -3025,7 +3015,7 @@ again:
 			}
 
 			level = btrfs_header_level(b);
-			err = btrfs_try_tree_read_lock(b);
+			err = btrfs_tree_read_lock_atomic(b);
 			if (!err) {
 				btrfs_set_path_blocking(p);
 				btrfs_tree_read_lock(b);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 5665d2149249..f8229ef1b46d 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -127,6 +127,26 @@ again:
 	atomic_inc(&eb->spinning_readers);
 }
 
+/*
+ * take a spinning read lock.
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers
+ */
+int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
+{
+	if (atomic_read(&eb->blocking_writers))
+		return 0;
+
+	read_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_writers)) {
+		read_unlock(&eb->lock);
+		return 0;
+	}
+	atomic_inc(&eb->read_locks);
+	atomic_inc(&eb->spinning_readers);
+	return 1;
+}
+
 /*
  * returns 1 if we get the read lock and 0 if we don't
  * this won't wait for blocking writers
@@ -158,9 +178,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
 	    atomic_read(&eb->blocking_readers))
 		return 0;
 
-	if (!write_trylock(&eb->lock))
-		return 0;
-
+	write_lock(&eb->lock);
 	if (atomic_read(&eb->blocking_writers) ||
 	    atomic_read(&eb->blocking_readers)) {
 		write_unlock(&eb->lock);
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index b81e0e9a4894..c44a9d5f5362 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -35,6 +35,8 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
 void btrfs_assert_tree_locked(struct extent_buffer *eb);
 int btrfs_try_tree_read_lock(struct extent_buffer *eb);
 int btrfs_try_tree_write_lock(struct extent_buffer *eb);
+int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
+
 
 static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
 {
-- 
cgit v1.2.3


From ef94b1864d1ed5be54376404bb23d22ed0481feb Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 20 Nov 2014 16:39:59 +0100
Subject: ovl: rename filesystem type to "overlay"

Some distributions carry an "old" format of overlayfs while mainline has a
"new" format.

The distros will possibly want to keep the old overlayfs alongside the new
for compatibility reasons.

To make it possible to differentiate the two versions change the name of
the new one from "overlayfs" to "overlay".

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Reported-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Cc: Andy Whitcroft <apw@canonical.com>
---
 Documentation/filesystems/overlayfs.txt | 2 +-
 MAINTAINERS                             | 2 +-
 fs/Makefile                             | 2 +-
 fs/overlayfs/Kconfig                    | 2 +-
 fs/overlayfs/Makefile                   | 4 ++--
 fs/overlayfs/super.c                    | 6 +++---
 6 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt
index 530850a72735..a27c950ece61 100644
--- a/Documentation/filesystems/overlayfs.txt
+++ b/Documentation/filesystems/overlayfs.txt
@@ -64,7 +64,7 @@ is formed.
 At mount time, the two directories given as mount options "lowerdir" and
 "upperdir" are combined into a merged directory:
 
-  mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper,\
+  mount -t overlay overlay -olowerdir=/lower,upperdir=/upper,\
 workdir=/work /merged
 
 The "workdir" needs to be an empty directory on the same filesystem
diff --git a/MAINTAINERS b/MAINTAINERS
index c444907ccd69..af1e738e8772 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6888,7 +6888,7 @@ F:	drivers/scsi/osd/
 F:	include/scsi/osd_*
 F:	fs/exofs/
 
-OVERLAYFS FILESYSTEM
+OVERLAY FILESYSTEM
 M:	Miklos Szeredi <miklos@szeredi.hu>
 L:	linux-fsdevel@vger.kernel.org
 S:	Supported
diff --git a/fs/Makefile b/fs/Makefile
index 34a1b9dea6dd..da0bbb456d3f 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -104,7 +104,7 @@ obj-$(CONFIG_QNX6FS_FS)		+= qnx6/
 obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
 obj-$(CONFIG_ADFS_FS)		+= adfs/
 obj-$(CONFIG_FUSE_FS)		+= fuse/
-obj-$(CONFIG_OVERLAYFS_FS)	+= overlayfs/
+obj-$(CONFIG_OVERLAY_FS)	+= overlayfs/
 obj-$(CONFIG_UDF_FS)		+= udf/
 obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
 obj-$(CONFIG_OMFS_FS)		+= omfs/
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index e60125976873..34355818a2e0 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -1,4 +1,4 @@
-config OVERLAYFS_FS
+config OVERLAY_FS
 	tristate "Overlay filesystem support"
 	help
 	  An overlay filesystem combines two filesystems - an 'upper' filesystem
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile
index 8f91889480d0..900daed3e91d 100644
--- a/fs/overlayfs/Makefile
+++ b/fs/overlayfs/Makefile
@@ -2,6 +2,6 @@
 # Makefile for the overlay filesystem.
 #
 
-obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o
+obj-$(CONFIG_OVERLAY_FS) += overlay.o
 
-overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o
+overlay-objs := super.o inode.o dir.o readdir.o copy_up.o
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 08b704cebfc4..b92bd1829cf7 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -24,7 +24,7 @@ MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Overlay filesystem");
 MODULE_LICENSE("GPL");
 
-#define OVERLAYFS_SUPER_MAGIC 0x794c764f
+#define OVERLAYFS_SUPER_MAGIC 0x794c7630
 
 struct ovl_config {
 	char *lowerdir;
@@ -776,11 +776,11 @@ static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
 
 static struct file_system_type ovl_fs_type = {
 	.owner		= THIS_MODULE,
-	.name		= "overlayfs",
+	.name		= "overlay",
 	.mount		= ovl_mount,
 	.kill_sb	= kill_anon_super,
 };
-MODULE_ALIAS_FS("overlayfs");
+MODULE_ALIAS_FS("overlay");
 
 static int __init ovl_init(void)
 {
-- 
cgit v1.2.3


From a105d685a8483985a01776411de191a726b48132 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 20 Nov 2014 16:39:59 +0100
Subject: ovl: fix remove/copy-up race

ovl_remove_and_whiteout() needs to check if upper dentry exists or not
after having locked upper parent directory.

Previously we used a "type" value computed before locking the upper parent
directory, which is susceptible to racing with copy-up.

There's a similar check in ovl_check_empty_and_clear().  This one is not
actually racy, since copy-up doesn't change the "emptyness" property of a
directory.  Add a comment to this effect, and check the existence of upper
dentry locally to make the code cleaner.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/overlayfs/dir.c | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 15cd91ad9940..8ffc4b980f1b 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -284,8 +284,7 @@ out:
 	return ERR_PTR(err);
 }
 
-static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry,
-						enum ovl_path_type type)
+static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
 {
 	int err;
 	struct dentry *ret = NULL;
@@ -294,8 +293,17 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry,
 	err = ovl_check_empty_dir(dentry, &list);
 	if (err)
 		ret = ERR_PTR(err);
-	else if (type == OVL_PATH_MERGE)
-		ret = ovl_clear_empty(dentry, &list);
+	else {
+		/*
+		 * If no upperdentry then skip clearing whiteouts.
+		 *
+		 * Can race with copy-up, since we don't hold the upperdir
+		 * mutex.  Doesn't matter, since copy-up can't create a
+		 * non-empty directory from an empty one.
+		 */
+		if (ovl_dentry_upper(dentry))
+			ret = ovl_clear_empty(dentry, &list);
+	}
 
 	ovl_cache_free(&list);
 
@@ -487,8 +495,7 @@ out:
 	return err;
 }
 
-static int ovl_remove_and_whiteout(struct dentry *dentry,
-				   enum ovl_path_type type, bool is_dir)
+static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
 {
 	struct dentry *workdir = ovl_workdir(dentry);
 	struct inode *wdir = workdir->d_inode;
@@ -500,7 +507,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry,
 	int err;
 
 	if (is_dir) {
-		opaquedir = ovl_check_empty_and_clear(dentry, type);
+		opaquedir = ovl_check_empty_and_clear(dentry);
 		err = PTR_ERR(opaquedir);
 		if (IS_ERR(opaquedir))
 			goto out;
@@ -515,9 +522,10 @@ static int ovl_remove_and_whiteout(struct dentry *dentry,
 	if (IS_ERR(whiteout))
 		goto out_unlock;
 
-	if (type == OVL_PATH_LOWER) {
+	upper = ovl_dentry_upper(dentry);
+	if (!upper) {
 		upper = lookup_one_len(dentry->d_name.name, upperdir,
-					   dentry->d_name.len);
+				       dentry->d_name.len);
 		err = PTR_ERR(upper);
 		if (IS_ERR(upper))
 			goto kill_whiteout;
@@ -529,7 +537,6 @@ static int ovl_remove_and_whiteout(struct dentry *dentry,
 	} else {
 		int flags = 0;
 
-		upper = ovl_dentry_upper(dentry);
 		if (opaquedir)
 			upper = opaquedir;
 		err = -ESTALE;
@@ -648,7 +655,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
 		cap_raise(override_cred->cap_effective, CAP_CHOWN);
 		old_cred = override_creds(override_cred);
 
-		err = ovl_remove_and_whiteout(dentry, type, is_dir);
+		err = ovl_remove_and_whiteout(dentry, is_dir);
 
 		revert_creds(old_cred);
 		put_cred(override_cred);
@@ -781,7 +788,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
 	}
 
 	if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) {
-		opaquedir = ovl_check_empty_and_clear(new, new_type);
+		opaquedir = ovl_check_empty_and_clear(new);
 		err = PTR_ERR(opaquedir);
 		if (IS_ERR(opaquedir)) {
 			opaquedir = NULL;
-- 
cgit v1.2.3


From 521484639ec19a6f1ed56de6993feb255f5f676c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 20 Nov 2014 16:40:00 +0100
Subject: ovl: fix race in private xattr checks

Xattr operations can race with copy up.  This does not matter as long as
we consistently fiter out "trunsted.overlay.opaque" attribute on upper
directories.

Previously we checked parent against OVL_PATH_MERGE.  This is too general,
and prone to race with copy-up.  I.e. we found the parent to be on the
lower layer but ovl_dentry_real() would return the copied-up dentry,
possibly with the "opaque" attribute.

So instead use ovl_path_real() and decide to filter the attributes based on
the actual type of the dentry we'll use.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/overlayfs/inode.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index af2d18c9fcee..07d74b24913b 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -235,26 +235,36 @@ out:
 	return err;
 }
 
+static bool ovl_need_xattr_filter(struct dentry *dentry,
+				  enum ovl_path_type type)
+{
+	return type == OVL_PATH_UPPER && S_ISDIR(dentry->d_inode->i_mode);
+}
+
 ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
 		     void *value, size_t size)
 {
-	if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
-	    ovl_is_private_xattr(name))
+	struct path realpath;
+	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+
+	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
 		return -ENODATA;
 
-	return vfs_getxattr(ovl_dentry_real(dentry), name, value, size);
+	return vfs_getxattr(realpath.dentry, name, value, size);
 }
 
 ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
 {
+	struct path realpath;
+	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
 	ssize_t res;
 	int off;
 
-	res = vfs_listxattr(ovl_dentry_real(dentry), list, size);
+	res = vfs_listxattr(realpath.dentry, list, size);
 	if (res <= 0 || size == 0)
 		return res;
 
-	if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE)
+	if (!ovl_need_xattr_filter(dentry, type))
 		return res;
 
 	/* filter out private xattrs */
@@ -279,17 +289,16 @@ int ovl_removexattr(struct dentry *dentry, const char *name)
 {
 	int err;
 	struct path realpath;
-	enum ovl_path_type type;
+	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
 
 	err = ovl_want_write(dentry);
 	if (err)
 		goto out;
 
-	if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE &&
-	    ovl_is_private_xattr(name))
+	err = -ENODATA;
+	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
 		goto out_drop_write;
 
-	type = ovl_path_real(dentry, &realpath);
 	if (type == OVL_PATH_LOWER) {
 		err = vfs_getxattr(realpath.dentry, name, NULL, 0);
 		if (err < 0)
-- 
cgit v1.2.3


From 91c77947133f7aef851b625701e182d3f99d14a9 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 20 Nov 2014 16:40:00 +0100
Subject: ovl: allow filenames with comma

Allow option separator (comma) to be escaped with backslash.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/overlayfs/super.c | 48 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 45 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index b92bd1829cf7..eee7a62e1c0e 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -462,11 +462,34 @@ static const match_table_t ovl_tokens = {
 	{OPT_ERR,			NULL}
 };
 
+static char *ovl_next_opt(char **s)
+{
+	char *sbegin = *s;
+	char *p;
+
+	if (sbegin == NULL)
+		return NULL;
+
+	for (p = sbegin; *p; p++) {
+		if (*p == '\\') {
+			p++;
+			if (!*p)
+				break;
+		} else if (*p == ',') {
+			*p = '\0';
+			*s = p + 1;
+			return sbegin;
+		}
+	}
+	*s = NULL;
+	return sbegin;
+}
+
 static int ovl_parse_opt(char *opt, struct ovl_config *config)
 {
 	char *p;
 
-	while ((p = strsep(&opt, ",")) != NULL) {
+	while ((p = ovl_next_opt(&opt)) != NULL) {
 		int token;
 		substring_t args[MAX_OPT_ARGS];
 
@@ -554,15 +577,34 @@ out_dput:
 	goto out_unlock;
 }
 
+static void ovl_unescape(char *s)
+{
+	char *d = s;
+
+	for (;; s++, d++) {
+		if (*s == '\\')
+			s++;
+		*d = *s;
+		if (!*s)
+			break;
+	}
+}
+
 static int ovl_mount_dir(const char *name, struct path *path)
 {
 	int err;
+	char *tmp = kstrdup(name, GFP_KERNEL);
+
+	if (!tmp)
+		return -ENOMEM;
 
-	err = kern_path(name, LOOKUP_FOLLOW, path);
+	ovl_unescape(tmp);
+	err = kern_path(tmp, LOOKUP_FOLLOW, path);
 	if (err) {
-		pr_err("overlayfs: failed to resolve '%s': %i\n", name, err);
+		pr_err("overlayfs: failed to resolve '%s': %i\n", tmp, err);
 		err = -EINVAL;
 	}
+	kfree(tmp);
 	return err;
 }
 
-- 
cgit v1.2.3


From 71d509280f7e92eb60ae6b7c78c20afafff060c7 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 20 Nov 2014 16:40:01 +0100
Subject: ovl: use lockless_dereference() for upperdentry

Don't open code lockless_dereference() in ovl_upperdentry_dereference().

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/overlayfs/super.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index eee7a62e1c0e..f16d318b71f8 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -84,12 +84,7 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
 
 static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe)
 {
-	struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry);
-	/*
-	 * Make sure to order reads to upperdentry wrt ovl_dentry_update()
-	 */
-	smp_read_barrier_depends();
-	return upperdentry;
+	return lockless_dereference(oe->__upperdentry);
 }
 
 void ovl_path_upper(struct dentry *dentry, struct path *path)
-- 
cgit v1.2.3


From c9f00fdb9ab3999cb2fb582ad82a5db9e70c82f5 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 20 Nov 2014 16:40:01 +0100
Subject: ovl: pass dentry into ovl_dir_read_merged()

Pass dentry into ovl_dir_read_merged() insted of upperpath and lowerpath.
This cleans up callers and paves the way for multi-layer directory reads.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/overlayfs/readdir.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 2a7ef4f8e2a6..7299e962f334 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -274,11 +274,11 @@ static int ovl_dir_mark_whiteouts(struct dentry *dir,
 	return 0;
 }
 
-static inline int ovl_dir_read_merged(struct path *upperpath,
-				      struct path *lowerpath,
-				      struct list_head *list)
+static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
 {
 	int err;
+	struct path lowerpath;
+	struct path upperpath;
 	struct ovl_readdir_data rdd = {
 		.ctx.actor = ovl_fill_merge,
 		.list = list,
@@ -286,25 +286,28 @@ static inline int ovl_dir_read_merged(struct path *upperpath,
 		.is_merge = false,
 	};
 
-	if (upperpath->dentry) {
-		err = ovl_dir_read(upperpath, &rdd);
+	ovl_path_lower(dentry, &lowerpath);
+	ovl_path_upper(dentry, &upperpath);
+
+	if (upperpath.dentry) {
+		err = ovl_dir_read(&upperpath, &rdd);
 		if (err)
 			goto out;
 
-		if (lowerpath->dentry) {
-			err = ovl_dir_mark_whiteouts(upperpath->dentry, &rdd);
+		if (lowerpath.dentry) {
+			err = ovl_dir_mark_whiteouts(upperpath.dentry, &rdd);
 			if (err)
 				goto out;
 		}
 	}
-	if (lowerpath->dentry) {
+	if (lowerpath.dentry) {
 		/*
 		 * Insert lowerpath entries before upperpath ones, this allows
 		 * offsets to be reasonably constant
 		 */
 		list_add(&rdd.middle, rdd.list);
 		rdd.is_merge = true;
-		err = ovl_dir_read(lowerpath, &rdd);
+		err = ovl_dir_read(&lowerpath, &rdd);
 		list_del(&rdd.middle);
 	}
 out:
@@ -329,8 +332,6 @@ static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
 static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
 {
 	int res;
-	struct path lowerpath;
-	struct path upperpath;
 	struct ovl_dir_cache *cache;
 
 	cache = ovl_dir_cache(dentry);
@@ -347,10 +348,7 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
 	cache->refcount = 1;
 	INIT_LIST_HEAD(&cache->entries);
 
-	ovl_path_lower(dentry, &lowerpath);
-	ovl_path_upper(dentry, &upperpath);
-
-	res = ovl_dir_read_merged(&upperpath, &lowerpath, &cache->entries);
+	res = ovl_dir_read_merged(dentry, &cache->entries);
 	if (res) {
 		ovl_cache_free(&cache->entries);
 		kfree(cache);
@@ -538,14 +536,9 @@ const struct file_operations ovl_dir_operations = {
 int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
 {
 	int err;
-	struct path lowerpath;
-	struct path upperpath;
 	struct ovl_cache_entry *p;
 
-	ovl_path_upper(dentry, &upperpath);
-	ovl_path_lower(dentry, &lowerpath);
-
-	err = ovl_dir_read_merged(&upperpath, &lowerpath, list);
+	err = ovl_dir_read_merged(dentry, list);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 7676895f4736421ebafc48de5078e25ea69e88ee Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Thu, 20 Nov 2014 16:40:02 +0100
Subject: ovl: ovl_dir_fsync() cleanup

Check against !OVL_PATH_LOWER instead of OVL_PATH_MERGE.  For a copied up
directory the two are currently equivalent.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/overlayfs/readdir.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 7299e962f334..ab1e3dcbed95 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -450,10 +450,10 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
 	/*
 	 * Need to check if we started out being a lower dir, but got copied up
 	 */
-	if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) {
+	if (!od->is_upper && ovl_path_type(dentry) != OVL_PATH_LOWER) {
 		struct inode *inode = file_inode(file);
 
-		realfile =lockless_dereference(od->upperfile);
+		realfile = lockless_dereference(od->upperfile);
 		if (!realfile) {
 			struct path upperpath;
 
-- 
cgit v1.2.3