From 8af3dcd3c89aef10375bdd79d5f336b22b57487c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 23 Sep 2014 15:57:59 +1000
Subject: xfs: xlog_cil_force_lsn doesn't always wait correctly

When running a tight mount/unmount loop on an older kernel, RedHat
QE found that unmount would occasionally hang in
xfs_buf_unpin_wait() on the superblock buffer. Tracing and other
debug work by Eric Sandeen indicated that it was hanging on the
writing of the superblock during unmount immediately after logging
the superblock counters in a synchronous transaction. Further debug
indicated that the synchronous transaction was not waiting for
completion correctly, and we narrowed it down to
xlog_cil_force_lsn() returning NULLCOMMITLSN and hence not pushing
the transaction in the iclog buffer to disk correctly.

While this unmount superblock write code is now very different in
mainline kernels, the xlog_cil_force_lsn() code is identical, and it
was bisected to the backport of commit f876e44 ("xfs: always do log
forces via the workqueue"). This commit made the CIL push
asynchronous for log forces and hence exposed a race condition that
couldn't occur on a synchronous push.

Essentially, the xlog_cil_force_lsn() relied implicitly on the fact
that the sequence push would be complete by the time
xlog_cil_push_now() returned, resulting in the context being pushed
being in the committing list. When it was made asynchronous, it was
recognised that there was a race condition in detecting whether an
asynchronous push has started or not and code was added to handle
it.

Unfortunately, the fix was not quite right and left a race condition
where it it would detect an empty CIL while a push was in progress
before the context had been added to the committing list. This was
incorrectly seen as a "nothing to do" condition and so would tell
xfs_log_force_lsn() that there is nothing to wait for, and hence it
would push the iclogbufs in memory.

The fix is simple, but explaining the logic and the race condition
is a lot more complex. The fix is to add the context to the
committing list before we start emptying the CIL. This allows us to
detect the difference between an empty "do nothing" push and a push
that has not started by adding a discrete "emptying the CIL" state
to avoid the transient, incorrect "empty" condition that the
(unchanged) waiting code was seeing.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_log_cil.c | 47 ++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 9 deletions(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index f6b79e5325dd..f506c457011e 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -463,12 +463,40 @@ xlog_cil_push(
 		spin_unlock(&cil->xc_push_lock);
 		goto out_skip;
 	}
-	spin_unlock(&cil->xc_push_lock);
 
 
 	/* check for a previously pushed seqeunce */
-	if (push_seq < cil->xc_ctx->sequence)
+	if (push_seq < cil->xc_ctx->sequence) {
+		spin_unlock(&cil->xc_push_lock);
 		goto out_skip;
+	}
+
+	/*
+	 * We are now going to push this context, so add it to the committing
+	 * list before we do anything else. This ensures that anyone waiting on
+	 * this push can easily detect the difference between a "push in
+	 * progress" and "CIL is empty, nothing to do".
+	 *
+	 * IOWs, a wait loop can now check for:
+	 *	the current sequence not being found on the committing list;
+	 *	an empty CIL; and
+	 *	an unchanged sequence number
+	 * to detect a push that had nothing to do and therefore does not need
+	 * waiting on. If the CIL is not empty, we get put on the committing
+	 * list before emptying the CIL and bumping the sequence number. Hence
+	 * an empty CIL and an unchanged sequence number means we jumped out
+	 * above after doing nothing.
+	 *
+	 * Hence the waiter will either find the commit sequence on the
+	 * committing list or the sequence number will be unchanged and the CIL
+	 * still dirty. In that latter case, the push has not yet started, and
+	 * so the waiter will have to continue trying to check the CIL
+	 * committing list until it is found. In extreme cases of delay, the
+	 * sequence may fully commit between the attempts the wait makes to wait
+	 * on the commit sequence.
+	 */
+	list_add(&ctx->committing, &cil->xc_committing);
+	spin_unlock(&cil->xc_push_lock);
 
 	/*
 	 * pull all the log vectors off the items in the CIL, and
@@ -532,7 +560,6 @@ xlog_cil_push(
 	 */
 	spin_lock(&cil->xc_push_lock);
 	cil->xc_current_sequence = new_ctx->sequence;
-	list_add(&ctx->committing, &cil->xc_committing);
 	spin_unlock(&cil->xc_push_lock);
 	up_write(&cil->xc_ctx_lock);
 
@@ -855,13 +882,15 @@ restart:
 	 * Hence by the time we have got here it our sequence may not have been
 	 * pushed yet. This is true if the current sequence still matches the
 	 * push sequence after the above wait loop and the CIL still contains
-	 * dirty objects.
+	 * dirty objects. This is guaranteed by the push code first adding the
+	 * context to the committing list before emptying the CIL.
 	 *
-	 * When the push occurs, it will empty the CIL and atomically increment
-	 * the currect sequence past the push sequence and move it into the
-	 * committing list. Of course, if the CIL is clean at the time of the
-	 * push, it won't have pushed the CIL at all, so in that case we should
-	 * try the push for this sequence again from the start just in case.
+	 * Hence if we don't find the context in the committing list and the
+	 * current sequence number is unchanged then the CIL contents are
+	 * significant.  If the CIL is empty, if means there was nothing to push
+	 * and that means there is nothing to wait for. If the CIL is not empty,
+	 * it means we haven't yet started the push, because if it had started
+	 * we would have found the context on the committing list.
 	 */
 	if (sequence == cil->xc_current_sequence &&
 	    !list_empty(&cil->xc_cil)) {
-- 
cgit v1.2.3


From fb040131561a6b34cefb92cdf598218104abeda0 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 23 Sep 2014 16:05:32 +1000
Subject: xfs: don't ASSERT on corrupt ftype

xfs_dir3_data_get_ftype() gets the file type off disk, but ASSERTs
if it's invalid:

     ASSERT(type < XFS_DIR3_FT_MAX);

We shouldn't ASSERT on bad values read from disk.  V3 dirs are
CRC-protected, but V2 dirs + ftype are not.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/libxfs/xfs_da_format.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index c9aee52a37e2..7e42fdfd2f1d 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -270,7 +270,6 @@ xfs_dir3_data_get_ftype(
 {
 	__uint8_t	ftype = dep->name[dep->namelen];
 
-	ASSERT(ftype < XFS_DIR3_FT_MAX);
 	if (ftype >= XFS_DIR3_FT_MAX)
 		return XFS_DIR3_FT_UNKNOWN;
 	return ftype;
-- 
cgit v1.2.3


From e3cf17962a757e59fed2cbcbda6373c1b35a35dd Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Tue, 23 Sep 2014 16:05:55 +1000
Subject: xfs: remove second xfs_quota.h inclusion in xfs_icache.c

xfs_quota.h was included twice.

Signed-off-by: Fabian Frederick <fabf@skynet.be>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_icache.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 981b2cf51985..b45f7b27b5df 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -33,7 +33,6 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_bmap_util.h"
-#include "xfs_quota.h"
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
 
-- 
cgit v1.2.3


From ea95961df714f7fc446aa4bedfc61510ed1b59cc Mon Sep 17 00:00:00 2001
From: Fengguang Wu <fengguang.wu@intel.com>
Date: Tue, 23 Sep 2014 16:11:43 +1000
Subject: xfs: xfs_rtget_summary can be static

Fix sparse warning introduced by commit afabfd3 ("xfs: combine
xfs_rtmodify_summary and xfs_rtget_summary").

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_rtalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index d1160cce5dad..d45aebe04dde 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -46,7 +46,7 @@
  * Keeps track of a current summary block, so we don't keep reading
  * it from the buffer cache.
  */
-int
+static int
 xfs_rtget_summary(
 	xfs_mount_t	*mp,		/* file system mount structure */
 	xfs_trans_t	*tp,		/* transaction pointer */
-- 
cgit v1.2.3


From 02cc18764c753befcdc163d1bc668a6599a54585 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Tue, 23 Sep 2014 16:15:45 +1000
Subject: xfs: xfs_buf_write_fail_rl_state can be static

Fix sparse warning introduced by commit ac8809f9 ("xfs: abort
metadata writeback on permanent errors").

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_buf_item.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 76007deed31f..30fa5db9aea8 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -501,7 +501,7 @@ xfs_buf_item_unpin(
  * buffer being bad..
  */
 
-DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
+static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
 
 STATIC uint
 xfs_buf_item_push(
-- 
cgit v1.2.3


From 7abbb8f928e5b7cea1edd077131b2ace665c6712 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Tue, 23 Sep 2014 16:20:11 +1000
Subject: xfs: xfs_swap_extent_flush can be static

Fix sparse warning introduced by commit 4ef897a ("xfs: flush both
inodes in xfs_swap_extents").

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_bmap_util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 2f1e30d39a35..1cb345e7c181 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1618,7 +1618,7 @@ xfs_swap_extents_check_format(
 	return 0;
 }
 
-int
+static int
 xfs_swap_extent_flush(
 	struct xfs_inode	*ip)
 {
-- 
cgit v1.2.3


From 2ebff7bbd785c86e12956388b9e6f6bb8ea5d21e Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 23 Sep 2014 22:55:00 +1000
Subject: xfs: flush entire last page of old EOF on truncate up

On a sub-page sized filesystem, truncating a mapped region down
leaves us in a world of hurt. We truncate the pagecache, zeroing the
newly unused tail, then punch blocks out from under the page. If we
then truncate the file back up immediately, we expose that unmapped
hole to a dirty page mapped into the user application, and that's
where it all goes wrong.

In truncating the page cache, we avoid unmapping the tail page of
the cache because it still contains valid data. The problem is that
it also contains a hole after the truncate, but nobody told the mm
subsystem that. Therefore, if the page is dirty before the truncate,
we'll never get a .page_mkwrite callout after we extend the file and
the application writes data into the hole on the page.  Hence when
we come to writing that region of the page, it has no blocks and no
delayed allocation reservation and hence we toss the data away.

This patch adds code to the truncate up case to solve it, by
ensuring the partial page at the old EOF is always cleaned after we
do any zeroing and move the EOF upwards. We can't actually serialise
the page writeback and truncate against page faults (yes, that
problem AGAIN) so this is really just a best effort and assumes it
is extremely unlikely that someone is concurrently writing to the
page at the EOF while extending the file.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/xfs/xfs_iops.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'fs/xfs')

diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 72129493e9d3..ec6dcdc181ee 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -849,6 +849,36 @@ xfs_setattr_size(
 		return error;
 	truncate_setsize(inode, newsize);
 
+	/*
+	 * The "we can't serialise against page faults" pain gets worse.
+	 *
+	 * If the file is mapped then we have to clean the page at the old EOF
+	 * when extending the file. Extending the file can expose changes the
+	 * underlying page mapping (e.g. from beyond EOF to a hole or
+	 * unwritten), and so on the next attempt to write to that page we need
+	 * to remap it for write. i.e. we need .page_mkwrite() to be called.
+	 * Hence we need to clean the page to clean the pte and so a new write
+	 * fault will be triggered appropriately.
+	 *
+	 * If we do it before we change the inode size, then we can race with a
+	 * page fault that maps the page with exactly the same problem. If we do
+	 * it after we change the file size, then a new page fault can come in
+	 * and allocate space before we've run the rest of the truncate
+	 * transaction. That's kinda grotesque, but it's better than have data
+	 * over a hole, and so that's the lesser evil that has been chosen here.
+	 *
+	 * The real solution, however, is to have some mechanism for locking out
+	 * page faults while a truncate is in progress.
+	 */
+	if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) {
+		error = filemap_write_and_wait_range(
+				VFS_I(ip)->i_mapping,
+				round_down(oldsize, PAGE_CACHE_SIZE),
+				round_up(oldsize, PAGE_CACHE_SIZE) - 1);
+		if (error)
+			return error;
+	}
+
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
 	if (error)
-- 
cgit v1.2.3