From a6bbce54efa9145dbcf3029c885549f7ebc40a3b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 29 Oct 2014 08:22:18 +1100 Subject: xfs: bulkstat doesn't release AGI buffer on error The recent refactoring of the bulkstat code left a small landmine in the code. If a inobt read fails, then the tree walk is aborted and returns without releasing the AGI buffer or freeing the cursor. This can lead to a subsequent bulkstat call hanging trying to grab the AGI buffer again. cc: Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f1deb961a296..ef8ea0589780 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -427,7 +427,7 @@ xfs_bulkstat( error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r); if (error) - break; + goto del_cursor; if (icount) { irbp->ir_startino = r.ir_startino; irbp->ir_freecount = r.ir_freecount; @@ -442,7 +442,7 @@ xfs_bulkstat( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); } if (error) - break; + goto del_cursor; /* * Loop through inode btree records in this ag, @@ -454,7 +454,7 @@ xfs_bulkstat( error = xfs_inobt_get_rec(cur, &r, &i); if (error || i == 0) { end_of_ag = 1; - break; + goto del_cursor; } /* @@ -476,13 +476,17 @@ xfs_bulkstat( error = xfs_btree_increment(cur, 0, &tmp); cond_resched(); } + /* - * Drop the btree buffers and the agi buffer. - * We can't hold any of the locks these represent - * when calling iget. + * Drop the btree buffers and the agi buffer as we can't hold any + * of the locks these represent when calling iget. If there is a + * pending error, then we are done. */ +del_cursor: xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); xfs_buf_relse(agbp); + if (error) + break; /* * Now format all the good inodes into the user's buffer. */ -- cgit v1.2.3 From 7a19dee116c8fae7ba7a778043c245194289f5a2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 30 Oct 2014 10:34:52 +1100 Subject: xfs: Check error during inode btree iteration in xfs_bulkstat() xfs_bulkstat() doesn't check error return from xfs_btree_increment(). In case of specific fs corruption that could result in xfs_bulkstat() entering an infinite loop because we would be looping over the same chunk over and over again. Fix the problem by checking the return value and terminating the loop properly. Coverity-id: 1231338 cc: Signed-off-by: Jan Kara Reviewed-by: Jie Liu Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index ef8ea0589780..7765ff743e91 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -474,6 +474,10 @@ xfs_bulkstat( */ agino = r.ir_startino + XFS_INODES_PER_CHUNK; error = xfs_btree_increment(cur, 0, &tmp); + if (error) { + end_of_ag = 1; + goto del_cursor; + } cond_resched(); } -- cgit v1.2.3 From f55fefd1a5a339b1bd08c120b93312d6eb64a9fb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 30 Oct 2014 10:35:00 +1100 Subject: mm: Remove false WARN_ON from pagecache_isize_extended() The WARN_ON checking whether i_mutex is held in pagecache_isize_extended() was wrong because some filesystems (e.g. XFS) use different locks for serialization of truncates / writes. So just remove the check. Signed-off-by: Jan Kara Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- mm/truncate.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/truncate.c b/mm/truncate.c index 261eaf6e5a19..c646084e5eec 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -755,7 +755,6 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) struct page *page; pgoff_t index; - WARN_ON(!mutex_is_locked(&inode->i_mutex)); WARN_ON(to > inode->i_size); if (from >= to || bsize == PAGE_CACHE_SIZE) -- cgit v1.2.3 From 5d11fb4b9a1d90983452c029b5e1377af78fda49 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 30 Oct 2014 10:35:11 +1100 Subject: xfs: rework zero range to prevent invalid i_size updates The zero range operation is analogous to fallocate with the exception of converting the range to zeroes. E.g., it attempts to allocate zeroed blocks over the range specified by the caller. The XFS implementation kills all delalloc blocks currently over the aligned range, converts the range to allocated zero blocks (unwritten extents) and handles the partial pages at the ends of the range by sending writes through the pagecache. The current implementation suffers from several problems associated with inode size. If the aligned range covers an extending I/O, said I/O is discarded and an inode size update from a previous write never makes it to disk. Further, if an unaligned zero range extends beyond eof, the page write induced for the partial end page can itself increase the inode size, even if the zero range request is not supposed to update i_size (via KEEP_SIZE, similar to an fallocate beyond EOF). The latter behavior not only incorrectly increases the inode size, but can lead to stray delalloc blocks on the inode. Typically, post-eof preallocation blocks are either truncated on release or inode eviction or explicitly written to by xfs_zero_eof() on natural file size extension. If the inode size increases due to zero range, however, associated blocks leak into the address space having never been converted or mapped to pagecache pages. A direct I/O to such an uncovered range cannot convert the extent via writeback and will BUG(). For example: $ xfs_io -fc "pwrite 0 128k" -c "fzero -k 1m 54321" ... $ xfs_io -d -c "pread 128k 128k" If the entire delalloc extent happens to not have page coverage whatsoever (e.g., delalloc conversion couldn't find a large enough free space extent), even a full file writeback won't convert what's left of the extent and we'll assert on inode eviction. Rework xfs_zero_file_space() to avoid buffered I/O for partial pages. Use the existing hole punch and prealloc mechanisms as primitives for zero range. This implementation is not efficient nor ideal as we writeback dirty data over the range and remove existing extents rather than convert to unwrittern. The former writeback, however, is currently the only mechanism available to ensure consistency between pagecache and extent state. Even a pagecache truncate/delalloc punch prior to hole punch has lead to inconsistencies due to racing with writeback. This provides a consistent, correct implementation of zero range that survives fsstress/fsx testing without assert failures. The implementation can be optimized from this point forward once the fundamental issue of pagecache and delalloc extent state consistency is addressed. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 72 ++++++++++++++------------------------------------ 1 file changed, 20 insertions(+), 52 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 92e8f99a5857..281002689d64 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1338,7 +1338,10 @@ xfs_free_file_space( goto out; } - +/* + * Preallocate and zero a range of a file. This mechanism has the allocation + * semantics of fallocate and in addition converts data in the range to zeroes. + */ int xfs_zero_file_space( struct xfs_inode *ip, @@ -1346,65 +1349,30 @@ xfs_zero_file_space( xfs_off_t len) { struct xfs_mount *mp = ip->i_mount; - uint granularity; - xfs_off_t start_boundary; - xfs_off_t end_boundary; + uint blksize; int error; trace_xfs_zero_file_space(ip); - granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + blksize = 1 << mp->m_sb.sb_blocklog; /* - * Round the range of extents we are going to convert inwards. If the - * offset is aligned, then it doesn't get changed so we zero from the - * start of the block offset points to. + * Punch a hole and prealloc the range. We use hole punch rather than + * unwritten extent conversion for two reasons: + * + * 1.) Hole punch handles partial block zeroing for us. + * + * 2.) If prealloc returns ENOSPC, the file range is still zero-valued + * by virtue of the hole punch. */ - start_boundary = round_up(offset, granularity); - end_boundary = round_down(offset + len, granularity); - - ASSERT(start_boundary >= offset); - ASSERT(end_boundary <= offset + len); - - if (start_boundary < end_boundary - 1) { - /* - * Writeback the range to ensure any inode size updates due to - * appending writes make it to disk (otherwise we could just - * punch out the delalloc blocks). - */ - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - start_boundary, end_boundary - 1); - if (error) - goto out; - truncate_pagecache_range(VFS_I(ip), start_boundary, - end_boundary - 1); - - /* convert the blocks */ - error = xfs_alloc_file_space(ip, start_boundary, - end_boundary - start_boundary - 1, - XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT); - if (error) - goto out; - - /* We've handled the interior of the range, now for the edges */ - if (start_boundary != offset) { - error = xfs_iozero(ip, offset, start_boundary - offset); - if (error) - goto out; - } - - if (end_boundary != offset + len) - error = xfs_iozero(ip, end_boundary, - offset + len - end_boundary); - - } else { - /* - * It's either a sub-granularity range or the range spanned lies - * partially across two adjacent blocks. - */ - error = xfs_iozero(ip, offset, len); - } + error = xfs_free_file_space(ip, offset, len); + if (error) + goto out; + error = xfs_alloc_file_space(ip, round_down(offset, blksize), + round_up(offset + len, blksize) - + round_down(offset, blksize), + XFS_BMAPI_PREALLOC); out: return error; -- cgit v1.2.3 From 77783d06427293b2d711c45cfd4abc6494a1af9c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 7 Nov 2014 08:29:25 +1100 Subject: mm: Fix comment before truncate_setsize() XFS doesn't always hold i_mutex when calling truncate_setsize() and it uses a different lock to serialize truncates and writes. So fix the comment before truncate_setsize(). Reported-by: Jan Beulich Signed-off-by: Jan Kara Signed-off-by: Dave Chinner --- mm/truncate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index c646084e5eec..f1e4d6052369 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -715,8 +715,9 @@ EXPORT_SYMBOL(truncate_pagecache); * necessary) to @newsize. It will be typically be called from the filesystem's * setattr function when ATTR_SIZE is passed in. * - * Must be called with inode_mutex held and before all filesystem specific - * block truncation has been performed. + * Must be called with a lock serializing truncates and writes (generally + * i_mutex but e.g. xfs uses a different lock) and before all filesystem + * specific block truncation has been performed. */ void truncate_setsize(struct inode *inode, loff_t newsize) { -- cgit v1.2.3 From afa947cb52a8e73fe71915a0b0af6fcf98dfbe1a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 7 Nov 2014 08:29:57 +1100 Subject: xfs: bulkstat btree walk doesn't terminate The bulkstat code has several different ways of detecting the end of an AG when doing a walk. They are not consistently detected, and the code that checks for the end of AG conditions is not consistently coded. Hence the are conditions where the walk code can get stuck in an endless loop making no progress and not triggering any termination conditions. Convert all the "tmp/i" status return codes from btree operations to a common name (stat) and apply end-of-ag detection to these operations consistently. cc: # 3.17 Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 7765ff743e91..16737cbbee17 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -356,7 +356,6 @@ xfs_bulkstat( int end_of_ag; /* set if we've seen the ag end */ int error; /* error code */ int fmterror;/* bulkstat formatter result */ - int i; /* loop index */ int icount; /* count of inodes good in irbuf */ size_t irbsize; /* size of irec buffer in bytes */ xfs_ino_t ino; /* inode number (filesystem) */ @@ -366,11 +365,11 @@ xfs_bulkstat( xfs_ino_t lastino; /* last inode number returned */ int nirbuf; /* size of irbuf */ int rval; /* return value error code */ - int tmp; /* result value from btree calls */ int ubcount; /* size of user's buffer */ int ubleft; /* bytes left in user's buffer */ char __user *ubufp; /* pointer into user's buffer */ int ubelem; /* spaces used in user's buffer */ + int stat; /* * Get the last inode value, see if there's nothing to do. @@ -436,13 +435,15 @@ xfs_bulkstat( agino = r.ir_startino + XFS_INODES_PER_CHUNK; } /* Increment to the next record */ - error = xfs_btree_increment(cur, 0, &tmp); + error = xfs_btree_increment(cur, 0, &stat); } else { /* Start of ag. Lookup the first inode chunk */ - error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat); } - if (error) + if (error || stat == 0) { + end_of_ag = 1; goto del_cursor; + } /* * Loop through inode btree records in this ag, @@ -451,8 +452,8 @@ xfs_bulkstat( while (irbp < irbufend && icount < ubcount) { struct xfs_inobt_rec_incore r; - error = xfs_inobt_get_rec(cur, &r, &i); - if (error || i == 0) { + error = xfs_inobt_get_rec(cur, &r, &stat); + if (error || stat == 0) { end_of_ag = 1; goto del_cursor; } @@ -473,8 +474,8 @@ xfs_bulkstat( * Set agino to after this chunk and bump the cursor. */ agino = r.ir_startino + XFS_INODES_PER_CHUNK; - error = xfs_btree_increment(cur, 0, &tmp); - if (error) { + error = xfs_btree_increment(cur, 0, &stat); + if (error || stat == 0) { end_of_ag = 1; goto del_cursor; } -- cgit v1.2.3 From bf4a5af20d25ecc8876978ad34b8db83b4235f3c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 7 Nov 2014 08:30:30 +1100 Subject: xfs: bulkstat chunk formatting cursor is broken The xfs_bulkstat_agichunk formatting cursor takes buffer values from the main loop and passes them via the structure to the chunk formatter, and the writes the changed values back into the main loop local variables. Unfortunately, this complex dance is full of corner cases that aren't handled correctly. The biggest problem is that it is double handling the information in both the main loop and the chunk formatting function, leading to inconsistent updates and endless loops where progress is not made. To fix this, push the struct xfs_bulkstat_agichunk outwards to be the primary holder of user buffer information. this removes the double handling in the main loop. Also, pass the last inode processed by the chunk formatter as a separate parameter as it purely an output variable and is not related to the user buffer consumption cursor. Finally, the chunk formatting code is not shared by anyone, so make it local to xfs_itable.c. cc: # 3.17 Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 59 +++++++++++++++++++++++++---------------------------- fs/xfs/xfs_itable.h | 16 --------------- 2 files changed, 28 insertions(+), 47 deletions(-) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 16737cbbee17..50a3e5995dd9 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -262,20 +262,26 @@ xfs_bulkstat_grab_ichunk( #define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) +struct xfs_bulkstat_agichunk { + char __user **ac_ubuffer;/* pointer into user's buffer */ + int ac_ubleft; /* bytes left in user's buffer */ + int ac_ubelem; /* spaces used in user's buffer */ +}; + /* * Process inodes in chunk with a pointer to a formatter function * that will iget the inode and fill in the appropriate structure. */ -int +static int xfs_bulkstat_ag_ichunk( struct xfs_mount *mp, xfs_agnumber_t agno, struct xfs_inobt_rec_incore *irbp, bulkstat_one_pf formatter, size_t statstruct_size, - struct xfs_bulkstat_agichunk *acp) + struct xfs_bulkstat_agichunk *acp, + xfs_ino_t *lastino) { - xfs_ino_t lastino = acp->ac_lastino; char __user **ubufp = acp->ac_ubuffer; int ubleft = acp->ac_ubleft; int ubelem = acp->ac_ubelem; @@ -295,7 +301,7 @@ xfs_bulkstat_ag_ichunk( /* Skip if this inode is free */ if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { - lastino = ino; + *lastino = ino; continue; } @@ -313,7 +319,7 @@ xfs_bulkstat_ag_ichunk( ubleft = 0; break; } - lastino = ino; + *lastino = ino; continue; } if (fmterror == BULKSTAT_RV_GIVEUP) { @@ -325,10 +331,9 @@ xfs_bulkstat_ag_ichunk( *ubufp += ubused; ubleft -= ubused; ubelem++; - lastino = ino; + *lastino = ino; } - acp->ac_lastino = lastino; acp->ac_ubleft = ubleft; acp->ac_ubelem = ubelem; @@ -355,7 +360,6 @@ xfs_bulkstat( xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ int end_of_ag; /* set if we've seen the ag end */ int error; /* error code */ - int fmterror;/* bulkstat formatter result */ int icount; /* count of inodes good in irbuf */ size_t irbsize; /* size of irec buffer in bytes */ xfs_ino_t ino; /* inode number (filesystem) */ @@ -366,10 +370,8 @@ xfs_bulkstat( int nirbuf; /* size of irbuf */ int rval; /* return value error code */ int ubcount; /* size of user's buffer */ - int ubleft; /* bytes left in user's buffer */ - char __user *ubufp; /* pointer into user's buffer */ - int ubelem; /* spaces used in user's buffer */ int stat; + struct xfs_bulkstat_agichunk ac; /* * Get the last inode value, see if there's nothing to do. @@ -386,11 +388,13 @@ xfs_bulkstat( } ubcount = *ubcountp; /* statstruct's */ - ubleft = ubcount * statstruct_size; /* bytes */ - *ubcountp = ubelem = 0; + ac.ac_ubuffer = &ubuffer; + ac.ac_ubleft = ubcount * statstruct_size; /* bytes */; + ac.ac_ubelem = 0; + + *ubcountp = 0; *done = 0; - fmterror = 0; - ubufp = ubuffer; + irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); if (!irbuf) return -ENOMEM; @@ -402,7 +406,7 @@ xfs_bulkstat( * inode returned; 0 means start of the allocation group. */ rval = 0; - while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { + while (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft) && agno < mp->m_sb.sb_agcount) { cond_resched(); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); if (error) @@ -497,28 +501,21 @@ del_cursor: */ irbufend = irbp; for (irbp = irbuf; - irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) { - struct xfs_bulkstat_agichunk ac; - - ac.ac_lastino = lastino; - ac.ac_ubuffer = &ubuffer; - ac.ac_ubleft = ubleft; - ac.ac_ubelem = ubelem; + irbp < irbufend && XFS_BULKSTAT_UBLEFT(ac.ac_ubleft); + irbp++) { error = xfs_bulkstat_ag_ichunk(mp, agno, irbp, - formatter, statstruct_size, &ac); + formatter, statstruct_size, &ac, + &lastino); if (error) rval = error; - lastino = ac.ac_lastino; - ubleft = ac.ac_ubleft; - ubelem = ac.ac_ubelem; - cond_resched(); } + /* * Set up for the next loop iteration. */ - if (XFS_BULKSTAT_UBLEFT(ubleft)) { + if (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft)) { if (end_of_ag) { agno++; agino = 0; @@ -531,11 +528,11 @@ del_cursor: * Done, we're either out of filesystem or space to put the data. */ kmem_free(irbuf); - *ubcountp = ubelem; + *ubcountp = ac.ac_ubelem; /* * Found some inodes, return them now and return the error next time. */ - if (ubelem) + if (ac.ac_ubelem) rval = 0; if (agno >= mp->m_sb.sb_agcount) { /* diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index aaed08022eb9..6ea8b3912fa4 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -30,22 +30,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, int *ubused, int *stat); -struct xfs_bulkstat_agichunk { - xfs_ino_t ac_lastino; /* last inode returned */ - char __user **ac_ubuffer;/* pointer into user's buffer */ - int ac_ubleft; /* bytes left in user's buffer */ - int ac_ubelem; /* spaces used in user's buffer */ -}; - -int -xfs_bulkstat_ag_ichunk( - struct xfs_mount *mp, - xfs_agnumber_t agno, - struct xfs_inobt_rec_incore *irbp, - bulkstat_one_pf formatter, - size_t statstruct_size, - struct xfs_bulkstat_agichunk *acp); - /* * Values for stat return value. */ -- cgit v1.2.3 From 2b831ac6bc87d3cbcbb1a8816827b6923403e461 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 7 Nov 2014 08:30:58 +1100 Subject: xfs: bulkstat chunk-formatter has issues The loop construct has issues: - clustidx is completely unused, so remove it. - the loop tries to be smart by terminating when the "freecount" tells it that all inodes are free. Just drop it as in most cases we have to scan all inodes in the chunk anyway. - move the "user buffer left" condition check to the only point where we consume space int eh user buffer. - move the initialisation of agino out of the loop, leaving just a simple loop control logic using the clusteridx. Also, double handling of the user buffer variables leads to problems tracking the current state - use the cursor variables directly rather than keeping local copies and then having to update the cursor before returning. cc: # 3.17 Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 58 ++++++++++++++++++++++------------------------------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 50a3e5995dd9..7ea2b113db1b 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -283,59 +283,49 @@ xfs_bulkstat_ag_ichunk( xfs_ino_t *lastino) { char __user **ubufp = acp->ac_ubuffer; - int ubleft = acp->ac_ubleft; - int ubelem = acp->ac_ubelem; - int chunkidx, clustidx; + int chunkidx; int error = 0; xfs_agino_t agino; - for (agino = irbp->ir_startino, chunkidx = clustidx = 0; - XFS_BULKSTAT_UBLEFT(ubleft) && - irbp->ir_freecount < XFS_INODES_PER_CHUNK; - chunkidx++, clustidx++, agino++) { - int fmterror; /* bulkstat formatter result */ + agino = irbp->ir_startino; + for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; + chunkidx++, agino++) { + int fmterror; int ubused; xfs_ino_t ino = XFS_AGINO_TO_INO(mp, agno, agino); - ASSERT(chunkidx < XFS_INODES_PER_CHUNK); - /* Skip if this inode is free */ if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { *lastino = ino; continue; } - /* - * Count used inodes as free so we can tell when the - * chunk is used up. - */ - irbp->ir_freecount++; - /* Get the inode and fill in a single buffer */ ubused = statstruct_size; - error = formatter(mp, ino, *ubufp, ubleft, &ubused, &fmterror); - if (fmterror == BULKSTAT_RV_NOTHING) { - if (error && error != -ENOENT && error != -EINVAL) { - ubleft = 0; - break; - } - *lastino = ino; - continue; - } - if (fmterror == BULKSTAT_RV_GIVEUP) { - ubleft = 0; + error = formatter(mp, ino, *ubufp, acp->ac_ubleft, + &ubused, &fmterror); + if (fmterror == BULKSTAT_RV_GIVEUP || + (error && error != -ENOENT && error != -EINVAL)) { + acp->ac_ubleft = 0; ASSERT(error); break; } - if (*ubufp) - *ubufp += ubused; - ubleft -= ubused; - ubelem++; + + /* be careful not to leak error if at end of chunk */ + if (fmterror == BULKSTAT_RV_NOTHING || error) { + *lastino = ino; + error = 0; + continue; + } + + *ubufp += ubused; + acp->ac_ubleft -= ubused; + acp->ac_ubelem++; *lastino = ino; - } - acp->ac_ubleft = ubleft; - acp->ac_ubelem = ubelem; + if (acp->ac_ubleft < statstruct_size) + break; + } return error; } -- cgit v1.2.3 From 6e57c542cb7e0e580eb53ae76a77875c7d92b4b1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 7 Nov 2014 08:31:13 +1100 Subject: xfs: bulkstat main loop logic is a mess There are a bunch of variables tha tare more wildy scoped than they need to be, obfuscated user buffer checks and tortured "next inode" tracking. This all needs cleaning up to expose the real issues that need fixing. cc: # 3.17 Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 56 +++++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 32 deletions(-) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 7ea2b113db1b..acae3355ab22 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -348,30 +348,23 @@ xfs_bulkstat( xfs_agino_t agino; /* inode # in allocation group */ xfs_agnumber_t agno; /* allocation group number */ xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ - int end_of_ag; /* set if we've seen the ag end */ - int error; /* error code */ - int icount; /* count of inodes good in irbuf */ size_t irbsize; /* size of irec buffer in bytes */ - xfs_ino_t ino; /* inode number (filesystem) */ - xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */ xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ - xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ xfs_ino_t lastino; /* last inode number returned */ int nirbuf; /* size of irbuf */ int rval; /* return value error code */ int ubcount; /* size of user's buffer */ - int stat; struct xfs_bulkstat_agichunk ac; + int error = 0; /* * Get the last inode value, see if there's nothing to do. */ - ino = (xfs_ino_t)*lastinop; - lastino = ino; - agno = XFS_INO_TO_AGNO(mp, ino); - agino = XFS_INO_TO_AGINO(mp, ino); + lastino = *lastinop; + agno = XFS_INO_TO_AGNO(mp, lastino); + agino = XFS_INO_TO_AGINO(mp, lastino); if (agno >= mp->m_sb.sb_agcount || - ino != XFS_AGINO_TO_INO(mp, agno, agino)) { + lastino != XFS_AGINO_TO_INO(mp, agno, agino)) { *done = 1; *ubcountp = 0; return 0; @@ -396,8 +389,13 @@ xfs_bulkstat( * inode returned; 0 means start of the allocation group. */ rval = 0; - while (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft) && agno < mp->m_sb.sb_agcount) { - cond_resched(); + while (agno < mp->m_sb.sb_agcount) { + struct xfs_inobt_rec_incore *irbp = irbuf; + struct xfs_inobt_rec_incore *irbufend = irbuf + nirbuf; + bool end_of_ag = false; + int icount = 0; + int stat; + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); if (error) break; @@ -407,10 +405,6 @@ xfs_bulkstat( */ cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO); - irbp = irbuf; - irbufend = irbuf + nirbuf; - end_of_ag = 0; - icount = 0; if (agino > 0) { /* * In the middle of an allocation group, we need to get @@ -435,7 +429,7 @@ xfs_bulkstat( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat); } if (error || stat == 0) { - end_of_ag = 1; + end_of_ag = true; goto del_cursor; } @@ -448,7 +442,7 @@ xfs_bulkstat( error = xfs_inobt_get_rec(cur, &r, &stat); if (error || stat == 0) { - end_of_ag = 1; + end_of_ag = true; goto del_cursor; } @@ -470,7 +464,7 @@ xfs_bulkstat( agino = r.ir_startino + XFS_INODES_PER_CHUNK; error = xfs_btree_increment(cur, 0, &stat); if (error || stat == 0) { - end_of_ag = 1; + end_of_ag = true; goto del_cursor; } cond_resched(); @@ -491,7 +485,7 @@ del_cursor: */ irbufend = irbp; for (irbp = irbuf; - irbp < irbufend && XFS_BULKSTAT_UBLEFT(ac.ac_ubleft); + irbp < irbufend && ac.ac_ubleft >= statstruct_size; irbp++) { error = xfs_bulkstat_ag_ichunk(mp, agno, irbp, formatter, statstruct_size, &ac, @@ -502,17 +496,15 @@ del_cursor: cond_resched(); } - /* - * Set up for the next loop iteration. - */ - if (XFS_BULKSTAT_UBLEFT(ac.ac_ubleft)) { - if (end_of_ag) { - agno++; - agino = 0; - } else - agino = XFS_INO_TO_AGINO(mp, lastino); - } else + /* If we've run out of space, we are done */ + if (ac.ac_ubleft < statstruct_size) break; + + if (end_of_ag) { + agno++; + agino = 0; + } else + agino = XFS_INO_TO_AGINO(mp, lastino); } /* * Done, we're either out of filesystem or space to put the data. -- cgit v1.2.3 From febe3cbe38b0bc0a925906dc90e8d59048851f87 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 7 Nov 2014 08:31:15 +1100 Subject: xfs: bulkstat error handling is broken The error propagation is a horror - xfs_bulkstat() returns a rval variable which is only set if there are formatter errors. Any sort of btree walk error or corruption will cause the bulkstat walk to terminate but will not pass an error back to userspace. Worse is the fact that formatter errors will also be ignored if any inodes were correctly formatted into the user buffer. Hence bulkstat can fail badly yet still report success to userspace. This causes significant issues with xfsdump not dumping everything in the filesystem yet reporting success. It's not until a restore fails that there is any indication that the dump was bad and tha bulkstat failed. This patch now triggers xfsdump to fail with bulkstat errors rather than silently missing files in the dump. This now causes bulkstat to fail when the lastino cookie does not fall inside an existing inode chunk. The pre-3.17 code tolerated that error by allowing the code to move to the next inode chunk as the agino target is guaranteed to fall into the next btree record. With the fixes up to this point in the series, xfsdump now passes on the troublesome filesystem image that exposes all these bugs. cc: Signed-off-by: Dave Chinner Reviewed-by: Brian Foster --- fs/xfs/xfs_itable.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index acae3355ab22..ff3f431671b9 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -236,8 +236,10 @@ xfs_bulkstat_grab_ichunk( XFS_WANT_CORRUPTED_RETURN(stat == 1); /* Check if the record contains the inode in request */ - if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) - return -EINVAL; + if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) { + *icount = 0; + return 0; + } idx = agino - irec->ir_startino + 1; if (idx < XFS_INODES_PER_CHUNK && @@ -352,7 +354,6 @@ xfs_bulkstat( xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ xfs_ino_t lastino; /* last inode number returned */ int nirbuf; /* size of irbuf */ - int rval; /* return value error code */ int ubcount; /* size of user's buffer */ struct xfs_bulkstat_agichunk ac; int error = 0; @@ -388,7 +389,6 @@ xfs_bulkstat( * Loop over the allocation groups, starting from the last * inode returned; 0 means start of the allocation group. */ - rval = 0; while (agno < mp->m_sb.sb_agcount) { struct xfs_inobt_rec_incore *irbp = irbuf; struct xfs_inobt_rec_incore *irbufend = irbuf + nirbuf; @@ -491,13 +491,16 @@ del_cursor: formatter, statstruct_size, &ac, &lastino); if (error) - rval = error; + break; cond_resched(); } - /* If we've run out of space, we are done */ - if (ac.ac_ubleft < statstruct_size) + /* + * If we've run out of space or had a formatting error, we + * are now done + */ + if (ac.ac_ubleft < statstruct_size || error) break; if (end_of_ag) { @@ -511,11 +514,17 @@ del_cursor: */ kmem_free(irbuf); *ubcountp = ac.ac_ubelem; + /* - * Found some inodes, return them now and return the error next time. + * We found some inodes, so clear the error status and return them. + * The lastino pointer will point directly at the inode that triggered + * any error that occurred, so on the next call the error will be + * triggered again and propagated to userspace as there will be no + * formatted inodes in the buffer. */ if (ac.ac_ubelem) - rval = 0; + error = 0; + if (agno >= mp->m_sb.sb_agcount) { /* * If we ran out of filesystem, mark lastino as off @@ -527,7 +536,7 @@ del_cursor: } else *lastinop = (xfs_ino_t)lastino; - return rval; + return error; } int -- cgit v1.2.3 From 002758992693ae63c04122603ea9261a0a58d728 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 7 Nov 2014 08:33:52 +1100 Subject: xfs: track bulkstat progress by agino The bulkstat main loop progress is tracked by the "lastino" variable, which is a full 64 bit inode. However, the loop actually works on agno/agino pairs, and so there's a significant disconnect between the rest of the loop and the main cursor. Convert this to use the agino, and pass the agino into the chunk formatting function and convert it too. This gets rid of the inconsistency in the loop processing, and finally makes it simple for us to skip inodes at any point in the loop simply by incrementing the agino cursor. cc: # 3.17 Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_itable.c | 71 +++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index ff3f431671b9..894924a5129b 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -282,30 +282,31 @@ xfs_bulkstat_ag_ichunk( bulkstat_one_pf formatter, size_t statstruct_size, struct xfs_bulkstat_agichunk *acp, - xfs_ino_t *lastino) + xfs_agino_t *last_agino) { char __user **ubufp = acp->ac_ubuffer; int chunkidx; int error = 0; - xfs_agino_t agino; + xfs_agino_t agino = irbp->ir_startino; - agino = irbp->ir_startino; for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; chunkidx++, agino++) { int fmterror; int ubused; - xfs_ino_t ino = XFS_AGINO_TO_INO(mp, agno, agino); + + /* inode won't fit in buffer, we are done */ + if (acp->ac_ubleft < statstruct_size) + break; /* Skip if this inode is free */ - if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { - *lastino = ino; + if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) continue; - } /* Get the inode and fill in a single buffer */ ubused = statstruct_size; - error = formatter(mp, ino, *ubufp, acp->ac_ubleft, - &ubused, &fmterror); + error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino), + *ubufp, acp->ac_ubleft, &ubused, &fmterror); + if (fmterror == BULKSTAT_RV_GIVEUP || (error && error != -ENOENT && error != -EINVAL)) { acp->ac_ubleft = 0; @@ -315,7 +316,6 @@ xfs_bulkstat_ag_ichunk( /* be careful not to leak error if at end of chunk */ if (fmterror == BULKSTAT_RV_NOTHING || error) { - *lastino = ino; error = 0; continue; } @@ -323,12 +323,18 @@ xfs_bulkstat_ag_ichunk( *ubufp += ubused; acp->ac_ubleft -= ubused; acp->ac_ubelem++; - *lastino = ino; - - if (acp->ac_ubleft < statstruct_size) - break; } + /* + * Post-update *last_agino. At this point, agino will always point one + * inode past the last inode we processed successfully. Hence we + * substract that inode when setting the *last_agino cursor so that we + * return the correct cookie to userspace. On the next bulkstat call, + * the inode under the lastino cookie will be skipped as we have already + * processed it here. + */ + *last_agino = agino - 1; + return error; } @@ -352,7 +358,6 @@ xfs_bulkstat( xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ size_t irbsize; /* size of irec buffer in bytes */ xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ - xfs_ino_t lastino; /* last inode number returned */ int nirbuf; /* size of irbuf */ int ubcount; /* size of user's buffer */ struct xfs_bulkstat_agichunk ac; @@ -361,11 +366,10 @@ xfs_bulkstat( /* * Get the last inode value, see if there's nothing to do. */ - lastino = *lastinop; - agno = XFS_INO_TO_AGNO(mp, lastino); - agino = XFS_INO_TO_AGINO(mp, lastino); + agno = XFS_INO_TO_AGNO(mp, *lastinop); + agino = XFS_INO_TO_AGINO(mp, *lastinop); if (agno >= mp->m_sb.sb_agcount || - lastino != XFS_AGINO_TO_INO(mp, agno, agino)) { + *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) { *done = 1; *ubcountp = 0; return 0; @@ -420,7 +424,6 @@ xfs_bulkstat( irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; irbp++; - agino = r.ir_startino + XFS_INODES_PER_CHUNK; } /* Increment to the next record */ error = xfs_btree_increment(cur, 0, &stat); @@ -458,10 +461,6 @@ xfs_bulkstat( irbp++; icount += XFS_INODES_PER_CHUNK - r.ir_freecount; } - /* - * Set agino to after this chunk and bump the cursor. - */ - agino = r.ir_startino + XFS_INODES_PER_CHUNK; error = xfs_btree_increment(cur, 0, &stat); if (error || stat == 0) { end_of_ag = true; @@ -481,7 +480,9 @@ del_cursor: if (error) break; /* - * Now format all the good inodes into the user's buffer. + * Now format all the good inodes into the user's buffer. The + * call to xfs_bulkstat_ag_ichunk() sets up the agino pointer + * for the next loop iteration. */ irbufend = irbp; for (irbp = irbuf; @@ -489,7 +490,7 @@ del_cursor: irbp++) { error = xfs_bulkstat_ag_ichunk(mp, agno, irbp, formatter, statstruct_size, &ac, - &lastino); + &agino); if (error) break; @@ -506,8 +507,7 @@ del_cursor: if (end_of_ag) { agno++; agino = 0; - } else - agino = XFS_INO_TO_AGINO(mp, lastino); + } } /* * Done, we're either out of filesystem or space to put the data. @@ -525,16 +525,13 @@ del_cursor: if (ac.ac_ubelem) error = 0; - if (agno >= mp->m_sb.sb_agcount) { - /* - * If we ran out of filesystem, mark lastino as off - * the end of the filesystem, so the next call - * will return immediately. - */ - *lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0); + /* + * If we ran out of filesystem, lastino will point off the end of + * the filesystem so the next call will return immediately. + */ + *lastinop = XFS_AGINO_TO_INO(mp, agno, agino); + if (agno >= mp->m_sb.sb_agcount) *done = 1; - } else - *lastinop = (xfs_ino_t)lastino; return error; } -- cgit v1.2.3