From ce576f1c5688caade085ae9bba729e886b7ab1d9 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 20 May 2014 08:15:57 +1000 Subject: xfs: remove XFS_TRANS_RESERVE in collapse range There is no need to dip into reserve pool. Reserve pool is used for much more important things. And xfs_trans_reserve will never return ENOSPC because punch hole is already done. If we get ENOSPC, collapse range will be simply failed. Cc: Brian Foster Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 296160b8e78c..057f671811d6 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1519,7 +1519,6 @@ xfs_collapse_file_space( while (!error && !done) { tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - tp->t_flags |= XFS_TRANS_RESERVE; /* * We would need to reserve permanent block for transaction. * This will come into picture when after shifting extent into @@ -1529,7 +1528,6 @@ xfs_collapse_file_space( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); if (error) { - ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); xfs_trans_cancel(tp, 0); break; } -- cgit v1.2.3 From 110dc24ad2ae4e9b94b08632fe1eb2fcdff83045 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 20 May 2014 08:18:09 +1000 Subject: xfs: log vector rounding leaks log space The addition of direct formatting of log items into the CIL linear buffer added alignment restrictions that the start of each vector needed to be 64 bit aligned. Hence padding was added in xlog_finish_iovec() to round up the vector length to ensure the next vector started with the correct alignment. This adds a small number of bytes to the size of the linear buffer that is otherwise unused. The issue is that we then use the linear buffer size to determine the log space used by the log item, and this includes the unused space. Hence when we account for space used by the log item, it's more than is actually written into the iclogs, and hence we slowly leak this space. This results on log hangs when reserving space, with threads getting stuck with these stack traces: Call Trace: [] schedule+0x29/0x70 [] xlog_grant_head_wait+0xa2/0x1a0 [] xlog_grant_head_check+0xbd/0x140 [] xfs_log_reserve+0x103/0x220 [] xfs_trans_reserve+0x2f5/0x310 ..... The 4 bytes is significant. Brain Foster did all the hard work in tracking down a reproducable leak to inode chunk allocation (it went away with the ikeep mount option). His rough numbers were that creating 50,000 inodes leaked 11 log blocks. This turns out to be roughly 800 inode chunks or 1600 inode cluster buffers. That works out at roughly 4 bytes per cluster buffer logged, and at that I started looking for a 4 byte leak in the buffer logging code. What I found was that a struct xfs_buf_log_format structure for an inode cluster buffer is 28 bytes in length. This gets rounded up to 32 bytes, but the vector length remains 28 bytes. Hence the CIL ticket reservation is decremented by 32 bytes (via lv->lv_buf_len) for that vector rather than 28 bytes which are written into the log. The fix for this problem is to separately track the bytes used by the log vectors in the item and use that instead of the buffer length when accounting for the log space that will be used by the formatted log item. Again, thanks to Brian Foster for doing all the hard work and long hours to isolate this leak and make finding the bug relatively simple. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.h | 19 +++++++++++++------ fs/xfs/xfs_log_cil.c | 7 ++++--- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 2c4004475e71..84e0deb95abd 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -24,7 +24,8 @@ struct xfs_log_vec { struct xfs_log_iovec *lv_iovecp; /* iovec array */ struct xfs_log_item *lv_item; /* owner */ char *lv_buf; /* formatted buffer */ - int lv_buf_len; /* size of formatted buffer */ + int lv_bytes; /* accounted space in buffer */ + int lv_buf_len; /* aligned size of buffer */ int lv_size; /* size of allocated lv */ }; @@ -52,15 +53,21 @@ xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, return vec->i_addr; } +/* + * We need to make sure the next buffer is naturally aligned for the biggest + * basic data type we put into it. We already accounted for this padding when + * sizing the buffer. + * + * However, this padding does not get written into the log, and hence we have to + * track the space used by the log vectors separately to prevent log space hangs + * due to inaccurate accounting (i.e. a leak) of the used log space through the + * CIL context ticket. + */ static inline void xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) { - /* - * We need to make sure the next buffer is naturally aligned for the - * biggest basic data type we put into it. We already accounted for - * this when sizing the buffer. - */ lv->lv_buf_len += round_up(len, sizeof(uint64_t)); + lv->lv_bytes += len; vec->i_len = len; } diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 7e5455391176..de835da6764d 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -97,7 +97,7 @@ xfs_cil_prepare_item( { /* Account for the new LV being passed in */ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { - *diff_len += lv->lv_buf_len; + *diff_len += lv->lv_bytes; *diff_iovecs += lv->lv_niovecs; } @@ -111,7 +111,7 @@ xfs_cil_prepare_item( else if (old_lv != lv) { ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); - *diff_len -= old_lv->lv_buf_len; + *diff_len -= old_lv->lv_bytes; *diff_iovecs -= old_lv->lv_niovecs; kmem_free(old_lv); } @@ -239,7 +239,7 @@ xlog_cil_insert_format_items( * that the space reservation accounting is correct. */ *diff_iovecs -= lv->lv_niovecs; - *diff_len -= lv->lv_buf_len; + *diff_len -= lv->lv_bytes; } else { /* allocate new data chunk */ lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); @@ -259,6 +259,7 @@ xlog_cil_insert_format_items( /* The allocated data region lies beyond the iovec region */ lv->lv_buf_len = 0; + lv->lv_bytes = 0; lv->lv_buf = (char *)lv + buf_size - nbytes; ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); -- cgit v1.2.3 From 7c166350b15cbec4ed9357563461b6e1d2a44ea9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 20 May 2014 08:23:06 +1000 Subject: xfs: remove redundant checks from xfs_da_read_buf All of the verification checks of magic numbers are now done by verifiers, so ther eis no need to check them again once the buffer has been successfully read. If the magic number is bad, it won't even get to that code to verify it so it really serves no purpose at all anymore. Remove it. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_da_btree.c | 41 ----------------------------------------- 1 file changed, 41 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 6cc5f6785a77..99b980db3419 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2625,47 +2625,6 @@ xfs_da_read_buf( xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); else xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); - - /* - * This verification code will be moved to a CRC verification callback - * function so just leave it here unchanged until then. - */ - { - xfs_dir2_data_hdr_t *hdr = bp->b_addr; - xfs_dir2_free_t *free = bp->b_addr; - xfs_da_blkinfo_t *info = bp->b_addr; - uint magic, magic1; - struct xfs_mount *mp = dp->i_mount; - - magic = be16_to_cpu(info->magic); - magic1 = be32_to_cpu(hdr->magic); - if (unlikely( - XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) && - (magic != XFS_DA3_NODE_MAGIC) && - (magic != XFS_ATTR_LEAF_MAGIC) && - (magic != XFS_ATTR3_LEAF_MAGIC) && - (magic != XFS_DIR2_LEAF1_MAGIC) && - (magic != XFS_DIR3_LEAF1_MAGIC) && - (magic != XFS_DIR2_LEAFN_MAGIC) && - (magic != XFS_DIR3_LEAFN_MAGIC) && - (magic1 != XFS_DIR2_BLOCK_MAGIC) && - (magic1 != XFS_DIR3_BLOCK_MAGIC) && - (magic1 != XFS_DIR2_DATA_MAGIC) && - (magic1 != XFS_DIR3_DATA_MAGIC) && - (free->hdr.magic != - cpu_to_be32(XFS_DIR2_FREE_MAGIC)) && - (free->hdr.magic != - cpu_to_be32(XFS_DIR3_FREE_MAGIC)), - mp, XFS_ERRTAG_DA_READ_BUF, - XFS_RANDOM_DA_READ_BUF))) { - trace_xfs_da_btree_corrupt(bp, _RET_IP_); - XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", - XFS_ERRLEVEL_LOW, mp, info); - error = XFS_ERROR(EFSCORRUPTED); - xfs_trans_brelse(trans, bp); - goto out_free; - } - } *bpp = bp; out_free: if (mapp != &map) -- cgit v1.2.3 From 8695d27ec34b19c58a0dc25bfcce3f2c6cf0699d Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Tue, 20 May 2014 08:24:26 +1000 Subject: xfs: fix infinite loop at xfs_vm_writepage on 32bit system Write to a file with an offset greater than 16TB on 32-bit system and then trigger page write-back via sync(1) will cause task hang. # block_size=4096 # offset=$(((2**32 - 1) * $block_size)) # xfs_io -f -c "pwrite $offset $block_size" /storage/test_file # sync INFO: task sync:2590 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. sync D c1064a28 0 2590 2097 0x00000000 ..... Call Trace: [] ? ttwu_do_wakeup+0x18/0x130 [] ? try_to_wake_up+0x1ce/0x220 [] ? wake_up_process+0x1f/0x40 [] ? wake_up_worker+0x1e/0x30 [] schedule+0x23/0x60 [] schedule_timeout+0x18d/0x1f0 [] ? do_raw_spin_unlock+0x4e/0x90 [] ? __queue_delayed_work+0x91/0x150 [] ? do_raw_spin_lock+0x3f/0x100 [] ? do_raw_spin_unlock+0x4e/0x90 [] wait_for_completion+0x7d/0xc0 [] ? try_to_wake_up+0x220/0x220 [] sync_inodes_sb+0x92/0x180 [] sync_inodes_one_sb+0x15/0x20 [] iterate_supers+0xb8/0xc0 [] ? fdatawrite_one_bdev+0x20/0x20 [] sys_sync+0x31/0x80 [] sysenter_do_call+0x12/0x28 This issue can be triggered via xfstests/generic/308. The reason is that the end_index is unsigned long with maximum value '2^32-1=4294967295' on 32-bit platform, and the given offset cause it wrapped to 0, so that the following codes will repeat again and again until the task schedule time out: end_index = offset >> PAGE_CACHE_SHIFT; last_index = (offset - 1) >> PAGE_CACHE_SHIFT; if (page->index >= end_index) { unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); /* * Just skip the page if it is fully outside i_size, e.g. due * to a truncate operation that is in progress. */ if (page->index >= end_index + 1 || offset_into_page == 0) { ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ unlock_page(page); return 0; } In order to check if a page is fully outsids i_size or not, we can fix the code logic as below: if (page->index > end_index || (page->index == end_index && offset_into_page == 0)) Secondly, there still has another similar issue when calculating the end offset for mapping the filesystem blocks to the file blocks for delalloc. With the same tests to above, run unmount(8) will cause kernel panic if CONFIG_XFS_DEBUG is enabled: XFS: Assertion failed: XFS_FORCED_SHUTDOWN(ip->i_mount) || \ ip->i_delayed_blks == 0, file: fs/xfs/xfs_super.c, line: 964 kernel BUG at fs/xfs/xfs_message.c:108! invalid opcode: 0000 [#1] SMP task: edddc100 ti: ec6ee000 task.ti: ec6ee000 EIP: 0060:[] EFLAGS: 00010296 CPU: 1 EIP is at assfail+0x2b/0x30 [xfs] .............. Call Trace: [] xfs_fs_destroy_inode+0x74/0x120 [xfs] [] destroy_inode+0x31/0x50 [] evict+0xef/0x170 [] dispose_list+0x32/0x40 [] evict_inodes+0xca/0xe0 [] generic_shutdown_super+0x46/0xd0 [] kill_block_super+0x29/0x70 [] deactivate_locked_super+0x44/0x70 [] deactivate_super+0x47/0x60 [] mntput_no_expire+0xcd/0x120 [] SyS_umount+0xa8/0x370 [] SyS_oldumount+0x1e/0x20 [] sysenter_do_call+0x12/0x28 That because the end_offset is evaluated to 0 which is the same reason to above, hence the mapping and covertion for dealloc file blocks to file system blocks did not happened. This patch just fixed both issues. Reported-by: Michael L. Semon Signed-off-by: Jie Liu Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 49 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 0479c32c5eb1..d1b99b692ccb 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -982,7 +982,32 @@ xfs_vm_writepage( offset = i_size_read(inode); end_index = offset >> PAGE_CACHE_SHIFT; last_index = (offset - 1) >> PAGE_CACHE_SHIFT; - if (page->index >= end_index) { + + /* + * The page index is less than the end_index, adjust the end_offset + * to the highest offset that this page should represent. + * ----------------------------------------------------- + * | file mapping | | + * ----------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | | + * ^--------------------------------^----------|-------- + * | desired writeback range | see else | + * ---------------------------------^------------------| + */ + if (page->index < end_index) + end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; + else { + /* + * Check whether the page to write out is beyond or straddles + * i_size or not. + * ------------------------------------------------------- + * | file mapping | | + * ------------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | + * ^--------------------------------^-----------|--------- + * | | Straddles | + * ---------------------------------^-----------|--------| + */ unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); /* @@ -990,24 +1015,36 @@ xfs_vm_writepage( * truncate operation that is in progress. We must redirty the * page so that reclaim stops reclaiming it. Otherwise * xfs_vm_releasepage() is called on it and gets confused. + * + * Note that the end_index is unsigned long, it would overflow + * if the given offset is greater than 16TB on 32-bit system + * and if we do check the page is fully outside i_size or not + * via "if (page->index >= end_index + 1)" as "end_index + 1" + * will be evaluated to 0. Hence this page will be redirtied + * and be written out repeatedly which would result in an + * infinite loop, the user program that perform this operation + * will hang. Instead, we can verify this situation by checking + * if the page to write is totally beyond the i_size or if it's + * offset is just equal to the EOF. */ - if (page->index >= end_index + 1 || offset_into_page == 0) + if (page->index > end_index || + (page->index == end_index && offset_into_page == 0)) goto redirty; /* * The page straddles i_size. It must be zeroed out on each * and every writepage invocation because it may be mmapped. * "A file is mapped in multiples of the page size. For a file - * that is not a multiple of the page size, the remaining + * that is not a multiple of the page size, the remaining * memory is zeroed when mapped, and writes to that region are * not written out to the file." */ zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE); + + /* Adjust the end_offset to the end of file */ + end_offset = offset; } - end_offset = min_t(unsigned long long, - (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, - offset); len = 1 << inode->i_blkbits; bh = head = page_buffers(page); -- cgit v1.2.3 From 376c2f3a5f0706868b08ccf043bf3532936a03b1 Mon Sep 17 00:00:00 2001 From: Roger Willcocks Date: Tue, 20 May 2014 08:52:21 +1000 Subject: xfs: fix compile error when libxfs header used in C++ code xfs_ialloc.h:102: error: expected ',' or '...' before 'delete' Simple parameter rename, no changes to behaviour. Signed-off-by: Roger Willcocks Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_ialloc.c | 6 +++--- fs/xfs/xfs_ialloc.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 8f711db61a0c..b819263b7f2e 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1109,7 +1109,7 @@ xfs_difree( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ xfs_bmap_free_t *flist, /* extents to free */ - int *delete, /* set if inode cluster was deleted */ + int *deleted, /* set if inode cluster was deleted */ xfs_ino_t *first_ino) /* first inode in deleted cluster */ { /* REFERENCED */ @@ -1209,7 +1209,7 @@ xfs_difree( if (!(mp->m_flags & XFS_MOUNT_IKEEP) && (rec.ir_freecount == mp->m_ialloc_inos)) { - *delete = 1; + *deleted = 1; *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); /* @@ -1237,7 +1237,7 @@ xfs_difree( XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), mp->m_ialloc_blks, flist, mp); } else { - *delete = 0; + *deleted = 0; error = xfs_inobt_update(cur, &rec); if (error) { diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 812365d17e67..95ad1c002d60 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -90,7 +90,7 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *delete, /* set if inode cluster was deleted */ + int *deleted, /* set if inode cluster was deleted */ xfs_ino_t *first_ino); /* first inode in deleted cluster */ /* -- cgit v1.2.3