From cb0a8d23024e7bd234dea4d0fc5c4902a8dda766 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Mar 2018 17:03:28 -0800 Subject: xfs: fall back to vmalloc when allocation log vector buffers When using large directory blocks, we regularly see memory allocations of >64k being made for the shadow log vector buffer. When we are under memory pressure, kmalloc() may not be able to find contiguous memory chunks large enough to satisfy these allocations easily, and if memory is fragmented we can potentially stall here. TO avoid this problem, switch the log vector buffer allocation to use kmem_alloc_large(). This will allow failed allocations to fall back to vmalloc and so remove the dependency on large contiguous regions of memory being available. This should prevent slowdowns and potential stalls when memory is low and/or fragmented. Signed-Off-By: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/kmem.c | 6 +++--- fs/xfs/kmem.h | 8 +++++++- fs/xfs/xfs_log_cil.c | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 393b6849aeb3..7bace03dc9dc 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -46,13 +46,13 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) } void * -kmem_zalloc_large(size_t size, xfs_km_flags_t flags) +kmem_alloc_large(size_t size, xfs_km_flags_t flags) { unsigned nofs_flag = 0; void *ptr; gfp_t lflags; - ptr = kmem_zalloc(size, flags | KM_MAYFAIL); + ptr = kmem_alloc(size, flags | KM_MAYFAIL); if (ptr) return ptr; @@ -67,7 +67,7 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags) nofs_flag = memalloc_nofs_save(); lflags = kmem_flags_convert(flags); - ptr = __vmalloc(size, lflags | __GFP_ZERO, PAGE_KERNEL); + ptr = __vmalloc(size, lflags, PAGE_KERNEL); if (flags & KM_NOFS) memalloc_nofs_restore(nofs_flag); diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 4b87472f35bc..6023b594ead7 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -71,7 +71,7 @@ kmem_flags_convert(xfs_km_flags_t flags) } extern void *kmem_alloc(size_t, xfs_km_flags_t); -extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); +extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); extern void *kmem_realloc(const void *, size_t, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { @@ -85,6 +85,12 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) return kmem_alloc(size, flags | KM_ZERO); } +static inline void * +kmem_zalloc_large(size_t size, xfs_km_flags_t flags) +{ + return kmem_alloc_large(size, flags | KM_ZERO); +} + /* * Zone interfaces */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 43aa42a3a5d3..cb376ac8a595 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -202,7 +202,7 @@ xlog_cil_alloc_shadow_bufs( */ kmem_free(lip->li_lv_shadow); - lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS); + lv = kmem_alloc_large(buf_size, KM_SLEEP | KM_NOFS); memset(lv, 0, xlog_cil_iovec_space(niovecs)); lv->lv_item = lip; -- cgit v1.2.3 From 4603fa744c0702c76d607e5fafe35655b77a6efd Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 6 Mar 2018 17:03:29 -0800 Subject: xfs: remove unused m_dmevmask from xfs_mount struct The dmevmask structure member is a dmapi leftover; it's set here and there but never actually used. Remove it. Signed-off-by: Eric Sandeen Reviewed-by: Bill O'Donnell Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log_recover.c | 9 --------- fs/xfs/xfs_mount.c | 2 -- fs/xfs/xfs_mount.h | 1 - 3 files changed, 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 00240c9ee72e..1937a93db6e7 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -5127,16 +5127,9 @@ xlog_recover_process_iunlinks( xfs_agino_t agino; int bucket; int error; - uint mp_dmevmask; mp = log->l_mp; - /* - * Prevent any DMAPI event from being sent while in this function. - */ - mp_dmevmask = mp->m_dmevmask; - mp->m_dmevmask = 0; - for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { /* * Find the agi for this ag. @@ -5172,8 +5165,6 @@ xlog_recover_process_iunlinks( } xfs_buf_rele(agibp); } - - mp->m_dmevmask = mp_dmevmask; } STATIC int diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 98fd41cbb9e1..6f5a5e6764d8 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -803,8 +803,6 @@ xfs_mountfs( get_unaligned_be16(&sbp->sb_uuid.b[4]); mp->m_fixedfsid[1] = get_unaligned_be32(&sbp->sb_uuid.b[0]); - mp->m_dmevmask = 0; /* not persistent; set after each mount */ - error = xfs_da_mount(mp); if (error) { xfs_warn(mp, "Failed dir/attr init: %d", error); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e0792d036be2..44a63c5f43c8 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -138,7 +138,6 @@ typedef struct xfs_mount { spinlock_t m_perag_lock; /* lock for m_perag_tree */ struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ - uint m_dmevmask; /* DMI events for this FS */ uint64_t m_flags; /* global mount flags */ bool m_inotbt_nores; /* no per-AG finobt resv. */ int m_ialloc_inos; /* inodes in inode allocation */ -- cgit v1.2.3 From e157ebdcb3acd16221f1e5f84c6e371e15d37b6e Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 6 Mar 2018 17:03:30 -0800 Subject: Cleanup old XFS_BTREE_* traces Remove unused legacy btree traces from IRIX era. Signed-off-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc_btree.c | 8 +-- fs/xfs/libxfs/xfs_bmap_btree.c | 4 +- fs/xfs/libxfs/xfs_btree.c | 125 +++---------------------------------- fs/xfs/libxfs/xfs_btree.h | 19 ------ fs/xfs/libxfs/xfs_ialloc_btree.c | 9 +-- fs/xfs/libxfs/xfs_refcount_btree.c | 5 -- fs/xfs/libxfs/xfs_rmap_btree.c | 8 +-- 7 files changed, 14 insertions(+), 164 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 6840b588187e..b451649ba176 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -74,18 +74,13 @@ xfs_allocbt_alloc_block( int error; xfs_agblock_t bno; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - /* Allocate the new block from the freelist. If we can't, give up. */ error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, &bno, 1); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } if (bno == NULLAGBLOCK) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -95,7 +90,6 @@ xfs_allocbt_alloc_block( xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 9faf479aba49..d89d06bea6e3 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -272,10 +272,10 @@ xfs_bmbt_alloc_block( cur->bc_private.b.dfops->dop_low = true; } if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } + ASSERT(args.len == 1); cur->bc_private.b.firstblock = args.fsbno; cur->bc_private.b.allocated++; @@ -286,12 +286,10 @@ xfs_bmbt_alloc_block( new->l = cpu_to_be64(args.fsbno); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 79ee4a1951d1..edc0193358a5 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1438,8 +1438,6 @@ xfs_btree_log_keys( int first, int last) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); if (bp) { xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); @@ -1450,8 +1448,6 @@ xfs_btree_log_keys( xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, xfs_ilog_fbroot(cur->bc_private.b.whichfork)); } - - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); } /* @@ -1464,15 +1460,12 @@ xfs_btree_log_recs( int first, int last) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(cur->bc_tp, bp, xfs_btree_rec_offset(cur, first), xfs_btree_rec_offset(cur, last + 1) - 1); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); } /* @@ -1485,8 +1478,6 @@ xfs_btree_log_ptrs( int first, /* index of first pointer to log */ int last) /* index of last pointer to log */ { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); if (bp) { struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); @@ -1501,7 +1492,6 @@ xfs_btree_log_ptrs( xfs_ilog_fbroot(cur->bc_private.b.whichfork)); } - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); } /* @@ -1543,9 +1533,6 @@ xfs_btree_log_block( XFS_BTREE_LBLOCK_CRC_LEN }; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGBI(cur, bp, fields); - if (bp) { int nbits; @@ -1573,8 +1560,6 @@ xfs_btree_log_block( xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, xfs_ilog_fbroot(cur->bc_private.b.whichfork)); } - - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); } /* @@ -1593,9 +1578,6 @@ xfs_btree_increment( int error; /* error return value */ int lev; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGI(cur, level); - ASSERT(level < cur->bc_nlevels); /* Read-ahead to the right at this level. */ @@ -1671,17 +1653,14 @@ xfs_btree_increment( cur->bc_ptrs[lev] = 1; } out1: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; out0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -1701,9 +1680,6 @@ xfs_btree_decrement( int lev; union xfs_btree_ptr ptr; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGI(cur, level); - ASSERT(level < cur->bc_nlevels); /* Read-ahead to the left at this level. */ @@ -1769,17 +1745,14 @@ xfs_btree_decrement( cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block); } out1: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; out0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -1881,9 +1854,6 @@ xfs_btree_lookup( union xfs_btree_ptr *pp; /* ptr to btree block */ union xfs_btree_ptr ptr; /* ptr to btree block */ - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGI(cur, dir); - XFS_BTREE_STATS_INC(cur, lookup); /* No such thing as a zero-level tree. */ @@ -1929,7 +1899,6 @@ xfs_btree_lookup( ASSERT(level == 0 && cur->bc_nlevels == 1); cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -2004,7 +1973,6 @@ xfs_btree_lookup( if (error) goto error0; XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; } @@ -2019,11 +1987,9 @@ xfs_btree_lookup( *stat = 1; else *stat = 0; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -2169,10 +2135,8 @@ __xfs_btree_updkeys( trace_xfs_btree_updkeys(cur, level, bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } #endif ptr = cur->bc_ptrs[level]; nlkey = xfs_btree_key_addr(cur, ptr, block); @@ -2224,9 +2188,6 @@ xfs_btree_update_keys( if (cur->bc_flags & XFS_BTREE_OVERLAPPING) return __xfs_btree_updkeys(cur, level, block, bp, false); - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGIK(cur, level, keyp); - /* * Go up the tree from this level toward the root. * At each level, update the key value to the value input. @@ -2241,10 +2202,8 @@ xfs_btree_update_keys( block = xfs_btree_get_block(cur, level, &bp); #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } #endif ptr = cur->bc_ptrs[level]; kp = xfs_btree_key_addr(cur, ptr, block); @@ -2252,7 +2211,6 @@ xfs_btree_update_keys( xfs_btree_log_keys(cur, bp, ptr, ptr); } - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; } @@ -2272,9 +2230,6 @@ xfs_btree_update( int ptr; union xfs_btree_rec *rp; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGR(cur, rec); - /* Pick up the current block. */ block = xfs_btree_get_block(cur, 0, &bp); @@ -2307,11 +2262,9 @@ xfs_btree_update( goto error0; } - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -2339,9 +2292,6 @@ xfs_btree_lshift( int error; /* error return value */ int i; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGI(cur, level); - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && level == cur->bc_nlevels - 1) goto out0; @@ -2500,21 +2450,17 @@ xfs_btree_lshift( /* Slide the cursor value left one. */ cur->bc_ptrs[level]--; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; out0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; error1: - XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR); xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); return error; } @@ -2541,9 +2487,6 @@ xfs_btree_rshift( int error; /* error return value */ int i; /* loop counter */ - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGI(cur, level); - if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && (level == cur->bc_nlevels - 1)) goto out0; @@ -2676,21 +2619,17 @@ xfs_btree_rshift( xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; out0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; error1: - XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR); xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); return error; } @@ -2726,9 +2665,6 @@ __xfs_btree_split( int i; #endif - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key); - XFS_BTREE_STATS_INC(cur, split); /* Set up left block (current one). */ @@ -2878,16 +2814,13 @@ __xfs_btree_split( (*curp)->bc_ptrs[level + 1]++; } *ptrp = rptr; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; out0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -2994,7 +2927,6 @@ xfs_btree_new_iroot( int i; /* loop counter */ #endif - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_STATS_INC(cur, newroot); ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); @@ -3008,10 +2940,9 @@ xfs_btree_new_iroot( error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat); if (error) goto error0; - if (*stat == 0) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + if (*stat == 0) return 0; - } + XFS_BTREE_STATS_INC(cur, alloc); /* Copy the root into a real block. */ @@ -3074,10 +3005,8 @@ xfs_btree_new_iroot( *logflags |= XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork); *stat = 1; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -3102,7 +3031,6 @@ xfs_btree_new_root( union xfs_btree_ptr rptr; union xfs_btree_ptr lptr; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_STATS_INC(cur, newroot); /* initialise our start point from the cursor */ @@ -3202,14 +3130,11 @@ xfs_btree_new_root( xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); cur->bc_ptrs[cur->bc_nlevels] = nptr; cur->bc_nlevels++; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; out0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -3230,7 +3155,7 @@ xfs_btree_make_block_unfull( if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && level == cur->bc_nlevels - 1) { - struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_inode *ip = cur->bc_private.b.ip; if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { /* A root block that can be made bigger. */ @@ -3309,9 +3234,6 @@ xfs_btree_insrec( #endif xfs_daddr_t old_bn; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec); - ncur = NULL; lkey = &nkey; @@ -3324,14 +3246,12 @@ xfs_btree_insrec( error = xfs_btree_new_root(cur, stat); xfs_btree_set_ptr_null(cur, ptrp); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return error; } /* If we're off the left edge, return failure. */ ptr = cur->bc_ptrs[level]; if (ptr == 0) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -3489,12 +3409,10 @@ xfs_btree_insrec( *curp = ncur; } - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -3572,11 +3490,9 @@ xfs_btree_insert( } } while (!xfs_btree_ptr_is_null(cur, &nptr)); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = i; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } @@ -3611,8 +3527,6 @@ xfs_btree_kill_iroot( int i; #endif - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); ASSERT(cur->bc_nlevels > 1); @@ -3670,19 +3584,15 @@ xfs_btree_kill_iroot( #ifdef DEBUG for (i = 0; i < numrecs; i++) { error = xfs_btree_check_ptr(cur, cpp, i, level - 1); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } } #endif xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); error = xfs_btree_free_block(cur, cbp); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } cur->bc_bufs[level - 1] = NULL; be16_add_cpu(&block->bb_level, -1); @@ -3690,7 +3600,6 @@ xfs_btree_kill_iroot( XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); cur->bc_nlevels--; out0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; } @@ -3706,7 +3615,6 @@ xfs_btree_kill_root( { int error; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); XFS_BTREE_STATS_INC(cur, killroot); /* @@ -3716,16 +3624,13 @@ xfs_btree_kill_root( cur->bc_ops->set_root(cur, newroot, -1); error = xfs_btree_free_block(cur, bp); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } cur->bc_bufs[level] = NULL; cur->bc_ra[level] = 0; cur->bc_nlevels--; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); return 0; } @@ -3744,7 +3649,6 @@ xfs_btree_dec_cursor( return error; } - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; } @@ -3780,15 +3684,11 @@ xfs_btree_delrec( struct xfs_btree_cur *tcur; /* temporary btree cursor */ int numrecs; /* temporary numrec count */ - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - XFS_BTREE_TRACE_ARGI(cur, level); - tcur = NULL; /* Get the index of the entry being deleted, check for nothing there. */ ptr = cur->bc_ptrs[level]; if (ptr == 0) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -3805,7 +3705,6 @@ xfs_btree_delrec( /* Fail if we're off the end of the block. */ if (ptr > numrecs) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -4080,7 +3979,7 @@ xfs_btree_delrec( tcur = NULL; if (level == 0) cur->bc_ptrs[0]++; - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; return 0; } @@ -4250,13 +4149,11 @@ xfs_btree_delrec( * call updkeys directly. */ - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); /* Return value means the next level up has something to do. */ *stat = 2; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); if (tcur) xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); return error; @@ -4277,8 +4174,6 @@ xfs_btree_delete( int i; bool joined = false; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - /* * Go up the tree, starting at leaf level. * @@ -4314,11 +4209,9 @@ xfs_btree_delete( } } - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = i; return 0; error0: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 50440b5618e8..58e30c0975c3 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -473,25 +473,6 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) #define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) -/* - * Trace hooks. Currently not implemented as they need to be ported - * over to the generic tracing functionality, which is some effort. - * - * i,j = integer (32 bit) - * b = btree block buffer (xfs_buf_t) - * p = btree ptr - * r = btree record - * k = btree key - */ -#define XFS_BTREE_TRACE_ARGBI(c, b, i) -#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) -#define XFS_BTREE_TRACE_ARGI(c, i) -#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s) -#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) -#define XFS_BTREE_TRACE_ARGIK(c, i, k) -#define XFS_BTREE_TRACE_ARGR(c, r) -#define XFS_BTREE_TRACE_CURSOR(c, t) - xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index af197a5f3a82..a2dd7f4a2719 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -93,8 +93,6 @@ __xfs_inobt_alloc_block( int error; /* error return value */ xfs_agblock_t sbno = be32_to_cpu(start->s); - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; @@ -107,17 +105,14 @@ __xfs_inobt_alloc_block( args.resv = resv; error = xfs_alloc_vextent(&args); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } + if (args.fsbno == NULLFSBLOCK) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } ASSERT(args.len == 1); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno)); *stat = 1; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 8479769e470d..265fdcefcbae 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -79,8 +79,6 @@ xfs_refcountbt_alloc_block( struct xfs_alloc_arg args; /* block allocation args */ int error; /* error return value */ - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - memset(&args, 0, sizeof(args)); args.tp = cur->bc_tp; args.mp = cur->bc_mp; @@ -98,7 +96,6 @@ xfs_refcountbt_alloc_block( trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, args.agbno, 1); if (args.fsbno == NULLFSBLOCK) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -109,12 +106,10 @@ xfs_refcountbt_alloc_block( be32_add_cpu(&agf->agf_refcount_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; out_error: - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index e829c3e489ea..738df3f9b5f2 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -104,20 +104,15 @@ xfs_rmapbt_alloc_block( int error; xfs_agblock_t bno; - XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - /* Allocate the new block from the freelist. If we can't, give up. */ error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, &bno, 1); - if (error) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (error) return error; - } trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno, bno, 1); if (bno == NULLAGBLOCK) { - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } @@ -130,7 +125,6 @@ xfs_rmapbt_alloc_block( be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); - XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; } -- cgit v1.2.3 From bcab2ebfa1ad9fb4b9a58df60a365b6efb33f1cd Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 6 Mar 2018 17:03:30 -0800 Subject: xfs: Remove dead code from inode recover function The memcpy is guarded by a check which is performed a right before we call xfs_log_dinode_to_disk. At this point we are sure this check will always be false otherwise we would have errored out. So let's remove this dead weight. Signed-off-by: Nikolay Borisov Reviewed-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log_recover.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1937a93db6e7..93885f57978a 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3173,13 +3173,6 @@ xlog_recover_inode_pass2( /* recover the log dinode inode into the on disk inode */ xfs_log_dinode_to_disk(ldip, dip); - /* the rest is in on-disk format */ - if (item->ri_buf[1].i_len > isize) { - memcpy((char *)dip + isize, - item->ri_buf[1].i_addr + isize, - item->ri_buf[1].i_len - isize); - } - fields = in_f->ilf_fields; if (fields & XFS_ILOG_DEV) xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); -- cgit v1.2.3 From 0d07e5573ffb3f3b941ca4c2df44f386319b1686 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Mar 2018 17:03:31 -0800 Subject: fs: don't clear I_DIRTY_TIME before calling mark_inode_dirty_sync __mark_inode_dirty already takes care of that, and for the XFS lazytime implementation we need to know that ->dirty_inode was called because I_DIRTY_TIME was set. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/inode.c | 1 - fs/sync.c | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index ef362364d396..6295f1415761 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1533,7 +1533,6 @@ retry: if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) { if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) { atomic_inc(&inode->i_count); - inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); trace_writeback_lazytime_iput(inode); mark_inode_dirty_sync(inode); diff --git a/fs/sync.c b/fs/sync.c index 6e0a2cbaf6de..09f0259724ff 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -187,12 +187,8 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) if (!file->f_op->fsync) return -EINVAL; - if (!datasync && (inode->i_state & I_DIRTY_TIME)) { - spin_lock(&inode->i_lock); - inode->i_state &= ~I_DIRTY_TIME; - spin_unlock(&inode->i_lock); + if (!datasync && (inode->i_state & I_DIRTY_TIME)) mark_inode_dirty_sync(inode); - } return file->f_op->fsync(file, start, end, datasync); } EXPORT_SYMBOL(vfs_fsync_range); -- cgit v1.2.3 From c3b1b13190aec6d7450971b5ff10beaec04af558 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Mar 2018 17:04:00 -0800 Subject: xfs: implement the lazytime mount option Use the VFS dirty inode tracking for lazytime inodes only, and just log them in ->dirty_inode. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_iops.c | 13 ++++++++++++- fs/xfs/xfs_super.c | 23 +++++++++++++++++++++++ fs/xfs/xfs_trans_inode.c | 14 ++++++++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 56475fcd76f2..8567951eff10 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -46,6 +46,7 @@ #include #include #include +#include /* * Directories have different lock order w.r.t. mmap_sem compared to regular @@ -1052,11 +1053,21 @@ xfs_vn_update_time( { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; + int log_flags = XFS_ILOG_TIMESTAMP; struct xfs_trans *tp; int error; trace_xfs_update_time(ip); + if (inode->i_sb->s_flags & SB_LAZYTIME) { + if (!((flags & S_VERSION) && + inode_maybe_inc_iversion(inode, false))) + return generic_update_time(inode, now, flags); + + /* Capture the iversion update that just occurred */ + log_flags |= XFS_ILOG_CORE; + } + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); if (error) return error; @@ -1070,7 +1081,7 @@ xfs_vn_update_time( inode->i_atime = *now; xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); + xfs_trans_log_inode(tp, ip, log_flags); return xfs_trans_commit(tp); } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 93588ea3d3d2..45b6f014c358 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1009,6 +1009,28 @@ xfs_fs_destroy_inode( xfs_inode_set_reclaim_tag(ip); } +static void +xfs_fs_dirty_inode( + struct inode *inode, + int flag) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + + if (!(inode->i_sb->s_flags & SB_LAZYTIME)) + return; + if (flag != I_DIRTY_SYNC || !(inode->i_state & I_DIRTY_TIME)) + return; + + if (xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp)) + return; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); + xfs_trans_commit(tp); +} + /* * Slab object creation initialisation for the XFS inode. * This covers only the idempotent fields in the XFS inode; @@ -1789,6 +1811,7 @@ xfs_fs_free_cached_objects( static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, + .dirty_inode = xfs_fs_dirty_inode, .drop_inode = xfs_fs_drop_inode, .put_super = xfs_fs_put_super, .sync_fs = xfs_fs_sync_fs, diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 4a89da4b6fe7..07cea592dc01 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -98,9 +98,23 @@ xfs_trans_log_inode( xfs_inode_t *ip, uint flags) { + struct inode *inode = VFS_I(ip); + ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + /* + * Don't bother with i_lock for the I_DIRTY_TIME check here, as races + * don't matter - we either will need an extra transaction in 24 hours + * to log the timestamps, or will clear already cleared fields in the + * worst case. + */ + if (inode->i_state & (I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED)) { + spin_lock(&inode->i_lock); + inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); + spin_unlock(&inode->i_lock); + } + /* * Record the specific change for fdatasync optimisation. This * allows fdatasync to skip log forces for inodes that are only -- cgit v1.2.3 From 4df0f7f145f2ac143039e2c569d17a25354f411d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Mar 2018 17:07:22 -0800 Subject: xfs: fix transaction allocation deadlock in IO path xfs_trans_alloc() does GFP_KERNEL allocation, and we can call it while holding pages locked for writeback in the ->writepages path. The memory allocation is allowed to wait on pages under writeback, and so can wait on pages that are tagged as writeback by the caller. This affects both pre-IO submission and post-IO submission paths. Hence xfs_setsize_trans_alloc(), xfs_reflink_end_cow(), xfs_iomap_write_unwritten() and xfs_reflink_cancel_cow_range(). xfs_iomap_write_unwritten() already does the right thing, but the others don't. Fix them. Signed-Off-By: Dave Chinner Fixes: 281627df3eb5 ("xfs: log file size updates at I/O completion time") Fixes: 43caeb187deb9 ("xfs: move mappings from cow fork to data fork after copy-write)" Reviewed-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 3 ++- fs/xfs/xfs_reflink.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 9c6a830da0ee..a0afb6411417 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -209,7 +209,8 @@ xfs_setfilesize_trans_alloc( struct xfs_trans *tp; int error; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, + XFS_TRANS_NOFS, &tp); if (error) return error; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 270246943a06..8c16177b33d4 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -668,7 +668,7 @@ xfs_reflink_cancel_cow_range( /* Start a rolling transaction to remove the mappings */ error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, - 0, 0, 0, &tp); + 0, 0, XFS_TRANS_NOFS, &tp); if (error) goto out; @@ -741,7 +741,7 @@ xfs_reflink_end_cow( (unsigned int)(end_fsb - offset_fsb), XFS_DATA_FORK); error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, - resblks, 0, XFS_TRANS_RESERVE, &tp); + resblks, 0, XFS_TRANS_RESERVE | XFS_TRANS_NOFS, &tp); if (error) goto out; -- cgit v1.2.3 From 19957a181608d25c8f4136652d0ea00b3738972d Mon Sep 17 00:00:00 2001 From: Vratislav Bendel Date: Tue, 6 Mar 2018 17:07:44 -0800 Subject: xfs: Correctly invert xfs_buftarg LRU isolation logic Due to an inverted logic mistake in xfs_buftarg_isolate() the xfs_buffers with zero b_lru_ref will take another trip around LRU, while isolating buffers with non-zero b_lru_ref. Additionally those isolated buffers end up right back on the LRU once they are released, because b_lru_ref remains elevated. Fix that circuitous route by leaving them on the LRU as originally intended. Signed-off-by: Vratislav Bendel Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index d1da2ee9e6db..ac669a10c62f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1708,7 +1708,7 @@ xfs_buftarg_isolate( * zero. If the value is already zero, we need to reclaim the * buffer, otherwise it gets another trip through the LRU. */ - if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) { spin_unlock(&bp->b_lock); return LRU_ROTATE; } -- cgit v1.2.3 From 8241f7f983b972823431d762f7c3c9fe0f2a7b00 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 6 Mar 2018 17:08:30 -0800 Subject: xfs: don't iunlock the quota ip when quota block In xfs_qm_dqalloc, we join the locked quota inode to the transaction we use to allocate blocks. If the allocation or mapping fails, we're not allowed to unlock the inode because the transaction code is in charge of unlocking it for us. Therefore, remove the iunlock call to avoid blowing asserts about unbalanced locking + mount hang. Found by corrupting the AGF and allocating space in the filesystem (quotacheck) immediately after mount. The upcoming agfl wrapping fixup test will trigger this scenario. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_dquot.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 43572f8a1b8e..2410acc900f0 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -394,8 +394,6 @@ xfs_qm_dqalloc( error1: xfs_defer_cancel(&dfops); error0: - xfs_iunlock(quotip, XFS_ILOCK_EXCL); - return error; } -- cgit v1.2.3 From 3f883f5bb197b6fe4e6f461362782aa7b0e89cb6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 6 Mar 2018 17:08:31 -0800 Subject: xfs: convert a few more directory asserts to corruption Yet another round of playing whack-a-mole with directory code that asserts on corrupt on-disk metadata when it really should be returning -EFSCORRUPTED instead of ASSERTing. Found by a xfs/391 crash while lastbit fuzzing of ltail.bestcount. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_dir2_leaf.c | 3 ++- fs/xfs/libxfs/xfs_dir2_node.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index d7e630f41f9c..d61d52da95a1 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -1415,7 +1415,8 @@ xfs_dir2_leaf_removename( oldbest = be16_to_cpu(bf[0].length); ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); bestsp = xfs_dir2_leaf_bests_p(ltp); - ASSERT(be16_to_cpu(bestsp[db]) == oldbest); + if (be16_to_cpu(bestsp[db]) != oldbest) + return -EFSCORRUPTED; /* * Mark the former data entry unused. */ diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 239d97a64296..0839ffe27e02 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -387,8 +387,9 @@ xfs_dir2_leaf_to_node( dp->d_ops->free_hdr_from_disk(&freehdr, free); leaf = lbp->b_addr; ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); - ASSERT(be32_to_cpu(ltp->bestcount) <= - (uint)dp->i_d.di_size / args->geo->blksize); + if (be32_to_cpu(ltp->bestcount) > + (uint)dp->i_d.di_size / args->geo->blksize) + return -EFSCORRUPTED; /* * Copy freespace entries from the leaf block to the new block. -- cgit v1.2.3 From 6231848c3aa5c7378fb0dd7a7fff53e9db7edd09 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 6 Mar 2018 17:08:31 -0800 Subject: xfs: check for cow blocks before trying to clear them There's no point in allocating a transaction and locking the inode in preparation to clear cow blocks if there actually are any cow fork extents. Therefore, move the xfs_reflink_cancel_cow_range hunk to xfs_inactive and check the cow ifp first. This makes inode reclamation run faster. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_inode.c | 5 +++++ fs/xfs/xfs_super.c | 9 --------- 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 604ee384a00a..50fbbf5f5a7a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1872,6 +1872,7 @@ xfs_inactive( xfs_inode_t *ip) { struct xfs_mount *mp; + struct xfs_ifork *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); int error; int truncate = 0; @@ -1892,6 +1893,10 @@ xfs_inactive( if (mp->m_flags & XFS_MOUNT_RDONLY) return; + /* Try to clean out the cow blocks if there are any. */ + if (xfs_is_reflink_inode(ip) && cow_ifp->if_bytes > 0) + xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); + if (VFS_I(ip)->i_nlink != 0) { /* * force is true because we are evicting an inode from the diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 45b6f014c358..951271f57d00 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -972,7 +972,6 @@ xfs_fs_destroy_inode( struct inode *inode) { struct xfs_inode *ip = XFS_I(inode); - int error; trace_xfs_destroy_inode(ip); @@ -980,14 +979,6 @@ xfs_fs_destroy_inode( XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); - if (xfs_is_reflink_inode(ip)) { - error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true); - if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) - xfs_warn(ip->i_mount, -"Error %d while evicting CoW blocks for inode %llu.", - error, ip->i_ino); - } - xfs_inactive(ip); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); -- cgit v1.2.3 From a78ee256c325ecfaec13cafc41b315bd4e1dd518 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Mar 2018 17:08:32 -0800 Subject: xfs: convert XFS_AGFL_SIZE to a helper function The AGFL size calculation is about to get more complex, so lets turn the macro into a function first and remove the macro. Signed-off-by: Dave Chinner [darrick: forward port to newer kernel, simplify the helper] Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_alloc.c | 31 ++++++++++++++++++++++++------- fs/xfs/libxfs/xfs_alloc.h | 2 ++ fs/xfs/libxfs/xfs_format.h | 13 +------------ fs/xfs/scrub/agheader.c | 6 +++--- fs/xfs/xfs_fsops.c | 2 +- 5 files changed, 31 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index c02781a4c091..36101e55ed07 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -53,6 +53,23 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); +/* + * Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in + * the beginning of the block for a proper header with the location information + * and CRC. + */ +unsigned int +xfs_agfl_size( + struct xfs_mount *mp) +{ + unsigned int size = mp->m_sb.sb_sectsize; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + size -= sizeof(struct xfs_agfl); + + return size / sizeof(xfs_agblock_t); +} + unsigned int xfs_refc_block( struct xfs_mount *mp) @@ -550,7 +567,7 @@ xfs_agfl_verify( if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) return __this_address; - for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { + for (i = 0; i < xfs_agfl_size(mp); i++) { if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) return __this_address; @@ -2266,7 +2283,7 @@ xfs_alloc_get_freelist( bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); - if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) + if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) agf->agf_flfirst = 0; pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); @@ -2377,7 +2394,7 @@ xfs_alloc_put_freelist( be32_to_cpu(agf->agf_seqno), &agflbp))) return error; be32_add_cpu(&agf->agf_fllast, 1); - if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) + if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) agf->agf_fllast = 0; pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); @@ -2395,7 +2412,7 @@ xfs_alloc_put_freelist( xfs_alloc_log_agf(tp, agbp, logflags); - ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); + ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; @@ -2428,9 +2445,9 @@ xfs_agf_verify( if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) + be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && + be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && + be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp))) return __this_address; if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 65a0cafe06e4..a311a2414a6b 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -26,6 +26,8 @@ struct xfs_trans; extern struct workqueue_struct *xfs_alloc_wq; +unsigned int xfs_agfl_size(struct xfs_mount *mp); + /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 1acb584fc5f7..42956d8d95ed 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -803,24 +803,13 @@ typedef struct xfs_agi { &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ (__be32 *)(bp)->b_addr) -/* - * Size of the AGFL. For CRC-enabled filesystes we steal a couple of - * slots in the beginning of the block for a proper header with the - * location information and CRC. - */ -#define XFS_AGFL_SIZE(mp) \ - (((mp)->m_sb.sb_sectsize - \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ - sizeof(struct xfs_agfl) : 0)) / \ - sizeof(xfs_agblock_t)) - typedef struct xfs_agfl { __be32 agfl_magicnum; __be32 agfl_seqno; uuid_t agfl_uuid; __be64 agfl_lsn; __be32 agfl_crc; - __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ + __be32 agfl_bno[]; /* actually xfs_agfl_size(mp) */ } __attribute__((packed)) xfs_agfl_t; #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 05c66e05ae20..018aabbd9394 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -80,7 +80,7 @@ xfs_scrub_walk_agfl( } /* first to the end */ - for (i = flfirst; i < XFS_AGFL_SIZE(mp); i++) { + for (i = flfirst; i < xfs_agfl_size(mp); i++) { error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); if (error) return error; @@ -664,7 +664,7 @@ xfs_scrub_agf( if (agfl_last > agfl_first) fl_count = agfl_last - agfl_first + 1; else - fl_count = XFS_AGFL_SIZE(mp) - agfl_first + agfl_last + 1; + fl_count = xfs_agfl_size(mp) - agfl_first + agfl_last + 1; if (agfl_count != 0 && fl_count != agfl_count) xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); @@ -791,7 +791,7 @@ xfs_scrub_agfl( /* Allocate buffer to ensure uniqueness of AGFL entries. */ agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); agflcount = be32_to_cpu(agf->agf_flcount); - if (agflcount > XFS_AGFL_SIZE(sc->mp)) { + if (agflcount > xfs_agfl_size(sc->mp)) { xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); goto out; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 8b4545623e25..523792768080 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -217,7 +217,7 @@ xfs_growfs_data_private( } agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); - for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) + for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++) agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); error = xfs_bwrite(bp); -- cgit v1.2.3 From ae23395d8858a0c91de978a60b317ec8468b2aba Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 Mar 2018 17:30:34 -0800 Subject: inode: don't memset the inode address space twice Noticed when looking at why cycling 600k inodes/s through the inode cache was taking a total of 8% cpu in memset() during inode initialisation. There is no need to zero the inode.i_data structure twice. This increases single threaded bulkstat throughput from ~200,000 inodes/s to ~220,000 inodes/s, so we save a substantial amount of CPU time per inode init by doing this. Signed-Off-By: Dave Chinner Reviewed-by: Jan Kara Reviewed-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/inode.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 6295f1415761..b153aeaa61ea 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -346,9 +346,8 @@ void inc_nlink(struct inode *inode) } EXPORT_SYMBOL(inc_nlink); -void address_space_init_once(struct address_space *mapping) +static void __address_space_init_once(struct address_space *mapping) { - memset(mapping, 0, sizeof(*mapping)); INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT); spin_lock_init(&mapping->tree_lock); init_rwsem(&mapping->i_mmap_rwsem); @@ -356,6 +355,12 @@ void address_space_init_once(struct address_space *mapping) spin_lock_init(&mapping->private_lock); mapping->i_mmap = RB_ROOT_CACHED; } + +void address_space_init_once(struct address_space *mapping) +{ + memset(mapping, 0, sizeof(*mapping)); + __address_space_init_once(mapping); +} EXPORT_SYMBOL(address_space_init_once); /* @@ -371,7 +376,7 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); - address_space_init_once(&inode->i_data); + __address_space_init_once(&inode->i_data); i_size_ordered_init(inode); } EXPORT_SYMBOL(inode_init_once); -- cgit v1.2.3 From 57e809561118a4db2e19d31282761ca062fd6014 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 7 Mar 2018 14:59:39 -0800 Subject: xfs: Rename xa_ elements to ail_ This is a simple rename, except that xa_ail becomes ail_head. Signed-off-by: Matthew Wilcox Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf_item.c | 10 ++-- fs/xfs/xfs_dquot.c | 4 +- fs/xfs/xfs_dquot_item.c | 11 ++-- fs/xfs/xfs_inode_item.c | 22 +++---- fs/xfs/xfs_log.c | 6 +- fs/xfs/xfs_log_recover.c | 80 ++++++++++++------------- fs/xfs/xfs_trans.c | 18 +++--- fs/xfs/xfs_trans_ail.c | 152 +++++++++++++++++++++++------------------------ fs/xfs/xfs_trans_buf.c | 4 +- fs/xfs/xfs_trans_priv.h | 42 ++++++------- 10 files changed, 175 insertions(+), 174 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 270ddb4d2313..82ad270e390e 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -460,7 +460,7 @@ xfs_buf_item_unpin( list_del_init(&bp->b_li_list); bp->b_iodone = NULL; } else { - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); ASSERT(bp->b_log_item == NULL); @@ -1057,12 +1057,12 @@ xfs_buf_do_callbacks_fail( lip = list_first_entry(&bp->b_li_list, struct xfs_log_item, li_bio_list); ailp = lip->li_ailp; - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { if (lip->li_ops->iop_error) lip->li_ops->iop_error(lip, bp); } - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); } static bool @@ -1226,7 +1226,7 @@ xfs_buf_iodone( * * Either way, AIL is useless if we're forcing a shutdown. */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } @@ -1246,7 +1246,7 @@ xfs_buf_resubmit_failed_buffers( /* * Clear XFS_LI_FAILED flag from all items before resubmit * - * XFS_LI_FAILED set/clear is protected by xa_lock, caller this + * XFS_LI_FAILED set/clear is protected by ail_lock, caller this * function already have it acquired */ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 2410acc900f0..a7daef9e16bf 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -918,7 +918,7 @@ xfs_qm_dqflush_done( (lip->li_flags & XFS_LI_FAILED))) { /* xfs_trans_ail_delete() drops the AIL lock. */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); if (lip->li_lsn == qip->qli_flush_lsn) { xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); } else { @@ -928,7 +928,7 @@ xfs_qm_dqflush_done( */ if (lip->li_flags & XFS_LI_FAILED) xfs_clear_li_failed(lip); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); } } diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 96eaa6933709..4b331e354da7 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -157,8 +157,9 @@ xfs_dquot_item_error( STATIC uint xfs_qm_dquot_logitem_push( struct xfs_log_item *lip, - struct list_head *buffer_list) __releases(&lip->li_ailp->xa_lock) - __acquires(&lip->li_ailp->xa_lock) + struct list_head *buffer_list) + __releases(&lip->li_ailp->ail_lock) + __acquires(&lip->li_ailp->ail_lock) { struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; struct xfs_buf *bp = lip->li_buf; @@ -205,7 +206,7 @@ xfs_qm_dquot_logitem_push( goto out_unlock; } - spin_unlock(&lip->li_ailp->xa_lock); + spin_unlock(&lip->li_ailp->ail_lock); error = xfs_qm_dqflush(dqp, &bp); if (error) { @@ -217,7 +218,7 @@ xfs_qm_dquot_logitem_push( xfs_buf_relse(bp); } - spin_lock(&lip->li_ailp->xa_lock); + spin_lock(&lip->li_ailp->ail_lock); out_unlock: xfs_dqunlock(dqp); return rval; @@ -400,7 +401,7 @@ xfs_qm_qoffend_logitem_committed( * Delete the qoff-start logitem from the AIL. * xfs_trans_ail_delete() drops the AIL lock. */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); kmem_free(qfs->qql_item.li_lv_shadow); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d5037f060d6f..7666eae8844f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -502,8 +502,8 @@ STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, struct list_head *buffer_list) - __releases(&lip->li_ailp->xa_lock) - __acquires(&lip->li_ailp->xa_lock) + __releases(&lip->li_ailp->ail_lock) + __acquires(&lip->li_ailp->ail_lock) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; @@ -562,7 +562,7 @@ xfs_inode_item_push( ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); - spin_unlock(&lip->li_ailp->xa_lock); + spin_unlock(&lip->li_ailp->ail_lock); error = xfs_iflush(ip, &bp); if (!error) { @@ -571,7 +571,7 @@ xfs_inode_item_push( xfs_buf_relse(bp); } - spin_lock(&lip->li_ailp->xa_lock); + spin_lock(&lip->li_ailp->ail_lock); out_unlock: xfs_iunlock(ip, XFS_ILOCK_SHARED); return rval; @@ -759,7 +759,7 @@ xfs_iflush_done( bool mlip_changed = false; /* this is an opencoded batch version of xfs_trans_ail_delete */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); list_for_each_entry(blip, &tmp, li_bio_list) { if (INODE_ITEM(blip)->ili_logged && blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) @@ -770,15 +770,15 @@ xfs_iflush_done( } if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) - xlog_assign_tail_lsn_locked(ailp->xa_mount); - if (list_empty(&ailp->xa_ail)) - wake_up_all(&ailp->xa_empty); + if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) + xlog_assign_tail_lsn_locked(ailp->ail_mount); + if (list_empty(&ailp->ail_head)) + wake_up_all(&ailp->ail_empty); } - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); if (mlip_changed) - xfs_log_space_wake(ailp->xa_mount); + xfs_log_space_wake(ailp->ail_mount); } /* diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3e5ba1ecc080..2f529fd7bf67 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1149,7 +1149,7 @@ xlog_assign_tail_lsn_locked( struct xfs_log_item *lip; xfs_lsn_t tail_lsn; - assert_spin_locked(&mp->m_ail->xa_lock); + assert_spin_locked(&mp->m_ail->ail_lock); /* * To make sure we always have a valid LSN for the log tail we keep @@ -1172,9 +1172,9 @@ xlog_assign_tail_lsn( { xfs_lsn_t tail_lsn; - spin_lock(&mp->m_ail->xa_lock); + spin_lock(&mp->m_ail->ail_lock); tail_lsn = xlog_assign_tail_lsn_locked(mp); - spin_unlock(&mp->m_ail->xa_lock); + spin_unlock(&mp->m_ail->ail_lock); return tail_lsn; } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 93885f57978a..59134f626927 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3427,7 +3427,7 @@ xlog_recover_efi_pass2( } atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); - spin_lock(&log->l_ailp->xa_lock); + spin_lock(&log->l_ailp->ail_lock); /* * The EFI has two references. One for the EFD and one for EFI to ensure * it makes it into the AIL. Insert the EFI into the AIL directly and @@ -3470,7 +3470,7 @@ xlog_recover_efd_pass2( * Search for the EFI with the id in the EFD format structure in the * AIL. */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { if (lip->li_type == XFS_LI_EFI) { @@ -3480,9 +3480,9 @@ xlog_recover_efd_pass2( * Drop the EFD reference to the EFI. This * removes the EFI from the AIL and frees it. */ - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_efi_release(efip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); break; } } @@ -3490,7 +3490,7 @@ xlog_recover_efd_pass2( } xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); return 0; } @@ -3523,7 +3523,7 @@ xlog_recover_rui_pass2( } atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents); - spin_lock(&log->l_ailp->xa_lock); + spin_lock(&log->l_ailp->ail_lock); /* * The RUI has two references. One for the RUD and one for RUI to ensure * it makes it into the AIL. Insert the RUI into the AIL directly and @@ -3563,7 +3563,7 @@ xlog_recover_rud_pass2( * Search for the RUI with the id in the RUD format structure in the * AIL. */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { if (lip->li_type == XFS_LI_RUI) { @@ -3573,9 +3573,9 @@ xlog_recover_rud_pass2( * Drop the RUD reference to the RUI. This * removes the RUI from the AIL and frees it. */ - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_rui_release(ruip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); break; } } @@ -3583,7 +3583,7 @@ xlog_recover_rud_pass2( } xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); return 0; } @@ -3639,7 +3639,7 @@ xlog_recover_cui_pass2( } atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents); - spin_lock(&log->l_ailp->xa_lock); + spin_lock(&log->l_ailp->ail_lock); /* * The CUI has two references. One for the CUD and one for CUI to ensure * it makes it into the AIL. Insert the CUI into the AIL directly and @@ -3680,7 +3680,7 @@ xlog_recover_cud_pass2( * Search for the CUI with the id in the CUD format structure in the * AIL. */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { if (lip->li_type == XFS_LI_CUI) { @@ -3690,9 +3690,9 @@ xlog_recover_cud_pass2( * Drop the CUD reference to the CUI. This * removes the CUI from the AIL and frees it. */ - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_cui_release(cuip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); break; } } @@ -3700,7 +3700,7 @@ xlog_recover_cud_pass2( } xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); return 0; } @@ -3758,7 +3758,7 @@ xlog_recover_bui_pass2( } atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents); - spin_lock(&log->l_ailp->xa_lock); + spin_lock(&log->l_ailp->ail_lock); /* * The RUI has two references. One for the RUD and one for RUI to ensure * it makes it into the AIL. Insert the RUI into the AIL directly and @@ -3799,7 +3799,7 @@ xlog_recover_bud_pass2( * Search for the BUI with the id in the BUD format structure in the * AIL. */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { if (lip->li_type == XFS_LI_BUI) { @@ -3809,9 +3809,9 @@ xlog_recover_bud_pass2( * Drop the BUD reference to the BUI. This * removes the BUI from the AIL and frees it. */ - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_bui_release(buip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); break; } } @@ -3819,7 +3819,7 @@ xlog_recover_bud_pass2( } xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); return 0; } @@ -4652,9 +4652,9 @@ xlog_recover_process_efi( if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) return 0; - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); error = xfs_efi_recover(mp, efip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); return error; } @@ -4670,9 +4670,9 @@ xlog_recover_cancel_efi( efip = container_of(lip, struct xfs_efi_log_item, efi_item); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_efi_release(efip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); } /* Recover the RUI if necessary. */ @@ -4692,9 +4692,9 @@ xlog_recover_process_rui( if (test_bit(XFS_RUI_RECOVERED, &ruip->rui_flags)) return 0; - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); error = xfs_rui_recover(mp, ruip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); return error; } @@ -4710,9 +4710,9 @@ xlog_recover_cancel_rui( ruip = container_of(lip, struct xfs_rui_log_item, rui_item); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_rui_release(ruip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); } /* Recover the CUI if necessary. */ @@ -4733,9 +4733,9 @@ xlog_recover_process_cui( if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)) return 0; - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); error = xfs_cui_recover(mp, cuip, dfops); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); return error; } @@ -4751,9 +4751,9 @@ xlog_recover_cancel_cui( cuip = container_of(lip, struct xfs_cui_log_item, cui_item); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_cui_release(cuip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); } /* Recover the BUI if necessary. */ @@ -4774,9 +4774,9 @@ xlog_recover_process_bui( if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)) return 0; - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); error = xfs_bui_recover(mp, buip, dfops); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); return error; } @@ -4792,9 +4792,9 @@ xlog_recover_cancel_bui( buip = container_of(lip, struct xfs_bui_log_item, bui_item); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); xfs_bui_release(buip); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); } /* Is this log item a deferred action intent? */ @@ -4882,7 +4882,7 @@ xlog_recover_process_intents( #endif ailp = log->l_ailp; - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); #if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); @@ -4936,7 +4936,7 @@ xlog_recover_process_intents( } out: xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); if (error) xfs_defer_cancel(&dfops); else @@ -4959,7 +4959,7 @@ xlog_recover_cancel_intents( struct xfs_ail *ailp; ailp = log->l_ailp; - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { /* @@ -4993,7 +4993,7 @@ xlog_recover_cancel_intents( } xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); return error; } diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 86f92df32c42..ec6b01834705 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -803,8 +803,8 @@ xfs_log_item_batch_insert( { int i; - spin_lock(&ailp->xa_lock); - /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ + spin_lock(&ailp->ail_lock); + /* xfs_trans_ail_update_bulk drops ailp->ail_lock */ xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); for (i = 0; i < nr_items; i++) { @@ -847,9 +847,9 @@ xfs_trans_committed_bulk( struct xfs_ail_cursor cur; int i = 0; - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); /* unpin all the log items */ for (lv = log_vector; lv; lv = lv->lv_next ) { @@ -869,7 +869,7 @@ xfs_trans_committed_bulk( * object into the AIL as we are in a shutdown situation. */ if (aborted) { - ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount)); + ASSERT(XFS_FORCED_SHUTDOWN(ailp->ail_mount)); lip->li_ops->iop_unpin(lip, 1); continue; } @@ -883,11 +883,11 @@ xfs_trans_committed_bulk( * not affect the AIL cursor the bulk insert path is * using. */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) xfs_trans_ail_update(ailp, lip, item_lsn); else - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); lip->li_ops->iop_unpin(lip, 0); continue; } @@ -905,9 +905,9 @@ xfs_trans_committed_bulk( if (i) xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); } /* diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index cef89f7127d3..d4a2445215e6 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -40,7 +40,7 @@ xfs_ail_check( { xfs_log_item_t *prev_lip; - if (list_empty(&ailp->xa_ail)) + if (list_empty(&ailp->ail_head)) return; /* @@ -48,11 +48,11 @@ xfs_ail_check( */ ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); - if (&prev_lip->li_ail != &ailp->xa_ail) + if (&prev_lip->li_ail != &ailp->ail_head) ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); - if (&prev_lip->li_ail != &ailp->xa_ail) + if (&prev_lip->li_ail != &ailp->ail_head) ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); @@ -69,10 +69,10 @@ static xfs_log_item_t * xfs_ail_max( struct xfs_ail *ailp) { - if (list_empty(&ailp->xa_ail)) + if (list_empty(&ailp->ail_head)) return NULL; - return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail); + return list_entry(ailp->ail_head.prev, xfs_log_item_t, li_ail); } /* @@ -84,7 +84,7 @@ xfs_ail_next( struct xfs_ail *ailp, xfs_log_item_t *lip) { - if (lip->li_ail.next == &ailp->xa_ail) + if (lip->li_ail.next == &ailp->ail_head) return NULL; return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); @@ -105,11 +105,11 @@ xfs_ail_min_lsn( xfs_lsn_t lsn = 0; xfs_log_item_t *lip; - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_ail_min(ailp); if (lip) lsn = lip->li_lsn; - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); return lsn; } @@ -124,11 +124,11 @@ xfs_ail_max_lsn( xfs_lsn_t lsn = 0; xfs_log_item_t *lip; - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); lip = xfs_ail_max(ailp); if (lip) lsn = lip->li_lsn; - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); return lsn; } @@ -146,7 +146,7 @@ xfs_trans_ail_cursor_init( struct xfs_ail_cursor *cur) { cur->item = NULL; - list_add_tail(&cur->list, &ailp->xa_cursors); + list_add_tail(&cur->list, &ailp->ail_cursors); } /* @@ -194,7 +194,7 @@ xfs_trans_ail_cursor_clear( { struct xfs_ail_cursor *cur; - list_for_each_entry(cur, &ailp->xa_cursors, list) { + list_for_each_entry(cur, &ailp->ail_cursors, list) { if (cur->item == lip) cur->item = (struct xfs_log_item *) ((uintptr_t)cur->item | 1); @@ -222,7 +222,7 @@ xfs_trans_ail_cursor_first( goto out; } - list_for_each_entry(lip, &ailp->xa_ail, li_ail) { + list_for_each_entry(lip, &ailp->ail_head, li_ail) { if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) goto out; } @@ -241,7 +241,7 @@ __xfs_trans_ail_cursor_last( { xfs_log_item_t *lip; - list_for_each_entry_reverse(lip, &ailp->xa_ail, li_ail) { + list_for_each_entry_reverse(lip, &ailp->ail_head, li_ail) { if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0) return lip; } @@ -310,7 +310,7 @@ xfs_ail_splice( if (lip) list_splice(list, &lip->li_ail); else - list_splice(list, &ailp->xa_ail); + list_splice(list, &ailp->ail_head); } /* @@ -335,17 +335,17 @@ xfsaild_push_item( * If log item pinning is enabled, skip the push and track the item as * pinned. This can help induce head-behind-tail conditions. */ - if (XFS_TEST_ERROR(false, ailp->xa_mount, XFS_ERRTAG_LOG_ITEM_PIN)) + if (XFS_TEST_ERROR(false, ailp->ail_mount, XFS_ERRTAG_LOG_ITEM_PIN)) return XFS_ITEM_PINNED; - return lip->li_ops->iop_push(lip, &ailp->xa_buf_list); + return lip->li_ops->iop_push(lip, &ailp->ail_buf_list); } static long xfsaild_push( struct xfs_ail *ailp) { - xfs_mount_t *mp = ailp->xa_mount; + xfs_mount_t *mp = ailp->ail_mount; struct xfs_ail_cursor cur; xfs_log_item_t *lip; xfs_lsn_t lsn; @@ -360,30 +360,30 @@ xfsaild_push( * buffers the last time we ran, force the log first and wait for it * before pushing again. */ - if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 && - (!list_empty_careful(&ailp->xa_buf_list) || + if (ailp->ail_log_flush && ailp->ail_last_pushed_lsn == 0 && + (!list_empty_careful(&ailp->ail_buf_list) || xfs_ail_min_lsn(ailp))) { - ailp->xa_log_flush = 0; + ailp->ail_log_flush = 0; XFS_STATS_INC(mp, xs_push_ail_flush); xfs_log_force(mp, XFS_LOG_SYNC); } - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); - /* barrier matches the xa_target update in xfs_ail_push() */ + /* barrier matches the ail_target update in xfs_ail_push() */ smp_rmb(); - target = ailp->xa_target; - ailp->xa_target_prev = target; + target = ailp->ail_target; + ailp->ail_target_prev = target; - lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); + lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn); if (!lip) { /* * If the AIL is empty or our push has reached the end we are * done now. */ xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); goto out_done; } @@ -404,7 +404,7 @@ xfsaild_push( XFS_STATS_INC(mp, xs_push_ail_success); trace_xfs_ail_push(lip); - ailp->xa_last_pushed_lsn = lsn; + ailp->ail_last_pushed_lsn = lsn; break; case XFS_ITEM_FLUSHING: @@ -423,7 +423,7 @@ xfsaild_push( trace_xfs_ail_flushing(lip); flushing++; - ailp->xa_last_pushed_lsn = lsn; + ailp->ail_last_pushed_lsn = lsn; break; case XFS_ITEM_PINNED: @@ -431,7 +431,7 @@ xfsaild_push( trace_xfs_ail_pinned(lip); stuck++; - ailp->xa_log_flush++; + ailp->ail_log_flush++; break; case XFS_ITEM_LOCKED: XFS_STATS_INC(mp, xs_push_ail_locked); @@ -468,10 +468,10 @@ xfsaild_push( lsn = lip->li_lsn; } xfs_trans_ail_cursor_done(&cur); - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); - if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list)) - ailp->xa_log_flush++; + if (xfs_buf_delwri_submit_nowait(&ailp->ail_buf_list)) + ailp->ail_log_flush++; if (!count || XFS_LSN_CMP(lsn, target) >= 0) { out_done: @@ -481,7 +481,7 @@ out_done: * AIL before we start the next scan from the start of the AIL. */ tout = 50; - ailp->xa_last_pushed_lsn = 0; + ailp->ail_last_pushed_lsn = 0; } else if (((stuck + flushing) * 100) / count > 90) { /* * Either there is a lot of contention on the AIL or we are @@ -494,7 +494,7 @@ out_done: * the restart to issue a log force to unpin the stuck items. */ tout = 20; - ailp->xa_last_pushed_lsn = 0; + ailp->ail_last_pushed_lsn = 0; } else { /* * Assume we have more work to do in a short while. @@ -536,26 +536,26 @@ xfsaild( break; } - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); /* * Idle if the AIL is empty and we are not racing with a target * update. We check the AIL after we set the task to a sleep - * state to guarantee that we either catch an xa_target update + * state to guarantee that we either catch an ail_target update * or that a wake_up resets the state to TASK_RUNNING. * Otherwise, we run the risk of sleeping indefinitely. * - * The barrier matches the xa_target update in xfs_ail_push(). + * The barrier matches the ail_target update in xfs_ail_push(). */ smp_rmb(); if (!xfs_ail_min(ailp) && - ailp->xa_target == ailp->xa_target_prev) { - spin_unlock(&ailp->xa_lock); + ailp->ail_target == ailp->ail_target_prev) { + spin_unlock(&ailp->ail_lock); freezable_schedule(); tout = 0; continue; } - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); if (tout) freezable_schedule_timeout(msecs_to_jiffies(tout)); @@ -592,8 +592,8 @@ xfs_ail_push( xfs_log_item_t *lip; lip = xfs_ail_min(ailp); - if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) || - XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0) + if (!lip || XFS_FORCED_SHUTDOWN(ailp->ail_mount) || + XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0) return; /* @@ -601,10 +601,10 @@ xfs_ail_push( * the XFS_AIL_PUSHING_BIT. */ smp_wmb(); - xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); + xfs_trans_ail_copy_lsn(ailp, &ailp->ail_target, &threshold_lsn); smp_wmb(); - wake_up_process(ailp->xa_task); + wake_up_process(ailp->ail_task); } /* @@ -630,18 +630,18 @@ xfs_ail_push_all_sync( struct xfs_log_item *lip; DEFINE_WAIT(wait); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); while ((lip = xfs_ail_max(ailp)) != NULL) { - prepare_to_wait(&ailp->xa_empty, &wait, TASK_UNINTERRUPTIBLE); - ailp->xa_target = lip->li_lsn; - wake_up_process(ailp->xa_task); - spin_unlock(&ailp->xa_lock); + prepare_to_wait(&ailp->ail_empty, &wait, TASK_UNINTERRUPTIBLE); + ailp->ail_target = lip->li_lsn; + wake_up_process(ailp->ail_task); + spin_unlock(&ailp->ail_lock); schedule(); - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); } - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); - finish_wait(&ailp->xa_empty, &wait); + finish_wait(&ailp->ail_empty, &wait); } /* @@ -672,7 +672,7 @@ xfs_trans_ail_update_bulk( struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, - xfs_lsn_t lsn) __releases(ailp->xa_lock) + xfs_lsn_t lsn) __releases(ailp->ail_lock) { xfs_log_item_t *mlip; int mlip_changed = 0; @@ -705,13 +705,13 @@ xfs_trans_ail_update_bulk( xfs_ail_splice(ailp, cur, &tmp, lsn); if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) - xlog_assign_tail_lsn_locked(ailp->xa_mount); - spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount)) + xlog_assign_tail_lsn_locked(ailp->ail_mount); + spin_unlock(&ailp->ail_lock); - xfs_log_space_wake(ailp->xa_mount); + xfs_log_space_wake(ailp->ail_mount); } else { - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); } } @@ -756,13 +756,13 @@ void xfs_trans_ail_delete( struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->xa_lock) + int shutdown_type) __releases(ailp->ail_lock) { - struct xfs_mount *mp = ailp->xa_mount; + struct xfs_mount *mp = ailp->ail_mount; bool mlip_changed; if (!(lip->li_flags & XFS_LI_IN_AIL)) { - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); if (!XFS_FORCED_SHUTDOWN(mp)) { xfs_alert_tag(mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", @@ -776,13 +776,13 @@ xfs_trans_ail_delete( if (mlip_changed) { if (!XFS_FORCED_SHUTDOWN(mp)) xlog_assign_tail_lsn_locked(mp); - if (list_empty(&ailp->xa_ail)) - wake_up_all(&ailp->xa_empty); + if (list_empty(&ailp->ail_head)) + wake_up_all(&ailp->ail_empty); } - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); if (mlip_changed) - xfs_log_space_wake(ailp->xa_mount); + xfs_log_space_wake(ailp->ail_mount); } int @@ -795,16 +795,16 @@ xfs_trans_ail_init( if (!ailp) return -ENOMEM; - ailp->xa_mount = mp; - INIT_LIST_HEAD(&ailp->xa_ail); - INIT_LIST_HEAD(&ailp->xa_cursors); - spin_lock_init(&ailp->xa_lock); - INIT_LIST_HEAD(&ailp->xa_buf_list); - init_waitqueue_head(&ailp->xa_empty); + ailp->ail_mount = mp; + INIT_LIST_HEAD(&ailp->ail_head); + INIT_LIST_HEAD(&ailp->ail_cursors); + spin_lock_init(&ailp->ail_lock); + INIT_LIST_HEAD(&ailp->ail_buf_list); + init_waitqueue_head(&ailp->ail_empty); - ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", - ailp->xa_mount->m_fsname); - if (IS_ERR(ailp->xa_task)) + ailp->ail_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->ail_mount->m_fsname); + if (IS_ERR(ailp->ail_task)) goto out_free_ailp; mp->m_ail = ailp; @@ -821,6 +821,6 @@ xfs_trans_ail_destroy( { struct xfs_ail *ailp = mp->m_ail; - kthread_stop(ailp->xa_task); + kthread_stop(ailp->ail_task); kmem_free(ailp); } diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 653ce379d36b..a5d9dfc45d98 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -431,8 +431,8 @@ xfs_trans_brelse( * If the fs has shutdown and we dropped the last reference, it may fall * on us to release a (possibly dirty) bli if it never made it to the * AIL (e.g., the aborted unpin already happened and didn't release it - * due to our reference). Since we're already shutdown and need xa_lock, - * just force remove from the AIL and release the bli here. + * due to our reference). Since we're already shutdown and need + * ail_lock, just force remove from the AIL and release the bli here. */ if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index b317a3644c00..be24b0c8a332 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -65,17 +65,17 @@ struct xfs_ail_cursor { * Eventually we need to drive the locking in here as well. */ struct xfs_ail { - struct xfs_mount *xa_mount; - struct task_struct *xa_task; - struct list_head xa_ail; - xfs_lsn_t xa_target; - xfs_lsn_t xa_target_prev; - struct list_head xa_cursors; - spinlock_t xa_lock; - xfs_lsn_t xa_last_pushed_lsn; - int xa_log_flush; - struct list_head xa_buf_list; - wait_queue_head_t xa_empty; + struct xfs_mount *ail_mount; + struct task_struct *ail_task; + struct list_head ail_head; + xfs_lsn_t ail_target; + xfs_lsn_t ail_target_prev; + struct list_head ail_cursors; + spinlock_t ail_lock; + xfs_lsn_t ail_last_pushed_lsn; + int ail_log_flush; + struct list_head ail_buf_list; + wait_queue_head_t ail_empty; }; /* @@ -84,7 +84,7 @@ struct xfs_ail { void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, - xfs_lsn_t lsn) __releases(ailp->xa_lock); + xfs_lsn_t lsn) __releases(ailp->ail_lock); /* * Return a pointer to the first item in the AIL. If the AIL is empty, then * return NULL. @@ -93,7 +93,7 @@ static inline struct xfs_log_item * xfs_ail_min( struct xfs_ail *ailp) { - return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item, + return list_first_entry_or_null(&ailp->ail_head, struct xfs_log_item, li_ail); } @@ -101,14 +101,14 @@ static inline void xfs_trans_ail_update( struct xfs_ail *ailp, struct xfs_log_item *lip, - xfs_lsn_t lsn) __releases(ailp->xa_lock) + xfs_lsn_t lsn) __releases(ailp->ail_lock) { xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, - int shutdown_type) __releases(ailp->xa_lock); + int shutdown_type) __releases(ailp->ail_lock); static inline void xfs_trans_ail_remove( @@ -117,12 +117,12 @@ xfs_trans_ail_remove( { struct xfs_ail *ailp = lip->li_ailp; - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); /* xfs_trans_ail_delete() drops the AIL lock */ if (lip->li_flags & XFS_LI_IN_AIL) xfs_trans_ail_delete(ailp, lip, shutdown_type); else - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); } void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); @@ -149,9 +149,9 @@ xfs_trans_ail_copy_lsn( xfs_lsn_t *src) { ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ - spin_lock(&ailp->xa_lock); + spin_lock(&ailp->ail_lock); *dst = *src; - spin_unlock(&ailp->xa_lock); + spin_unlock(&ailp->ail_lock); } #else static inline void @@ -172,7 +172,7 @@ xfs_clear_li_failed( struct xfs_buf *bp = lip->li_buf; ASSERT(lip->li_flags & XFS_LI_IN_AIL); - lockdep_assert_held(&lip->li_ailp->xa_lock); + lockdep_assert_held(&lip->li_ailp->ail_lock); if (lip->li_flags & XFS_LI_FAILED) { lip->li_flags &= ~XFS_LI_FAILED; @@ -186,7 +186,7 @@ xfs_set_li_failed( struct xfs_log_item *lip, struct xfs_buf *bp) { - lockdep_assert_held(&lip->li_ailp->xa_lock); + lockdep_assert_held(&lip->li_ailp->ail_lock); if (!(lip->li_flags & XFS_LI_FAILED)) { xfs_buf_hold(bp); -- cgit v1.2.3 From 3e78b9a4689f722538d4d3457fc10c4bf1d9cf6a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 9 Mar 2018 14:01:58 -0800 Subject: xfs: shutdown if block allocation overruns tx reservation The ->t_blk_res_used field tracks how many blocks have been used in the current transaction. This should never exceed the block reservation (->t_blk_res) for a particular transaction. We currently assert this condition in the transaction block accounting code, but otherwise take no additional action should this situation occur. The overrun generally has no effect if space ends up being available and the associated transaction commits. If the transaction is duplicated, however, the current block usage is used to determine the remaining block reservation to be transferred to the new transaction. If usage exceeds reservation, this calculation underflows and creates a transaction with an invalid and excessive reservation. When the second transaction commits, the release of unused blocks corrupts the in-core free space counters. With lazy superblock accounting enabled, this inconsistency eventually trickles to the on-disk superblock and corrupts the filesystem. Replace the transaction block usage accounting assert with an explicit overrun check. If the transaction overruns the reservation, shutdown the filesystem immediately to prevent corruption. Add a new assert to xfs_trans_dup() to catch any callers that might induce this invalid state in the future. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_trans.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index ec6b01834705..2ad08ec2fc21 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -119,8 +119,11 @@ xfs_trans_dup( /* We gave our writer reference to the new transaction */ tp->t_flags |= XFS_TRANS_NO_WRITECOUNT; ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); + + ASSERT(tp->t_blk_res >= tp->t_blk_res_used); ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; tp->t_blk_res = tp->t_blk_res_used; + ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; tp->t_rtx_res = tp->t_rtx_res_used; ntp->t_pflags = tp->t_pflags; @@ -344,13 +347,14 @@ xfs_trans_mod_sb( break; case XFS_TRANS_SB_FDBLOCKS: /* - * Track the number of blocks allocated in the - * transaction. Make sure it does not exceed the - * number reserved. + * Track the number of blocks allocated in the transaction. + * Make sure it does not exceed the number reserved. If so, + * shutdown as this can lead to accounting inconsistency. */ if (delta < 0) { tp->t_blk_res_used += (uint)-delta; - ASSERT(tp->t_blk_res_used <= tp->t_blk_res); + if (tp->t_blk_res_used > tp->t_blk_res) + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } tp->t_fdblocks_delta += delta; if (xfs_sb_version_haslazysbcount(&mp->m_sb)) -- cgit v1.2.3 From b3fed434822d0873269231f77e23fc801da32569 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 9 Mar 2018 14:01:58 -0800 Subject: xfs: account format bouncing into rmapbt swapext tx reservation The extent swap mechanism requires a unique implementation for rmapbt enabled filesystems. Because the rmapbt tracks extent owner information, extent swap must individually unmap and remap each extent between the two inodes. The rmapbt extent swap transaction block reservation currently accounts for the worst case bmapbt block and rmapbt block consumption based on the extent count of each inode. There is a corner case that exists due to the extent swap implementation that is not covered by this reservation, however. If one of the associated inodes is just over the max extent count used for extent format inodes (i.e., the inode is in btree format by a single extent), the unmap/remap cycle of the extent swap can bounce the inode between extent and btree format multiple times, almost as many times as there are extents in the inode (if the opposing inode happens to have one less, for example). Each back and forth cycle involves a block free and allocation, which isn't a problem except for that the initial transaction reservation must account for the total number of block allocations performed by the chain of deferred operations. If not, a block reservation overrun occurs and the filesystem shuts down. Update the rmapbt extent swap block reservation to check for this situation and add some block reservation slop to ensure the entire operation succeeds. We'd never likely require reservation for both inodes as fsr wouldn't defrag the file in that case, but the additional reservation is constrained by the data fork size so be cautious and check for both. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index c83f549dc17b..e0a442f504e5 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1899,17 +1899,28 @@ xfs_swap_extents( * performed with log redo items! */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + int w = XFS_DATA_FORK; + uint32_t ipnext = XFS_IFORK_NEXTENTS(ip, w); + uint32_t tipnext = XFS_IFORK_NEXTENTS(tip, w); + + /* + * Conceptually this shouldn't affect the shape of either bmbt, + * but since we atomically move extents one by one, we reserve + * enough space to rebuild both trees. + */ + resblks = XFS_SWAP_RMAP_SPACE_RES(mp, ipnext, w); + resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w); + /* - * Conceptually this shouldn't affect the shape of either - * bmbt, but since we atomically move extents one by one, - * we reserve enough space to rebuild both trees. + * Handle the corner case where either inode might straddle the + * btree format boundary. If so, the inode could bounce between + * btree <-> extent format on unmap -> remap cycles, freeing and + * allocating a bmapbt block each time. */ - resblks = XFS_SWAP_RMAP_SPACE_RES(mp, - XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK), - XFS_DATA_FORK) + - XFS_SWAP_RMAP_SPACE_RES(mp, - XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), - XFS_DATA_FORK); + if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1)) + resblks += XFS_IFORK_MAXEXT(ip, w); + if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1)) + resblks += XFS_IFORK_MAXEXT(tip, w); } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) -- cgit v1.2.3 From 215928633502a7296fec42614463bb49859787d6 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 9 Mar 2018 14:01:59 -0800 Subject: xfs: rename agfl perag res type to rmapbt The AGFL perag reservation type accounts all allocations that feed into (or are released from) the allocation group free list (agfl). The purpose of the reservation is to support worst case conditions for the reverse mapping btree (rmapbt). As such, the agfl reservation usage accounting only considers rmapbt usage when the in-core counters are initialized at mount time. This implementation inconsistency leads to divergence of the in-core and on-disk usage accounting over time. In preparation to resolve this inconsistency and adjust the AGFL reservation into an rmapbt specific reservation, rename the AGFL reservation type and associated accounting fields to something more rmapbt-specific. Also fix up a couple tracepoints that incorrectly use the AGFL reservation type to pass the agfl state of the associated extent where the raw reservation type is expected. Note that this patch does not change perag reservation behavior. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag_resv.c | 35 ++++++++++++++++++----------------- fs/xfs/libxfs/xfs_alloc.c | 20 +++++++++----------- fs/xfs/xfs_mount.h | 10 +++++----- fs/xfs/xfs_reflink.c | 2 +- 4 files changed, 33 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 2291f4224e24..0ca2e680034a 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -95,13 +95,13 @@ xfs_ag_resv_critical( switch (type) { case XFS_AG_RESV_METADATA: - avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved; + avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved; orig = pag->pag_meta_resv.ar_asked; break; - case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_RMAPBT: avail = pag->pagf_freeblks + pag->pagf_flcount - pag->pag_meta_resv.ar_reserved; - orig = pag->pag_agfl_resv.ar_asked; + orig = pag->pag_rmapbt_resv.ar_asked; break; default: ASSERT(0); @@ -126,10 +126,10 @@ xfs_ag_resv_needed( { xfs_extlen_t len; - len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved; + len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved; switch (type) { case XFS_AG_RESV_METADATA: - case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_RMAPBT: len -= xfs_perag_resv(pag, type)->ar_reserved; break; case XFS_AG_RESV_NONE: @@ -160,10 +160,11 @@ __xfs_ag_resv_free( if (pag->pag_agno == 0) pag->pag_mount->m_ag_max_usable += resv->ar_asked; /* - * AGFL blocks are always considered "free", so whatever - * was reserved at mount time must be given back at umount. + * RMAPBT blocks come from the AGFL and AGFL blocks are always + * considered "free", so whatever was reserved at mount time must be + * given back at umount. */ - if (type == XFS_AG_RESV_AGFL) + if (type == XFS_AG_RESV_RMAPBT) oldresv = resv->ar_orig_reserved; else oldresv = resv->ar_reserved; @@ -185,7 +186,7 @@ xfs_ag_resv_free( int error; int err2; - error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL); + error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT); err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA); if (err2 && !error) error = err2; @@ -284,15 +285,15 @@ xfs_ag_resv_init( } } - /* Create the AGFL metadata reservation */ - if (pag->pag_agfl_resv.ar_asked == 0) { + /* Create the RMAPBT metadata reservation */ + if (pag->pag_rmapbt_resv.ar_asked == 0) { ask = used = 0; error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; - error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used); + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used); if (error) goto out; } @@ -304,7 +305,7 @@ xfs_ag_resv_init( return error; ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + - xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <= + xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <= pag->pagf_freeblks + pag->pagf_flcount); #endif out: @@ -326,7 +327,7 @@ xfs_ag_resv_alloc_extent( switch (type) { case XFS_AG_RESV_METADATA: - case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_RMAPBT: resv = xfs_perag_resv(pag, type); break; default: @@ -341,7 +342,7 @@ xfs_ag_resv_alloc_extent( len = min_t(xfs_extlen_t, args->len, resv->ar_reserved); resv->ar_reserved -= len; - if (type == XFS_AG_RESV_AGFL) + if (type == XFS_AG_RESV_RMAPBT) return; /* Allocations of reserved blocks only need on-disk sb updates... */ xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len); @@ -366,7 +367,7 @@ xfs_ag_resv_free_extent( switch (type) { case XFS_AG_RESV_METADATA: - case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_RMAPBT: resv = xfs_perag_resv(pag, type); break; default: @@ -379,7 +380,7 @@ xfs_ag_resv_free_extent( leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved); resv->ar_reserved += leftover; - if (type == XFS_AG_RESV_AGFL) + if (type == XFS_AG_RESV_RMAPBT) return; /* Freeing into the reserved pool only requires on-disk update... */ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 36101e55ed07..1dc244d15a75 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -728,7 +728,7 @@ xfs_alloc_ag_vextent( ASSERT(args->len >= args->minlen); ASSERT(args->len <= args->maxlen); - ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL); + ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_RMAPBT); ASSERT(args->agbno % args->alignment == 0); /* if not file data, insert new block into the reverse map btree */ @@ -1600,7 +1600,7 @@ xfs_alloc_ag_vextent_small( * freelist. */ else if (args->minlen == 1 && args->alignment == 1 && - args->resv != XFS_AG_RESV_AGFL && + args->resv != XFS_AG_RESV_RMAPBT && (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) > args->minleft)) { error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); @@ -1634,7 +1634,7 @@ xfs_alloc_ag_vextent_small( * If we're feeding an AGFL block to something that * doesn't live in the free space, we need to clear * out the OWN_AG rmap and add the block back to - * the AGFL per-AG reservation. + * the RMAPBT per-AG reservation. */ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); error = xfs_rmap_free(args->tp, args->agbp, args->agno, @@ -1642,7 +1642,7 @@ xfs_alloc_ag_vextent_small( if (error) goto error0; pag = xfs_perag_get(args->mp, args->agno); - xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL, + xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, args->tp, 1); xfs_perag_put(pag); @@ -1928,14 +1928,12 @@ xfs_free_ag_extent( XFS_STATS_INC(mp, xs_freex); XFS_STATS_ADD(mp, xs_freeb, len); - trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL, - haveleft, haveright); + trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright); return 0; error0: - trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL, - -1, -1); + trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1); if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); if (cnt_cur) @@ -2172,7 +2170,7 @@ xfs_alloc_fix_freelist( if (error) goto out_agbp_relse; error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, - &targs.oinfo, XFS_AG_RESV_AGFL); + &targs.oinfo, XFS_AG_RESV_RMAPBT); if (error) goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); @@ -2198,7 +2196,7 @@ xfs_alloc_fix_freelist( while (pag->pagf_flcount < need) { targs.agbno = 0; targs.maxlen = need - pag->pagf_flcount; - targs.resv = XFS_AG_RESV_AGFL; + targs.resv = XFS_AG_RESV_RMAPBT; /* Allocate as many blocks as possible at once. */ error = xfs_alloc_ag_vextent(&targs); @@ -2879,7 +2877,7 @@ xfs_free_extent( int error; ASSERT(len != 0); - ASSERT(type != XFS_AG_RESV_AGFL); + ASSERT(type != XFS_AG_RESV_RMAPBT); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT)) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 44a63c5f43c8..a2cf3718bea9 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -326,7 +326,7 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) enum xfs_ag_resv_type { XFS_AG_RESV_NONE = 0, XFS_AG_RESV_METADATA, - XFS_AG_RESV_AGFL, + XFS_AG_RESV_RMAPBT, }; struct xfs_ag_resv { @@ -390,8 +390,8 @@ typedef struct xfs_perag { /* Blocks reserved for all kinds of metadata. */ struct xfs_ag_resv pag_meta_resv; - /* Blocks reserved for just AGFL-based metadata. */ - struct xfs_ag_resv pag_agfl_resv; + /* Blocks reserved for the reverse mapping btree. */ + struct xfs_ag_resv pag_rmapbt_resv; /* reference count */ uint8_t pagf_refcount_level; @@ -405,8 +405,8 @@ xfs_perag_resv( switch (type) { case XFS_AG_RESV_METADATA: return &pag->pag_meta_resv; - case XFS_AG_RESV_AGFL: - return &pag->pag_agfl_resv; + case XFS_AG_RESV_RMAPBT: + return &pag->pag_rmapbt_resv; default: return NULL; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 8c16177b33d4..d81c4f868d69 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1061,7 +1061,7 @@ xfs_reflink_ag_has_free_space( return 0; pag = xfs_perag_get(mp, agno); - if (xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) || + if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) || xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA)) error = -ENOSPC; xfs_perag_put(pag); -- cgit v1.2.3 From 0ab32086d0becee56c75a8ba21f16ac08b80f304 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 9 Mar 2018 14:02:32 -0800 Subject: xfs: account only rmapbt-used blocks against rmapbt perag res The rmapbt perag metadata reservation reserves blocks for the reverse mapping btree (rmapbt). Since the rmapbt uses blocks from the agfl and perag accounting is updated as blocks are allocated from the allocation btrees, the reservation actually accounts blocks as they are allocated to (or freed from) the agfl rather than the rmapbt itself. While this works for blocks that are eventually used for the rmapbt, not all agfl blocks are destined for the rmapbt. Blocks that are allocated to the agfl (and thus "reserved" for the rmapbt) but then used by another structure leads to a growing inconsistency over time between the runtime tracking of rmapbt usage vs. actual rmapbt usage. Since the runtime tracking thinks all agfl blocks are rmapbt blocks, it essentially believes that less future reservation is required to satisfy the rmapbt than what is actually necessary. The inconsistency is rectified across mount cycles because the perag reservation is initialized based on the actual rmapbt usage at mount time. The problem, however, is that the excessive drain of the reservation at runtime opens a window to allocate blocks for other purposes that might be required for the rmapbt on a subsequent mount. This problem can be demonstrated by a simple test that runs an allocation workload to consume agfl blocks over time and then observe the difference in the agfl reservation requirement across an unmount/mount cycle: mount ...: xfs_ag_resv_init: ... resv 3193 ask 3194 len 3194 ... ... : xfs_ag_resv_alloc_extent: ... resv 2957 ask 3194 len 1 umount...: xfs_ag_resv_free: ... resv 2956 ask 3194 len 0 mount ...: xfs_ag_resv_init: ... resv 3052 ask 3194 len 3194 As the above tracepoints show, the reservation requirement reduces from 3194 blocks to 2956 blocks as the workload runs. Without any other changes in the filesystem, the same reservation requirement jumps from 2956 to 3052 blocks over a umount/mount cycle. To address this divergence, update the RMAPBT reservation to account blocks used for the rmapbt only rather than all blocks filled into the agfl. This patch makes several high-level changes toward that end: 1.) Reintroduce an AGFL reservation type to serve as an accounting no-op for blocks allocated to (or freed from) the AGFL. 2.) Invoke RMAPBT usage accounting from the actual rmapbt block allocation path rather than the AGFL allocation path. The first change is required because agfl blocks are considered free blocks throughout their lifetime. The perag reservation subsystem is invoked unconditionally by the allocation subsystem, so we need a way to tell the perag subsystem (via the allocation subsystem) to not make any accounting changes for blocks filled into the AGFL. The second change causes the in-core RMAPBT reservation usage accounting to remain consistent with the on-disk state at all times and eliminates the risk of leaving the rmapbt reservation underfilled. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ag_resv.c | 4 ++++ fs/xfs/libxfs/xfs_ag_resv.h | 31 +++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_alloc.c | 18 ++++++------------ fs/xfs/libxfs/xfs_rmap_btree.c | 4 ++++ fs/xfs/xfs_mount.h | 1 + 5 files changed, 46 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 0ca2e680034a..03885a968de8 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -326,6 +326,8 @@ xfs_ag_resv_alloc_extent( trace_xfs_ag_resv_alloc_extent(pag, type, args->len); switch (type) { + case XFS_AG_RESV_AGFL: + return; case XFS_AG_RESV_METADATA: case XFS_AG_RESV_RMAPBT: resv = xfs_perag_resv(pag, type); @@ -366,6 +368,8 @@ xfs_ag_resv_free_extent( trace_xfs_ag_resv_free_extent(pag, type, len); switch (type) { + case XFS_AG_RESV_AGFL: + return; case XFS_AG_RESV_METADATA: case XFS_AG_RESV_RMAPBT: resv = xfs_perag_resv(pag, type); diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h index 8d6c687deef3..938f2f96c5e8 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.h +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -32,4 +32,35 @@ void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, struct xfs_trans *tp, xfs_extlen_t len); +/* + * RMAPBT reservation accounting wrappers. Since rmapbt blocks are sourced from + * the AGFL, they are allocated one at a time and the reservation updates don't + * require a transaction. + */ +static inline void +xfs_ag_resv_rmapbt_alloc( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_alloc_arg args = {0}; + struct xfs_perag *pag; + + args.len = 1; + pag = xfs_perag_get(mp, agno); + xfs_ag_resv_alloc_extent(pag, XFS_AG_RESV_RMAPBT, &args); + xfs_perag_put(pag); +} + +static inline void +xfs_ag_resv_rmapbt_free( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1); + xfs_perag_put(pag); +} + #endif /* __XFS_AG_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 1dc244d15a75..3db90b707fb2 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -728,7 +728,7 @@ xfs_alloc_ag_vextent( ASSERT(args->len >= args->minlen); ASSERT(args->len <= args->maxlen); - ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_RMAPBT); + ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL); ASSERT(args->agbno % args->alignment == 0); /* if not file data, insert new block into the reverse map btree */ @@ -1581,7 +1581,6 @@ xfs_alloc_ag_vextent_small( int *stat) /* status: 0-freelist, 1-normal/none */ { struct xfs_owner_info oinfo; - struct xfs_perag *pag; int error; xfs_agblock_t fbno; xfs_extlen_t flen; @@ -1600,7 +1599,7 @@ xfs_alloc_ag_vextent_small( * freelist. */ else if (args->minlen == 1 && args->alignment == 1 && - args->resv != XFS_AG_RESV_RMAPBT && + args->resv != XFS_AG_RESV_AGFL && (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) > args->minleft)) { error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); @@ -1633,18 +1632,13 @@ xfs_alloc_ag_vextent_small( /* * If we're feeding an AGFL block to something that * doesn't live in the free space, we need to clear - * out the OWN_AG rmap and add the block back to - * the RMAPBT per-AG reservation. + * out the OWN_AG rmap. */ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1, &oinfo); if (error) goto error0; - pag = xfs_perag_get(args->mp, args->agno); - xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, - args->tp, 1); - xfs_perag_put(pag); *stat = 0; return 0; @@ -2170,7 +2164,7 @@ xfs_alloc_fix_freelist( if (error) goto out_agbp_relse; error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, - &targs.oinfo, XFS_AG_RESV_RMAPBT); + &targs.oinfo, XFS_AG_RESV_AGFL); if (error) goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); @@ -2196,7 +2190,7 @@ xfs_alloc_fix_freelist( while (pag->pagf_flcount < need) { targs.agbno = 0; targs.maxlen = need - pag->pagf_flcount; - targs.resv = XFS_AG_RESV_RMAPBT; + targs.resv = XFS_AG_RESV_AGFL; /* Allocate as many blocks as possible at once. */ error = xfs_alloc_ag_vextent(&targs); @@ -2877,7 +2871,7 @@ xfs_free_extent( int error; ASSERT(len != 0); - ASSERT(type != XFS_AG_RESV_RMAPBT); + ASSERT(type != XFS_AG_RESV_AGFL); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT)) diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 738df3f9b5f2..8b0d0de1cd11 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -125,6 +125,8 @@ xfs_rmapbt_alloc_block( be32_add_cpu(&agf->agf_rmap_blocks, 1); xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS); + xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno); + *stat = 1; return 0; } @@ -152,6 +154,8 @@ xfs_rmapbt_free_block( XFS_EXTENT_BUSY_SKIP_DISCARD); xfs_trans_agbtree_delta(cur->bc_tp, -1); + xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno); + return 0; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a2cf3718bea9..1808f56decaa 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -325,6 +325,7 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) /* per-AG block reservation data structures*/ enum xfs_ag_resv_type { XFS_AG_RESV_NONE = 0, + XFS_AG_RESV_AGFL, XFS_AG_RESV_METADATA, XFS_AG_RESV_RMAPBT, }; -- cgit v1.2.3 From 2022ab36fefc8cb6483b767e3ff9d099087c52d9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Mar 2018 23:15:26 -0700 Subject: xfs: remove misleading comment text on xfs_inode_item_unlock Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode_item.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 7666eae8844f..35c1d3772a9c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -579,9 +579,6 @@ out_unlock: /* * Unlock the inode associated with the inode log item. - * Clear the fields of the inode and inode log item that - * are specific to the current transaction. If the - * hold flags is set, do not unlock the inode. */ STATIC void xfs_inode_item_unlock( -- cgit v1.2.3 From 4f7aa2fd8c94d621e62131f3d6506a88ad9b0926 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Mar 2018 23:15:27 -0700 Subject: xfs: remove an outdated comment for xfs_inode_item_committing The function now does something, and that something is central to our inode logging scheme. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode_item.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 35c1d3772a9c..34b91b789702 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -634,10 +634,6 @@ xfs_inode_item_committed( return lsn; } -/* - * XXX rcc - this one really has to do something. Probably needs - * to stamp in a new field in the incore inode. - */ STATIC void xfs_inode_item_committing( struct xfs_log_item *lip, -- cgit v1.2.3 From 2b56c2857fde3c7f74174dd2fe3d7b8833d82744 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Mar 2018 23:15:27 -0700 Subject: xfs: remove the unused log_flushed variable in xfs_extent_busy_flush Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_extent_busy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 77760dbf0242..1b439db95634 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -611,10 +611,10 @@ xfs_extent_busy_flush( unsigned busy_gen) { DEFINE_WAIT (wait); - int log_flushed = 0, error; + int error; trace_xfs_log_force(mp, 0, _THIS_IP_); - error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed); + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); if (error) return; -- cgit v1.2.3 From 60e5bb7844ec75a2f54ea76d8ceec5c79172ce7e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Mar 2018 23:15:28 -0700 Subject: xfs: merge _xfs_log_force and xfs_log_force Switch to a single interface for flushing the whole log, which gives consistent trace point coverage, and removes the unused log_flushed argument for the previous _xfs_log_force callers. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/scrub/common.c | 2 +- fs/xfs/xfs_extent_busy.c | 3 +-- fs/xfs/xfs_log.c | 26 +++++--------------------- fs/xfs/xfs_log.h | 6 +----- 4 files changed, 8 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 8033ab9d8f47..ddcdda336402 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -619,7 +619,7 @@ xfs_scrub_checkpoint_log( { int error; - error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + error = xfs_log_force(mp, XFS_LOG_SYNC); if (error) return error; xfs_ail_push_all_sync(mp->m_ail); diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 1b439db95634..13e3d1a69e76 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -613,8 +613,7 @@ xfs_extent_busy_flush( DEFINE_WAIT (wait); int error; - trace_xfs_log_force(mp, 0, _THIS_IP_); - error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + error = xfs_log_force(mp, XFS_LOG_SYNC); if (error) return; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2f529fd7bf67..3ce17e79b6e0 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -869,7 +869,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) return 0; } - error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + error = xfs_log_force(mp, XFS_LOG_SYNC); ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); #ifdef DEBUG @@ -3304,16 +3304,16 @@ xlog_state_switch_iclogs( * not in the active nor dirty state. */ int -_xfs_log_force( +xfs_log_force( struct xfs_mount *mp, - uint flags, - int *log_flushed) + uint flags) { struct xlog *log = mp->m_log; struct xlog_in_core *iclog; xfs_lsn_t lsn; XFS_STATS_INC(mp, xs_log_force); + trace_xfs_log_force(mp, 0, _RET_IP_); xlog_cil_force(log); @@ -3362,8 +3362,6 @@ _xfs_log_force( if (xlog_state_release_iclog(log, iclog)) return -EIO; - if (log_flushed) - *log_flushed = 1; spin_lock(&log->l_icloglock); if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && iclog->ic_state != XLOG_STATE_DIRTY) @@ -3415,20 +3413,6 @@ no_sleep: return 0; } -/* - * Wrapper for _xfs_log_force(), to be used when caller doesn't care - * about errors or whether the log was flushed or not. This is the normal - * interface to use when trying to unpin items or move the log forward. - */ -void -xfs_log_force( - xfs_mount_t *mp, - uint flags) -{ - trace_xfs_log_force(mp, 0, _RET_IP_); - _xfs_log_force(mp, flags, NULL); -} - /* * Force the in-core log to disk for a specific LSN. * @@ -4035,7 +4019,7 @@ xfs_log_force_umount( * to guarantee this. */ if (!logerror) - _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + xfs_log_force(mp, XFS_LOG_SYNC); /* * mark the filesystem and the as in a shutdown state and wake diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index bf212772595c..726dd9a330b4 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -129,11 +129,7 @@ xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, struct xlog_in_core **iclog, bool regrant); -int _xfs_log_force(struct xfs_mount *mp, - uint flags, - int *log_forced); -void xfs_log_force(struct xfs_mount *mp, - uint flags); +int xfs_log_force(struct xfs_mount *mp, uint flags); int _xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, -- cgit v1.2.3 From 656de4ffaffd921e1b45de4150c86ba50da135e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Mar 2018 23:15:28 -0700 Subject: xfs: merge _xfs_log_force_lsn and xfs_log_force_lsn Switch to a single interface for flushing the log to a specific LSN, which gives consistent trace point coverage and a less confusing interface. The was only a single user of the previous xfs_log_force_lsn function, which now also passes a NULL log_flushed argument. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_export.c | 2 +- fs/xfs/xfs_file.c | 4 ++-- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_log.c | 18 ++---------------- fs/xfs/xfs_log.h | 9 ++------- fs/xfs/xfs_trans.c | 2 +- 6 files changed, 9 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index fe1bfee35898..761f3189eff2 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9ea08326f876..f5c5dbbf1792 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -122,7 +122,7 @@ xfs_dir_fsync( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } STATIC int @@ -182,7 +182,7 @@ xfs_file_fsync( } if (lsn) { - error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); + error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); ip->i_itemp->ili_fsync_fields = 0; } xfs_iunlock(ip, XFS_ILOCK_SHARED); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 50fbbf5f5a7a..09ba970c07b1 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2502,7 +2502,7 @@ xfs_iunpin( trace_xfs_inode_unpin_nowait(ip, _RET_IP_); /* Give the log a push to start the unpinning I/O */ - xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); + xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL); } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3ce17e79b6e0..cb50dd72a3f3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3429,7 +3429,7 @@ no_sleep: * sv. */ int -_xfs_log_force_lsn( +xfs_log_force_lsn( struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, @@ -3442,6 +3442,7 @@ _xfs_log_force_lsn( ASSERT(lsn != 0); XFS_STATS_INC(mp, xs_log_force); + trace_xfs_log_force(mp, lsn, _RET_IP_); lsn = xlog_cil_force_lsn(log, lsn); if (lsn == NULLCOMMITLSN) @@ -3538,21 +3539,6 @@ try_again: return 0; } -/* - * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care - * about errors or whether the log was flushed or not. This is the normal - * interface to use when trying to unpin items or move the log forward. - */ -void -xfs_log_force_lsn( - xfs_mount_t *mp, - xfs_lsn_t lsn, - uint flags) -{ - trace_xfs_log_force(mp, lsn, _RET_IP_); - _xfs_log_force_lsn(mp, lsn, flags, NULL); -} - /* * Called when we want to mark the current iclog as being ready to sync to * disk. diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 726dd9a330b4..7e2d62922a16 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -130,13 +130,8 @@ xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_in_core **iclog, bool regrant); int xfs_log_force(struct xfs_mount *mp, uint flags); -int _xfs_log_force_lsn(struct xfs_mount *mp, - xfs_lsn_t lsn, - uint flags, - int *log_forced); -void xfs_log_force_lsn(struct xfs_mount *mp, - xfs_lsn_t lsn, - uint flags); +int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, + int *log_forced); int xfs_log_mount(struct xfs_mount *mp, struct xfs_buftarg *log_target, xfs_daddr_t start_block, diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 2ad08ec2fc21..d6d8f9d129a7 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -970,7 +970,7 @@ __xfs_trans_commit( * log out now and wait for it. */ if (sync) { - error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); + error = xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); XFS_STATS_INC(mp, xs_trans_sync); } else { XFS_STATS_INC(mp, xs_trans_async); -- cgit v1.2.3 From e6b965705685c7fc2f24a68410529f86c08c7277 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Mar 2018 23:15:29 -0700 Subject: xfs: refactor xfs_log_force Streamline the conditionals so that it is more obvious which specific case form the top of the function comments is being handled. Use gotos only for early returns. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 144 ++++++++++++++++++++++++------------------------------- 1 file changed, 63 insertions(+), 81 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index cb50dd72a3f3..2a0f882e7f7e 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3318,99 +3318,81 @@ xfs_log_force( xlog_cil_force(log); spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - if (iclog->ic_state & XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - return -EIO; - } + if (iclog->ic_state & XLOG_STATE_IOERROR) + goto out_error; - /* If the head iclog is not active nor dirty, we just attach - * ourselves to the head and go to sleep. - */ - if (iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_DIRTY) { + if (iclog->ic_state == XLOG_STATE_DIRTY || + (iclog->ic_state == XLOG_STATE_ACTIVE && + atomic_read(&iclog->ic_refcnt) == 0 && iclog->ic_offset == 0)) { /* - * If the head is dirty or (active and empty), then - * we need to look at the previous iclog. If the previous - * iclog is active or dirty we are done. There is nothing - * to sync out. Otherwise, we attach ourselves to the + * If the head is dirty or (active and empty), then we need to + * look at the previous iclog. + * + * If the previous iclog is active or dirty we are done. There + * is nothing to sync out. Otherwise, we attach ourselves to the * previous iclog and go to sleep. */ - if (iclog->ic_state == XLOG_STATE_DIRTY || - (atomic_read(&iclog->ic_refcnt) == 0 - && iclog->ic_offset == 0)) { - iclog = iclog->ic_prev; - if (iclog->ic_state == XLOG_STATE_ACTIVE || - iclog->ic_state == XLOG_STATE_DIRTY) - goto no_sleep; - else - goto maybe_sleep; - } else { - if (atomic_read(&iclog->ic_refcnt) == 0) { - /* We are the only one with access to this - * iclog. Flush it out now. There should - * be a roundoff of zero to show that someone - * has already taken care of the roundoff from - * the previous sync. - */ - atomic_inc(&iclog->ic_refcnt); - lsn = be64_to_cpu(iclog->ic_header.h_lsn); - xlog_state_switch_iclogs(log, iclog, 0); - spin_unlock(&log->l_icloglock); - - if (xlog_state_release_iclog(log, iclog)) - return -EIO; + iclog = iclog->ic_prev; + if (iclog->ic_state == XLOG_STATE_ACTIVE || + iclog->ic_state == XLOG_STATE_DIRTY) + goto out_unlock; + } else if (iclog->ic_state == XLOG_STATE_ACTIVE) { + if (atomic_read(&iclog->ic_refcnt) == 0) { + /* + * We are the only one with access to this iclog. + * + * Flush it out now. There should be a roundoff of zero + * to show that someone has already taken care of the + * roundoff from the previous sync. + */ + atomic_inc(&iclog->ic_refcnt); + lsn = be64_to_cpu(iclog->ic_header.h_lsn); + xlog_state_switch_iclogs(log, iclog, 0); + spin_unlock(&log->l_icloglock); - spin_lock(&log->l_icloglock); - if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && - iclog->ic_state != XLOG_STATE_DIRTY) - goto maybe_sleep; - else - goto no_sleep; - } else { - /* Someone else is writing to this iclog. - * Use its call to flush out the data. However, - * the other thread may not force out this LR, - * so we mark it WANT_SYNC. - */ - xlog_state_switch_iclogs(log, iclog, 0); - goto maybe_sleep; - } - } - } + if (xlog_state_release_iclog(log, iclog)) + return -EIO; - /* By the time we come around again, the iclog could've been filled - * which would give it another lsn. If we have a new lsn, just - * return because the relevant data has been flushed. - */ -maybe_sleep: - if (flags & XFS_LOG_SYNC) { - /* - * We must check if we're shutting down here, before - * we wait, while we're holding the l_icloglock. - * Then we check again after waking up, in case our - * sleep was disturbed by a bad news. - */ - if (iclog->ic_state & XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - return -EIO; + spin_lock(&log->l_icloglock); + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn || + iclog->ic_state == XLOG_STATE_DIRTY) + goto out_unlock; + } else { + /* + * Someone else is writing to this iclog. + * + * Use its call to flush out the data. However, the + * other thread may not force out this LR, so we mark + * it WANT_SYNC. + */ + xlog_state_switch_iclogs(log, iclog, 0); } - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + } else { /* - * No need to grab the log lock here since we're - * only deciding whether or not to return EIO - * and the memory read should be atomic. + * If the head iclog is not active nor dirty, we just attach + * ourselves to the head and go to sleep if necessary. */ - if (iclog->ic_state & XLOG_STATE_IOERROR) - return -EIO; - } else { - -no_sleep: - spin_unlock(&log->l_icloglock); + ; } + + if (!(flags & XFS_LOG_SYNC)) + goto out_unlock; + + if (iclog->ic_state & XLOG_STATE_IOERROR) + goto out_error; + XFS_STATS_INC(mp, xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + if (iclog->ic_state & XLOG_STATE_IOERROR) + return -EIO; return 0; + +out_unlock: + spin_unlock(&log->l_icloglock); + return 0; +out_error: + spin_unlock(&log->l_icloglock); + return -EIO; } /* -- cgit v1.2.3 From 5bcffe300ca708203ca926c17f7a509d4a01403d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Mar 2018 23:15:30 -0700 Subject: xfs: fix the check for COW extents in xfs_swap_extents i_cnextents does not include delayed allocated extents, so switch to the inode fork size check that we already use in other places instead. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index e0a442f504e5..19ea7d086cf8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -2014,11 +2014,11 @@ xfs_swap_extents( ip->i_cowfp = tip->i_cowfp; tip->i_cowfp = cowfp; - if (ip->i_cowfp && ip->i_cnextents) + if (ip->i_cowfp && ip->i_cowfp->if_bytes) xfs_inode_set_cowblocks_tag(ip); else xfs_inode_clear_cowblocks_tag(ip); - if (tip->i_cowfp && tip->i_cnextents) + if (tip->i_cowfp && tip->i_cowfp->if_bytes) xfs_inode_set_cowblocks_tag(tip); else xfs_inode_clear_cowblocks_tag(tip); -- cgit v1.2.3 From 7d9df3c1631b0cb7eb4e59c934271adf0ead2f55 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Mar 2018 23:15:31 -0700 Subject: xfs: don't use XFS_BMAPI_ENTRIRE in xfs_get_blocks There is no reason to get a mapping bigger than what we were asked for. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a0afb6411417..c79a3ca20ef8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1331,8 +1331,8 @@ xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, - &imap, &nimaps, XFS_BMAPI_ENTIRE); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, + &nimaps, 0); if (error) goto out_unlock; -- cgit v1.2.3 From c7dbe3f2c41969845b8ea000fc5e025d4987a8fb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Mar 2018 23:15:31 -0700 Subject: xfs: assert that xfs_reflink_allocate_cow is called with XFS_ILOCK_EXCL Now that we convert COW preallocations from unwritten to real on every call this function needs to be called with the ilock held exclusively. Fortunately we already do that, but update the assert to match. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_reflink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index d81c4f868d69..90aac8889dd9 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -394,7 +394,7 @@ xfs_reflink_allocate_cow( retry: ASSERT(xfs_is_reflink_inode(ip)); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* * Even if the extent is not shared we might have a preallocation for -- cgit v1.2.3 From f5c54717bf2b9e052bb69d6ee19fe22e87817079 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Mar 2018 23:15:32 -0700 Subject: xfs: remove xfs_zero_range This helper doesn't add any real value over just calling iomap_zero_range directly, so remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_bmap_util.c | 11 ++++------- fs/xfs/xfs_file.c | 48 +++++++----------------------------------------- fs/xfs/xfs_inode.h | 4 ---- fs/xfs/xfs_iops.c | 4 +++- 4 files changed, 14 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 19ea7d086cf8..05dee8fdd895 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1208,18 +1208,15 @@ xfs_free_file_space( /* * Now that we've unmap all full blocks we'll have to zero out any - * partial block at the beginning and/or end. xfs_zero_range is - * smart enough to skip any holes, including those we just created, - * but we must take care not to zero beyond EOF and enlarge i_size. + * partial block at the beginning and/or end. iomap_zero_range is smart + * enough to skip any holes, including those we just created, but we + * must take care not to zero beyond EOF and enlarge i_size. */ - if (offset >= XFS_ISIZE(ip)) return 0; - if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - - return xfs_zero_range(ip, offset, len, NULL); + return iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops); } /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f5c5dbbf1792..299aee4b7b0b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -48,20 +48,6 @@ static const struct vm_operations_struct xfs_file_vm_ops; -/* - * Clear the specified ranges to zero through either the pagecache or DAX. - * Holes and unwritten extents will be left as-is as they already are zeroed. - */ -int -xfs_zero_range( - struct xfs_inode *ip, - xfs_off_t pos, - xfs_off_t count, - bool *did_zero) -{ - return iomap_zero_range(VFS_I(ip), pos, count, did_zero, &xfs_iomap_ops); -} - int xfs_update_prealloc_flags( struct xfs_inode *ip, @@ -300,31 +286,6 @@ xfs_file_read_iter( return ret; } -/* - * Zero any on disk space between the current EOF and the new, larger EOF. - * - * This handles the normal case of zeroing the remainder of the last block in - * the file and the unusual case of zeroing blocks out beyond the size of the - * file. This second case only happens with fixed size extents and when the - * system crashes before the inode size was updated but after blocks were - * allocated. - * - * Expects the iolock to be held exclusive, and will take the ilock internally. - */ -int /* error (positive) */ -xfs_zero_eof( - struct xfs_inode *ip, - xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize, /* current inode size */ - bool *did_zeroing) -{ - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT(offset > isize); - - trace_xfs_zero_eof(ip, isize, offset - isize); - return xfs_zero_range(ip, isize, offset - isize, did_zeroing); -} - /* * Common pre-write limit and setup checks. * @@ -344,6 +305,7 @@ xfs_file_aio_write_checks( ssize_t error = 0; size_t count = iov_iter_count(from); bool drained_dio = false; + loff_t isize; restart: error = generic_write_checks(iocb, from); @@ -380,7 +342,8 @@ restart: * and hence be able to correctly determine if we need to run zeroing. */ spin_lock(&ip->i_flags_lock); - if (iocb->ki_pos > i_size_read(inode)) { + isize = i_size_read(inode); + if (iocb->ki_pos > isize) { spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { @@ -401,7 +364,10 @@ restart: drained_dio = true; goto restart; } - error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), NULL); + + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); + error = iomap_zero_range(inode, isize, iocb->ki_pos - isize, + NULL, &xfs_iomap_ops); if (error) return error; } else diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3e8dc990d41c..132d8aa2afc4 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -443,10 +443,6 @@ enum xfs_prealloc_flags { int xfs_update_prealloc_flags(struct xfs_inode *ip, enum xfs_prealloc_flags flags); -int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, - xfs_fsize_t isize, bool *did_zeroing); -int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count, - bool *did_zero); /* from xfs_iops.c */ extern void xfs_setup_inode(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 8567951eff10..e0307fbff911 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -875,7 +875,9 @@ xfs_setattr_size( * truncate. */ if (newsize > oldsize) { - error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); + trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); + error = iomap_zero_range(inode, oldsize, newsize - oldsize, + &did_zeroing, &xfs_iomap_ops); } else { error = iomap_truncate_page(inode, newsize, &did_zeroing, &xfs_iomap_ops); -- cgit v1.2.3 From 1d4352de511f4c9b6f4466c8fd4cfc25b14869b7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Mar 2018 23:15:32 -0700 Subject: xfs: minor cleanup for xfs_get_blocks Simplify the control flow a bit in preparation for O_ATOMIC-related changes. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_aops.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index c79a3ca20ef8..19eadc807056 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1335,17 +1335,16 @@ xfs_get_blocks( &nimaps, 0); if (error) goto out_unlock; - - if (nimaps) { - trace_xfs_get_blocks_found(ip, offset, size, - imap.br_state == XFS_EXT_UNWRITTEN ? - XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap); - xfs_iunlock(ip, lockmode); - } else { + if (!nimaps) { trace_xfs_get_blocks_notfound(ip, offset, size); goto out_unlock; } + trace_xfs_get_blocks_found(ip, offset, size, + imap.br_state == XFS_EXT_UNWRITTEN ? + XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap); + xfs_iunlock(ip, lockmode); + /* trim mapping down to size requested */ xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); -- cgit v1.2.3 From df79b81b2e6a801c134c3e06c647715a316b4a92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Mar 2018 23:15:33 -0700 Subject: xfs: minor cleanup for xfs_reflink_end_cow Use xfs_iext_prev_extent to skip to the previous extent instead of opencoding it. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_reflink.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 90aac8889dd9..cdbd342a5249 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -762,10 +762,8 @@ xfs_reflink_end_cow( xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb); /* Extent delete may have bumped ext forward */ - if (!del.br_blockcount) { - xfs_iext_prev(ifp, &icur); - goto next_extent; - } + if (!del.br_blockcount) + goto prev_extent; ASSERT(!isnullstartblock(got.br_startblock)); @@ -774,10 +772,8 @@ xfs_reflink_end_cow( * speculatively preallocated CoW extents that have been * allocated but have not yet been involved in a write. */ - if (got.br_state == XFS_EXT_UNWRITTEN) { - xfs_iext_prev(ifp, &icur); - goto next_extent; - } + if (got.br_state == XFS_EXT_UNWRITTEN) + goto prev_extent; /* Unmap the old blocks in the data fork. */ xfs_defer_init(&dfops, &firstfsb); @@ -816,9 +812,12 @@ xfs_reflink_end_cow( error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; -next_extent: if (!xfs_iext_get_extent(ifp, &icur, &got)) break; + continue; +prev_extent: + if (!xfs_iext_prev_extent(ifp, &icur, &got)) + break; } error = xfs_trans_commit(tp); -- cgit v1.2.3 From 93806299b56b22df8fec9b78cb4c9c41d283ee2a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Mar 2018 23:15:29 -0700 Subject: xfs: refactor xfs_log_force_lsn Use the the smallest possible loop as preable to find the correct iclog buffer, and then use gotos for unwinding to straighten the code. Also fix the top of function comment while we're at it. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 142 ++++++++++++++++++++++++------------------------------- 1 file changed, 62 insertions(+), 80 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2a0f882e7f7e..6f8da674cbc6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3404,11 +3404,10 @@ out_error: * state and go to sleep or return. * If it is in any other state, go to sleep or return. * - * Synchronous forces are implemented with a signal variable. All callers - * to force a given lsn to disk will wait on a the sv attached to the - * specific in-core log. When given in-core log finally completes its - * write to disk, that thread will wake up all threads waiting on the - * sv. + * Synchronous forces are implemented with a wait queue. All callers trying + * to force a given lsn to disk must wait on the queue attached to the + * specific in-core log. When given in-core log finally completes its write + * to disk, that thread will wake up all threads waiting on the queue. */ int xfs_log_force_lsn( @@ -3433,92 +3432,75 @@ xfs_log_force_lsn( try_again: spin_lock(&log->l_icloglock); iclog = log->l_iclog; - if (iclog->ic_state & XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - return -EIO; - } + if (iclog->ic_state & XLOG_STATE_IOERROR) + goto out_error; - do { - if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { - iclog = iclog->ic_next; - continue; - } + while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + iclog = iclog->ic_next; + if (iclog == log->l_iclog) + goto out_unlock; + } - if (iclog->ic_state == XLOG_STATE_DIRTY) { - spin_unlock(&log->l_icloglock); - return 0; - } + if (iclog->ic_state == XLOG_STATE_DIRTY) + goto out_unlock; - if (iclog->ic_state == XLOG_STATE_ACTIVE) { - /* - * We sleep here if we haven't already slept (e.g. - * this is the first time we've looked at the correct - * iclog buf) and the buffer before us is going to - * be sync'ed. The reason for this is that if we - * are doing sync transactions here, by waiting for - * the previous I/O to complete, we can allow a few - * more transactions into this iclog before we close - * it down. - * - * Otherwise, we mark the buffer WANT_SYNC, and bump - * up the refcnt so we can release the log (which - * drops the ref count). The state switch keeps new - * transaction commits from using this buffer. When - * the current commits finish writing into the buffer, - * the refcount will drop to zero and the buffer will - * go out then. - */ - if (!already_slept && - (iclog->ic_prev->ic_state & - (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { - ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + /* + * We sleep here if we haven't already slept (e.g. this is the + * first time we've looked at the correct iclog buf) and the + * buffer before us is going to be sync'ed. The reason for this + * is that if we are doing sync transactions here, by waiting + * for the previous I/O to complete, we can allow a few more + * transactions into this iclog before we close it down. + * + * Otherwise, we mark the buffer WANT_SYNC, and bump up the + * refcnt so we can release the log (which drops the ref count). + * The state switch keeps new transaction commits from using + * this buffer. When the current commits finish writing into + * the buffer, the refcount will drop to zero and the buffer + * will go out then. + */ + if (!already_slept && + (iclog->ic_prev->ic_state & + (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { + ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); - XFS_STATS_INC(mp, xs_log_force_sleep); + XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_prev->ic_write_wait, - &log->l_icloglock); - already_slept = 1; - goto try_again; - } - atomic_inc(&iclog->ic_refcnt); - xlog_state_switch_iclogs(log, iclog, 0); - spin_unlock(&log->l_icloglock); - if (xlog_state_release_iclog(log, iclog)) - return -EIO; - if (log_flushed) - *log_flushed = 1; - spin_lock(&log->l_icloglock); + xlog_wait(&iclog->ic_prev->ic_write_wait, + &log->l_icloglock); + already_slept = 1; + goto try_again; } + atomic_inc(&iclog->ic_refcnt); + xlog_state_switch_iclogs(log, iclog, 0); + spin_unlock(&log->l_icloglock); + if (xlog_state_release_iclog(log, iclog)) + return -EIO; + if (log_flushed) + *log_flushed = 1; + spin_lock(&log->l_icloglock); + } - if ((flags & XFS_LOG_SYNC) && /* sleep */ - !(iclog->ic_state & - (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { - /* - * Don't wait on completion if we know that we've - * gotten a log write error. - */ - if (iclog->ic_state & XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - return -EIO; - } - XFS_STATS_INC(mp, xs_log_force_sleep); - xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); - /* - * No need to grab the log lock here since we're - * only deciding whether or not to return EIO - * and the memory read should be atomic. - */ - if (iclog->ic_state & XLOG_STATE_IOERROR) - return -EIO; - } else { /* just return */ - spin_unlock(&log->l_icloglock); - } + if (!(flags & XFS_LOG_SYNC) || + (iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) + goto out_unlock; - return 0; - } while (iclog != log->l_iclog); + if (iclog->ic_state & XLOG_STATE_IOERROR) + goto out_error; + + XFS_STATS_INC(mp, xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + if (iclog->ic_state & XLOG_STATE_IOERROR) + return -EIO; + return 0; +out_unlock: spin_unlock(&log->l_icloglock); return 0; +out_error: + spin_unlock(&log->l_icloglock); + return -EIO; } /* -- cgit v1.2.3 From 3e4da466bfa1dea40b2b23130a9dc4acebcc9f14 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Mar 2018 23:15:30 -0700 Subject: xfs: unwind the try_again loop in xfs_log_force Instead split out a __xfs_log_fore_lsn helper that gets called again with the already_slept flag set to true in case we had to sleep. This prepares for aio_fsync support. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log.c | 72 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 6f8da674cbc6..b9c9c848146b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3395,41 +3395,17 @@ out_error: return -EIO; } -/* - * Force the in-core log to disk for a specific LSN. - * - * Find in-core log with lsn. - * If it is in the DIRTY state, just return. - * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC - * state and go to sleep or return. - * If it is in any other state, go to sleep or return. - * - * Synchronous forces are implemented with a wait queue. All callers trying - * to force a given lsn to disk must wait on the queue attached to the - * specific in-core log. When given in-core log finally completes its write - * to disk, that thread will wake up all threads waiting on the queue. - */ -int -xfs_log_force_lsn( +static int +__xfs_log_force_lsn( struct xfs_mount *mp, xfs_lsn_t lsn, uint flags, - int *log_flushed) + int *log_flushed, + bool already_slept) { struct xlog *log = mp->m_log; struct xlog_in_core *iclog; - int already_slept = 0; - - ASSERT(lsn != 0); - - XFS_STATS_INC(mp, xs_log_force); - trace_xfs_log_force(mp, lsn, _RET_IP_); - - lsn = xlog_cil_force_lsn(log, lsn); - if (lsn == NULLCOMMITLSN) - return 0; -try_again: spin_lock(&log->l_icloglock); iclog = log->l_iclog; if (iclog->ic_state & XLOG_STATE_IOERROR) @@ -3469,8 +3445,7 @@ try_again: xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); - already_slept = 1; - goto try_again; + return -EAGAIN; } atomic_inc(&iclog->ic_refcnt); xlog_state_switch_iclogs(log, iclog, 0); @@ -3503,6 +3478,43 @@ out_error: return -EIO; } +/* + * Force the in-core log to disk for a specific LSN. + * + * Find in-core log with lsn. + * If it is in the DIRTY state, just return. + * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC + * state and go to sleep or return. + * If it is in any other state, go to sleep or return. + * + * Synchronous forces are implemented with a wait queue. All callers trying + * to force a given lsn to disk must wait on the queue attached to the + * specific in-core log. When given in-core log finally completes its write + * to disk, that thread will wake up all threads waiting on the queue. + */ +int +xfs_log_force_lsn( + struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags, + int *log_flushed) +{ + int ret; + ASSERT(lsn != 0); + + XFS_STATS_INC(mp, xs_log_force); + trace_xfs_log_force(mp, lsn, _RET_IP_); + + lsn = xlog_cil_force_lsn(mp->m_log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; + + ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false); + if (ret == -EAGAIN) + ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true); + return ret; +} + /* * Called when we want to mark the current iclog as being ready to sync to * disk. -- cgit v1.2.3 From a27ba2607e60312554cbcd43fc660b2c7f29dc9c Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 15 Mar 2018 10:51:58 -0700 Subject: xfs: detect agfl count corruption and reset agfl The struct xfs_agfl v5 header was originally introduced with unexpected padding that caused the AGFL to operate with one less slot than intended. The header has since been packed, but the fix left an incompatibility for users who upgrade from an old kernel with the unpacked header to a newer kernel with the packed header while the AGFL happens to wrap around the end. The newer kernel recognizes one extra slot at the physical end of the AGFL that the previous kernel did not. The new kernel will eventually attempt to allocate a block from that slot, which contains invalid data, and cause a crash. This condition can be detected by comparing the active range of the AGFL to the count. While this detects a padding mismatch, it can also trigger false positives for unrelated flcount corruption. Since we cannot distinguish a size mismatch due to padding from unrelated corruption, we can't trust the AGFL enough to simply repopulate the empty slot. Instead, avoid unnecessarily complex detection logic and and use a solution that can handle any form of flcount corruption that slips through read verifiers: distrust the entire AGFL and reset it to an empty state. Any valid blocks within the AGFL are intentionally leaked. This requires xfs_repair to rectify (which was already necessary based on the state the AGFL was found in). The reset mitigates the side effect of the padding mismatch problem from a filesystem crash to a free space accounting inconsistency. The generic approach also means that this patch can be safely backported to kernels with or without a packed struct xfs_agfl. Check the AGF for an invalid freelist count on initial read from disk. If detected, set a flag on the xfs_perag to indicate that a reset is required before the AGFL can be used. In the first transaction that attempts to use a flagged AGFL, reset it to empty, warn the user about the inconsistency and allow the freelist fixup code to repopulate the AGFL with new blocks. The xfs_perag flag is cleared to eliminate the need for repeated checks on each block allocation operation. This allows kernels that include the packing fix commit 96f859d52bcb ("libxfs: pack the agfl header structure so XFS_AGFL_SIZE is correct") to handle older unpacked AGFL formats without a filesystem crash. Suggested-by: Dave Chinner Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Reviewed-by Dave Chiluk Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_alloc.c | 94 +++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_mount.h | 1 + fs/xfs/xfs_trace.h | 9 ++++- 3 files changed, 103 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3db90b707fb2..39387bdd225d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2062,6 +2062,93 @@ xfs_alloc_space_available( return true; } +/* + * Check the agfl fields of the agf for inconsistency or corruption. The purpose + * is to detect an agfl header padding mismatch between current and early v5 + * kernels. This problem manifests as a 1-slot size difference between the + * on-disk flcount and the active [first, last] range of a wrapped agfl. This + * may also catch variants of agfl count corruption unrelated to padding. Either + * way, we'll reset the agfl and warn the user. + * + * Return true if a reset is required before the agfl can be used, false + * otherwise. + */ +static bool +xfs_agfl_needs_reset( + struct xfs_mount *mp, + struct xfs_agf *agf) +{ + uint32_t f = be32_to_cpu(agf->agf_flfirst); + uint32_t l = be32_to_cpu(agf->agf_fllast); + uint32_t c = be32_to_cpu(agf->agf_flcount); + int agfl_size = xfs_agfl_size(mp); + int active; + + /* no agfl header on v4 supers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + + /* + * The agf read verifier catches severe corruption of these fields. + * Repeat some sanity checks to cover a packed -> unpacked mismatch if + * the verifier allows it. + */ + if (f >= agfl_size || l >= agfl_size) + return true; + if (c > agfl_size) + return true; + + /* + * Check consistency between the on-disk count and the active range. An + * agfl padding mismatch manifests as an inconsistent flcount. + */ + if (c && l >= f) + active = l - f + 1; + else if (c) + active = agfl_size - f + l + 1; + else + active = 0; + + return active != c; +} + +/* + * Reset the agfl to an empty state. Ignore/drop any existing blocks since the + * agfl content cannot be trusted. Warn the user that a repair is required to + * recover leaked blocks. + * + * The purpose of this mechanism is to handle filesystems affected by the agfl + * header padding mismatch problem. A reset keeps the filesystem online with a + * relatively minor free space accounting inconsistency rather than suffer the + * inevitable crash from use of an invalid agfl block. + */ +static void +xfs_agfl_reset( + struct xfs_trans *tp, + struct xfs_buf *agbp, + struct xfs_perag *pag) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + + ASSERT(pag->pagf_agflreset); + trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); + + xfs_warn(mp, + "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " + "Please unmount and run xfs_repair.", + pag->pag_agno, pag->pagf_flcount); + + agf->agf_flfirst = 0; + agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); + agf->agf_flcount = 0; + xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST | + XFS_AGF_FLCOUNT); + + pag->pagf_flcount = 0; + pag->pagf_agflreset = false; +} + /* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. @@ -2123,6 +2210,10 @@ xfs_alloc_fix_freelist( } } + /* reset a padding mismatched agfl before final free space check */ + if (pag->pagf_agflreset) + xfs_agfl_reset(tp, agbp, pag); + /* If there isn't enough total space or single-extent, reject it. */ need = xfs_alloc_min_freelist(mp, pag); if (!xfs_alloc_space_available(args, need, flags)) @@ -2279,6 +2370,7 @@ xfs_alloc_get_freelist( agf->agf_flfirst = 0; pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, -1); xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; @@ -2390,6 +2482,7 @@ xfs_alloc_put_freelist( agf->agf_fllast = 0; pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + ASSERT(!pag->pagf_agflreset); be32_add_cpu(&agf->agf_flcount, 1); xfs_trans_agflist_delta(tp, 1); pag->pagf_flcount++; @@ -2597,6 +2690,7 @@ xfs_alloc_read_agf( pag->pagb_count = 0; pag->pagb_tree = RB_ROOT; pag->pagf_init = 1; + pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf); } #ifdef DEBUG else if (!XFS_FORCED_SHUTDOWN(mp)) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1808f56decaa..10b90bbc5162 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -353,6 +353,7 @@ typedef struct xfs_perag { char pagi_inodeok; /* The agi is ok for inodes */ uint8_t pagf_levels[XFS_BTNUM_AGF]; /* # of levels in bno & cnt btree */ + bool pagf_agflreset; /* agfl requires reset before use */ uint32_t pagf_flcount; /* count of blocks in freelist */ xfs_extlen_t pagf_freeblks; /* total free blocks */ xfs_extlen_t pagf_longest; /* longest free space */ diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 945de08af7ba..a982c0b623d0 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1477,7 +1477,7 @@ TRACE_EVENT(xfs_extent_busy_trim, __entry->tlen) ); -TRACE_EVENT(xfs_agf, +DECLARE_EVENT_CLASS(xfs_agf_class, TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, unsigned long caller_ip), TP_ARGS(mp, agf, flags, caller_ip), @@ -1533,6 +1533,13 @@ TRACE_EVENT(xfs_agf, __entry->longest, (void *)__entry->caller_ip) ); +#define DEFINE_AGF_EVENT(name) \ +DEFINE_EVENT(xfs_agf_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, \ + unsigned long caller_ip), \ + TP_ARGS(mp, agf, flags, caller_ip)) +DEFINE_AGF_EVENT(xfs_agf); +DEFINE_AGF_EVENT(xfs_agfl_reset); TRACE_EVENT(xfs_free_extent, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, -- cgit v1.2.3 From 6915ef35c0350e87a104cb4c4ab2121c81ca7a34 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Mar 2018 10:06:51 -0700 Subject: xfs: sanity-check the unused space before trying to use it In xfs_dir2_data_use_free, we examine on-disk metadata and ASSERT if it doesn't make sense. Since a carefully crafted fuzzed image can cause the kernel to crash after blowing a bunch of assertions, let's move those checks into a validator function and rig everything up to return EFSCORRUPTED to userspace. Found by lastbit fuzzing ltail.bestcount via xfs/391. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_dir2.h | 2 +- fs/xfs/libxfs/xfs_dir2_block.c | 59 +++++++++++++++++++------------- fs/xfs/libxfs/xfs_dir2_data.c | 78 ++++++++++++++++++++++++++++++++---------- fs/xfs/libxfs/xfs_dir2_leaf.c | 10 ++++-- fs/xfs/libxfs/xfs_dir2_node.c | 11 ++++-- 5 files changed, 111 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index 388d67c5c903..989e95a53db2 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -173,7 +173,7 @@ extern void xfs_dir2_data_log_unused(struct xfs_da_args *args, extern void xfs_dir2_data_make_free(struct xfs_da_args *args, struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); -extern void xfs_dir2_data_use_free(struct xfs_da_args *args, +extern int xfs_dir2_data_use_free(struct xfs_da_args *args, struct xfs_buf *bp, struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 2da86a394bcf..875893ded514 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -451,15 +451,19 @@ xfs_dir2_block_addname( * No stale entries, will use enddup space to hold new leaf. */ if (!btp->stale) { + xfs_dir2_data_aoff_t aoff; + /* * Mark the space needed for the new leaf entry, now in use. */ - xfs_dir2_data_use_free(args, bp, enddup, - (xfs_dir2_data_aoff_t) - ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) - - sizeof(*blp)), - (xfs_dir2_data_aoff_t)sizeof(*blp), - &needlog, &needscan); + aoff = (xfs_dir2_data_aoff_t)((char *)enddup - (char *)hdr + + be16_to_cpu(enddup->length) - sizeof(*blp)); + error = xfs_dir2_data_use_free(args, bp, enddup, aoff, + (xfs_dir2_data_aoff_t)sizeof(*blp), &needlog, + &needscan); + if (error) + return error; + /* * Update the tail (entry count). */ @@ -541,9 +545,11 @@ xfs_dir2_block_addname( /* * Mark space for the data entry used. */ - xfs_dir2_data_use_free(args, bp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), - (xfs_dir2_data_aoff_t)len, &needlog, &needscan); + error = xfs_dir2_data_use_free(args, bp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), + (xfs_dir2_data_aoff_t)len, &needlog, &needscan); + if (error) + return error; /* * Create the new data entry. */ @@ -997,8 +1003,10 @@ xfs_dir2_leaf_to_block( /* * Use up the space at the end of the block (blp/btp). */ - xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size, - &needlog, &needscan); + error = xfs_dir2_data_use_free(args, dbp, dup, + args->geo->blksize - size, size, &needlog, &needscan); + if (error) + return error; /* * Initialize the block tail. */ @@ -1110,18 +1118,14 @@ xfs_dir2_sf_to_block( * Add block 0 to the inode. */ error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno); - if (error) { - kmem_free(sfp); - return error; - } + if (error) + goto out_free; /* * Initialize the data block, then convert it to block format. */ error = xfs_dir3_data_init(args, blkno, &bp); - if (error) { - kmem_free(sfp); - return error; - } + if (error) + goto out_free; xfs_dir3_block_init(mp, tp, bp, dp); hdr = bp->b_addr; @@ -1136,8 +1140,10 @@ xfs_dir2_sf_to_block( */ dup = dp->d_ops->data_unused_p(hdr); needlog = needscan = 0; - xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i, - i, &needlog, &needscan); + error = xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i, + i, &needlog, &needscan); + if (error) + goto out_free; ASSERT(needscan == 0); /* * Fill in the tail. @@ -1150,9 +1156,11 @@ xfs_dir2_sf_to_block( /* * Remove the freespace, we'll manage it. */ - xfs_dir2_data_use_free(args, bp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), - be16_to_cpu(dup->length), &needlog, &needscan); + error = xfs_dir2_data_use_free(args, bp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), + be16_to_cpu(dup->length), &needlog, &needscan); + if (error) + goto out_free; /* * Create entry for . */ @@ -1256,4 +1264,7 @@ xfs_dir2_sf_to_block( xfs_dir2_block_log_tail(tp, bp); xfs_dir3_data_check(dp, bp); return 0; +out_free: + kmem_free(sfp); + return error; } diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 920279485275..cb67ec730b9b 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -932,10 +932,51 @@ xfs_dir2_data_make_free( *needscanp = needscan; } +/* Check our free data for obvious signs of corruption. */ +static inline xfs_failaddr_t +xfs_dir2_data_check_free( + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_data_unused *dup, + xfs_dir2_data_aoff_t offset, + xfs_dir2_data_aoff_t len) +{ + if (hdr->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC) && + hdr->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC) && + hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) && + hdr->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) + return __this_address; + if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG) + return __this_address; + if (offset < (char *)dup - (char *)hdr) + return __this_address; + if (offset + len > (char *)dup + be16_to_cpu(dup->length) - (char *)hdr) + return __this_address; + if ((char *)dup - (char *)hdr != + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))) + return __this_address; + return NULL; +} + +/* Sanity-check a new bestfree entry. */ +static inline xfs_failaddr_t +xfs_dir2_data_check_new_free( + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_data_free *dfp, + struct xfs_dir2_data_unused *newdup) +{ + if (dfp == NULL) + return __this_address; + if (dfp->length != newdup->length) + return __this_address; + if (be16_to_cpu(dfp->offset) != (char *)newdup - (char *)hdr) + return __this_address; + return NULL; +} + /* * Take a byte range out of an existing unused space and make it un-free. */ -void +int xfs_dir2_data_use_free( struct xfs_da_args *args, struct xfs_buf *bp, @@ -947,23 +988,19 @@ xfs_dir2_data_use_free( { xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_free_t *dfp; /* bestfree pointer */ + xfs_dir2_data_unused_t *newdup; /* new unused entry */ + xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ + struct xfs_dir2_data_free *bf; + xfs_failaddr_t fa; int matchback; /* matches end of freespace */ int matchfront; /* matches start of freespace */ int needscan; /* need to regen bestfree */ - xfs_dir2_data_unused_t *newdup; /* new unused entry */ - xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ int oldlen; /* old unused entry's length */ - struct xfs_dir2_data_free *bf; hdr = bp->b_addr; - ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || - hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); - ASSERT(offset >= (char *)dup - (char *)hdr); - ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr); - ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); + fa = xfs_dir2_data_check_free(hdr, dup, offset, len); + if (fa) + goto corrupt; /* * Look up the entry in the bestfree table. */ @@ -1008,9 +1045,9 @@ xfs_dir2_data_use_free( xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp); - ASSERT(dfp != NULL); - ASSERT(dfp->length == newdup->length); - ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr); + fa = xfs_dir2_data_check_new_free(hdr, dfp, newdup); + if (fa) + goto corrupt; /* * If we got inserted at the last slot, * that means we don't know if there was a better @@ -1036,9 +1073,9 @@ xfs_dir2_data_use_free( xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp); - ASSERT(dfp != NULL); - ASSERT(dfp->length == newdup->length); - ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr); + fa = xfs_dir2_data_check_new_free(hdr, dfp, newdup); + if (fa) + goto corrupt; /* * If we got inserted at the last slot, * that means we don't know if there was a better @@ -1084,6 +1121,11 @@ xfs_dir2_data_use_free( } } *needscanp = needscan; + return 0; +corrupt: + xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount, + hdr, __FILE__, __LINE__, fa); + return -EFSCORRUPTED; } /* Find the end of the entry data in a data/block format dir block. */ diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index d61d52da95a1..50fc9c0c5e2b 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -877,9 +877,13 @@ xfs_dir2_leaf_addname( /* * Mark the initial part of our freespace in use for the new entry. */ - xfs_dir2_data_use_free(args, dbp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, - &needlog, &needscan); + error = xfs_dir2_data_use_free(args, dbp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), + length, &needlog, &needscan); + if (error) { + xfs_trans_brelse(tp, lbp); + return error; + } /* * Initialize our new entry (at last). */ diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 0839ffe27e02..9df096cc3c37 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -1729,6 +1729,7 @@ xfs_dir2_node_addname_int( __be16 *bests; struct xfs_dir3_icfree_hdr freehdr; struct xfs_dir2_data_free *bf; + xfs_dir2_data_aoff_t aoff; dp = args->dp; mp = dp->i_mount; @@ -2023,9 +2024,13 @@ xfs_dir2_node_addname_int( /* * Mark the first part of the unused space, inuse for us. */ - xfs_dir2_data_use_free(args, dbp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, - &needlog, &needscan); + aoff = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); + error = xfs_dir2_data_use_free(args, dbp, dup, aoff, length, + &needlog, &needscan); + if (error) { + xfs_trans_brelse(tp, dbp); + return error; + } /* * Fill in the new entry and log it. */ -- cgit v1.2.3 From 30b0984d9117dd14c895265886d34335856b712b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Mar 2018 10:06:52 -0700 Subject: xfs: refactor bmap record validation Refactor the bmap validator into a more complete helper that looks for extents that run off the end of the device, overflow into the next AG, or have invalid flag states. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_bmap.c | 46 +++++++++++++++++++++++++++++++++++++++--- fs/xfs/libxfs/xfs_bmap.h | 3 +++ fs/xfs/libxfs/xfs_bmap_btree.h | 14 ------------- fs/xfs/libxfs/xfs_inode_fork.c | 12 +++++++---- 4 files changed, 54 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index daae00ed30c5..91214f21563c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1261,11 +1261,15 @@ xfs_iread_extents( */ frp = XFS_BMBT_REC_ADDR(mp, block, 1); for (j = 0; j < num_recs; j++, frp++, i++) { + xfs_failaddr_t fa; + xfs_bmbt_disk_get_all(frp, &new); - if (!xfs_bmbt_validate_extent(mp, whichfork, &new)) { - XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", - XFS_ERRLEVEL_LOW, mp); + fa = xfs_bmap_validate_extent(ip, whichfork, &new); + if (fa) { error = -EFSCORRUPTED; + xfs_inode_verifier_error(ip, error, + "xfs_iread_extents(2)", + frp, sizeof(*frp), fa); goto out_brelse; } xfs_iext_insert(ip, &icur, &new, state); @@ -6154,3 +6158,39 @@ xfs_bmap_finish_one( return error; } + +/* Check that an inode's extent does not have invalid flags or bad ranges. */ +xfs_failaddr_t +xfs_bmap_validate_extent( + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *irec) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fsblock_t endfsb; + bool isrt; + + isrt = XFS_IS_REALTIME_INODE(ip); + endfsb = irec->br_startblock + irec->br_blockcount - 1; + if (isrt) { + if (!xfs_verify_rtbno(mp, irec->br_startblock)) + return __this_address; + if (!xfs_verify_rtbno(mp, endfsb)) + return __this_address; + } else { + if (!xfs_verify_fsbno(mp, irec->br_startblock)) + return __this_address; + if (!xfs_verify_fsbno(mp, endfsb)) + return __this_address; + if (XFS_FSB_TO_AGNO(mp, irec->br_startblock) != + XFS_FSB_TO_AGNO(mp, endfsb)) + return __this_address; + } + if (irec->br_state != XFS_EXT_NORM) { + if (whichfork != XFS_DATA_FORK) + return __this_address; + if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) + return __this_address; + } + return NULL; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e36d75799cd5..f3be6416260b 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -274,4 +274,7 @@ static inline int xfs_bmap_fork_to_state(int whichfork) } } +xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, + struct xfs_bmbt_irec *irec); + #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 135b8c56d23e..e4505746ccaa 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -118,18 +118,4 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); -/* - * Check that the extent does not contain an invalid unwritten extent flag. - */ -static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork, - struct xfs_bmbt_irec *irec) -{ - if (irec->br_state == XFS_EXT_NORM) - return true; - if (whichfork == XFS_DATA_FORK && - xfs_sb_version_hasextflgbit(&mp->m_sb)) - return true; - return false; -} - #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 866d2861c625..613fba22ae4e 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -245,10 +245,14 @@ xfs_iformat_extents( xfs_iext_first(ifp, &icur); for (i = 0; i < nex; i++, dp++) { + xfs_failaddr_t fa; + xfs_bmbt_disk_get_all(dp, &new); - if (!xfs_bmbt_validate_extent(mp, whichfork, &new)) { - XFS_ERROR_REPORT("xfs_iformat_extents(2)", - XFS_ERRLEVEL_LOW, mp); + fa = xfs_bmap_validate_extent(ip, whichfork, &new); + if (fa) { + xfs_inode_verifier_error(ip, -EFSCORRUPTED, + "xfs_iformat_extents(2)", + dp, sizeof(*dp), fa); return -EFSCORRUPTED; } @@ -595,7 +599,7 @@ xfs_iextents_copy( for_each_xfs_iext(ifp, &icur, &rec) { if (isnullstartblock(rec.br_startblock)) continue; - ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, &rec)); + ASSERT(xfs_bmap_validate_extent(ip, whichfork, &rec) == NULL); xfs_bmbt_disk_set_all(dp, &rec); trace_xfs_write_extent(ip, &icur, state, _RET_IP_); copied += sizeof(struct xfs_bmbt_rec); -- cgit v1.2.3 From 90a58f95717b46f67756580ad5f8b698304e4bad Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Mar 2018 10:06:52 -0700 Subject: xfs: refactor inode verifier error logging Refactor some of the inode verifier failure logging call sites to use the new xfs_inode_verifier_error method which dumps the offending buffer as well as the code location of the failed check. This trims the output, makes it clearer to the admin that repair must be run, and gives the developers more details to work from. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_bmap.c | 5 +++-- fs/xfs/libxfs/xfs_inode_fork.c | 15 +++++++++------ 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 91214f21563c..3b03d886df66 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1244,8 +1244,9 @@ xfs_iread_extents( xfs_warn(ip->i_mount, "corrupt dinode %Lu, (btree extents).", (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR(__func__, - XFS_ERRLEVEL_LOW, ip->i_mount, block); + xfs_inode_verifier_error(ip, -EFSCORRUPTED, + __func__, block, sizeof(*block), + __this_address); error = -EFSCORRUPTED; goto out_brelse; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 613fba22ae4e..701c42a28d05 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -195,8 +195,9 @@ xfs_iformat_local( "corrupt inode %Lu (bad size %d for local fork, size = %d).", (unsigned long long) ip->i_ino, size, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); - XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); + xfs_inode_verifier_error(ip, -EFSCORRUPTED, + "xfs_iformat_local", dip, sizeof(*dip), + __this_address); return -EFSCORRUPTED; } @@ -231,8 +232,9 @@ xfs_iformat_extents( if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) { xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", (unsigned long long) ip->i_ino, nex); - XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, - mp, dip); + xfs_inode_verifier_error(ip, -EFSCORRUPTED, + "xfs_iformat_extents(1)", dip, sizeof(*dip), + __this_address); return -EFSCORRUPTED; } @@ -309,8 +311,9 @@ xfs_iformat_btree( level == 0 || level > XFS_BTREE_MAXLEVELS) { xfs_warn(mp, "corrupt inode %Lu (btree).", (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, - mp, dip); + xfs_inode_verifier_error(ip, -EFSCORRUPTED, + "xfs_iformat_btree", dfp, size, + __this_address); return -EFSCORRUPTED; } -- cgit v1.2.3 From 6edb181053f067cee64d4239830062cb40ddab00 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Mar 2018 10:06:53 -0700 Subject: xfs: refactor inode buffer verifier error logging When the inode buffer verifier encounters an error, it's much more helpful to print a buffer from the offending inode instead of just the start of the inode chunk buffer. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_inode_buf.c | 4 +++- fs/xfs/xfs_error.c | 29 ++++++++++++++++++++++++----- fs/xfs/xfs_error.h | 3 +++ 3 files changed, 30 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 4fe17b368316..51019e5dc16d 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -115,13 +115,15 @@ xfs_inode_buf_verify( return; } - xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); #ifdef DEBUG xfs_alert(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", (unsigned long long)bp->b_bn, i, be16_to_cpu(dip->di_magic)); #endif + xfs_buf_verifier_error(bp, -EFSCORRUPTED, + __func__, dip, sizeof(*dip), + NULL); } } xfs_inobp_check(mp, bp); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ccf520f0b00d..a63f5083f497 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -347,33 +347,52 @@ xfs_corruption_error( * values, and omit the stack trace unless the error level is tuned high. */ void -xfs_verifier_error( +xfs_buf_verifier_error( struct xfs_buf *bp, int error, + const char *name, + void *buf, + size_t bufsz, xfs_failaddr_t failaddr) { struct xfs_mount *mp = bp->b_target->bt_mount; xfs_failaddr_t fa; + int sz; fa = failaddr ? failaddr : __return_address; __xfs_buf_ioerror(bp, error, fa); - xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx", + xfs_alert(mp, "Metadata %s detected at %pS, %s block 0x%llx %s", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", - fa, bp->b_ops->name, bp->b_bn); + fa, bp->b_ops->name, bp->b_bn, name); xfs_alert(mp, "Unmount and run xfs_repair"); if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + sz = min_t(size_t, XFS_CORRUPTION_DUMP_LEN, bufsz); xfs_alert(mp, "First %d bytes of corrupted metadata buffer:", - XFS_CORRUPTION_DUMP_LEN); - xfs_hex_dump(xfs_buf_offset(bp, 0), XFS_CORRUPTION_DUMP_LEN); + sz); + xfs_hex_dump(buf, sz); } if (xfs_error_level >= XFS_ERRLEVEL_HIGH) xfs_stack_trace(); } +/* + * Warnings specifically for verifier errors. Differentiate CRC vs. invalid + * values, and omit the stack trace unless the error level is tuned high. + */ +void +xfs_verifier_error( + struct xfs_buf *bp, + int error, + xfs_failaddr_t failaddr) +{ + return xfs_buf_verifier_error(bp, error, "", xfs_buf_offset(bp, 0), + XFS_CORRUPTION_DUMP_LEN, failaddr); +} + /* * Warnings for inode corruption problems. Don't bother with the stack * trace unless the error level is turned up high. diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 7e728c5a46b8..ce391349e78b 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -26,6 +26,9 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, void *p, const char *filename, int linenum, xfs_failaddr_t failaddr); +extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error, + const char *name, void *buf, size_t bufsz, + xfs_failaddr_t failaddr); extern void xfs_verifier_error(struct xfs_buf *bp, int error, xfs_failaddr_t failaddr); extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error, -- cgit v1.2.3 From 5e777b62b0bcb645f165fe5e056fe8862782affc Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Mar 2018 10:06:53 -0700 Subject: xfs: bmap scrubber should do rmap xref with bmap for sparse files When we're scanning an extent mapping inode fork, ensure that every rmap record for this ifork has a corresponding bmbt record too. This (mostly) provides the ability to cross-reference rmap records with bmap data. The rmap scrubber cannot do the xref on its own because that requires taking an ilock with the agf lock held, which violates our locking order rules (inode, then agf). Note that we only do this for forks that are in btree format due to the increased complexity; or forks that should have data but suspiciously have zero extents because the inode could have just had its iforks zapped by the inode repair code and now we need to reclaim the old extents. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/bmap.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 169 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index d00282130492..75ea2d63429c 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -37,6 +37,7 @@ #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" #include "xfs_rmap.h" +#include "xfs_rmap_btree.h" #include "xfs_refcount.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" @@ -423,6 +424,169 @@ xfs_scrub_bmap_btree( return error; } +struct xfs_scrub_bmap_check_rmap_info { + struct xfs_scrub_context *sc; + int whichfork; + struct xfs_iext_cursor icur; +}; + +/* Can we find bmaps that fit this rmap? */ +STATIC int +xfs_scrub_bmap_check_rmap( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_bmbt_irec irec; + struct xfs_scrub_bmap_check_rmap_info *sbcri = priv; + struct xfs_ifork *ifp; + struct xfs_scrub_context *sc = sbcri->sc; + bool have_map; + + /* Is this even the right fork? */ + if (rec->rm_owner != sc->ip->i_ino) + return 0; + if ((sbcri->whichfork == XFS_ATTR_FORK) ^ + !!(rec->rm_flags & XFS_RMAP_ATTR_FORK)) + return 0; + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) + return 0; + + /* Now look up the bmbt record. */ + ifp = XFS_IFORK_PTR(sc->ip, sbcri->whichfork); + if (!ifp) { + xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork, + rec->rm_offset); + goto out; + } + have_map = xfs_iext_lookup_extent(sc->ip, ifp, rec->rm_offset, + &sbcri->icur, &irec); + if (!have_map) + xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork, + rec->rm_offset); + /* + * bmap extent record lengths are constrained to 2^21 blocks in length + * because of space constraints in the on-disk metadata structure. + * However, rmap extent record lengths are constrained only by AG + * length, so we have to loop through the bmbt to make sure that the + * entire rmap is covered by bmbt records. + */ + while (have_map) { + if (irec.br_startoff != rec->rm_offset) + xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork, + rec->rm_offset); + if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp, + cur->bc_private.a.agno, rec->rm_startblock)) + xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork, + rec->rm_offset); + if (irec.br_blockcount > rec->rm_blockcount) + xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork, + rec->rm_offset); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + break; + rec->rm_startblock += irec.br_blockcount; + rec->rm_offset += irec.br_blockcount; + rec->rm_blockcount -= irec.br_blockcount; + if (rec->rm_blockcount == 0) + break; + have_map = xfs_iext_next_extent(ifp, &sbcri->icur, &irec); + if (!have_map) + xfs_scrub_fblock_set_corrupt(sc, sbcri->whichfork, + rec->rm_offset); + } + +out: + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return XFS_BTREE_QUERY_RANGE_ABORT; + return 0; +} + +/* Make sure each rmap has a corresponding bmbt entry. */ +STATIC int +xfs_scrub_bmap_check_ag_rmaps( + struct xfs_scrub_context *sc, + int whichfork, + xfs_agnumber_t agno) +{ + struct xfs_scrub_bmap_check_rmap_info sbcri; + struct xfs_btree_cur *cur; + struct xfs_buf *agf; + int error; + + error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf); + if (error) + return error; + + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf, agno); + if (!cur) { + error = -ENOMEM; + goto out_agf; + } + + sbcri.sc = sc; + sbcri.whichfork = whichfork; + error = xfs_rmap_query_all(cur, xfs_scrub_bmap_check_rmap, &sbcri); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) + error = 0; + + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); +out_agf: + xfs_trans_brelse(sc->tp, agf); + return error; +} + +/* Make sure each rmap has a corresponding bmbt entry. */ +STATIC int +xfs_scrub_bmap_check_rmaps( + struct xfs_scrub_context *sc, + int whichfork) +{ + loff_t size; + xfs_agnumber_t agno; + int error; + + if (!xfs_sb_version_hasrmapbt(&sc->mp->m_sb) || + whichfork == XFS_COW_FORK || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return 0; + + /* Don't support realtime rmap checks yet. */ + if (XFS_IS_REALTIME_INODE(sc->ip) && whichfork == XFS_DATA_FORK) + return 0; + + /* + * Only do this for complex maps that are in btree format, or for + * situations where we would seem to have a size but zero extents. + * The inode repair code can zap broken iforks, which means we have + * to flag this bmap as corrupt if there are rmaps that need to be + * reattached. + */ + switch (whichfork) { + case XFS_DATA_FORK: + size = i_size_read(VFS_I(sc->ip)); + break; + case XFS_ATTR_FORK: + size = XFS_IFORK_Q(sc->ip); + break; + default: + size = 0; + break; + } + if (XFS_IFORK_FORMAT(sc->ip, whichfork) != XFS_DINODE_FMT_BTREE && + (size == 0 || XFS_IFORK_NEXTENTS(sc->ip, whichfork) > 0)) + return 0; + + for (agno = 0; agno < sc->mp->m_sb.sb_agcount; agno++) { + error = xfs_scrub_bmap_check_ag_rmaps(sc, whichfork, agno); + if (error) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + break; + } + + return 0; +} + /* * Scrub an inode fork's block mappings. * @@ -463,7 +627,7 @@ xfs_scrub_bmap( break; case XFS_ATTR_FORK: if (!ifp) - goto out; + goto out_check_rmap; if (!xfs_sb_version_hasattr(&mp->m_sb) && !xfs_sb_version_hasattr2(&mp->m_sb)) xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); @@ -534,6 +698,10 @@ xfs_scrub_bmap( goto out; } +out_check_rmap: + error = xfs_scrub_bmap_check_rmaps(sc, whichfork); + if (!xfs_scrub_fblock_xref_process_error(sc, whichfork, 0, &error)) + goto out; out: return error; } -- cgit v1.2.3 From d0018ad88909a959e17132d694a7ad917a14883f Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Mar 2018 10:06:54 -0700 Subject: xfs: inode scrubber shouldn't bother with raw checks The inode scrubber tries to _iget the inode prior to running checks. If that _iget call fails with corruption errors that's an automatic fail, regardless of whether it was the inode buffer read verifier, the ifork verifier, or the ifork formatter that errored out. Therefore, get rid of the raw mode scrub code because it's not needed. Found by trying to fix some test failures in xfs/379 and xfs/415. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/inode.c | 102 +++++++-------------------------------------------- 1 file changed, 13 insertions(+), 89 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 21297bef8df1..177e9788030d 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -515,72 +515,6 @@ xfs_scrub_dinode( } } -/* Map and read a raw inode. */ -STATIC int -xfs_scrub_inode_map_raw( - struct xfs_scrub_context *sc, - xfs_ino_t ino, - struct xfs_buf **bpp, - struct xfs_dinode **dipp) -{ - struct xfs_imap imap; - struct xfs_mount *mp = sc->mp; - struct xfs_buf *bp = NULL; - struct xfs_dinode *dip; - int error; - - error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED); - if (error == -EINVAL) { - /* - * Inode could have gotten deleted out from under us; - * just forget about it. - */ - error = -ENOENT; - goto out; - } - if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino), - XFS_INO_TO_AGBNO(mp, ino), &error)) - goto out; - - error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp, - NULL); - if (!xfs_scrub_process_error(sc, XFS_INO_TO_AGNO(mp, ino), - XFS_INO_TO_AGBNO(mp, ino), &error)) - goto out; - - /* - * Is this really an inode? We disabled verifiers in the above - * xfs_trans_read_buf call because the inode buffer verifier - * fails on /any/ inode record in the inode cluster with a bad - * magic or version number, not just the one that we're - * checking. Therefore, grab the buffer unconditionally, attach - * the inode verifiers by hand, and run the inode verifier only - * on the one inode we want. - */ - bp->b_ops = &xfs_inode_buf_ops; - dip = xfs_buf_offset(bp, imap.im_boffset); - if (xfs_dinode_verify(mp, ino, dip) != NULL || - !xfs_dinode_good_version(mp, dip->di_version)) { - xfs_scrub_ino_set_corrupt(sc, ino, bp); - goto out_buf; - } - - /* ...and is it the one we asked for? */ - if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) { - error = -ENOENT; - goto out_buf; - } - - *dipp = dip; - *bpp = bp; -out: - return error; -out_buf: - xfs_trans_brelse(sc->tp, bp); - return error; -} - /* * Make sure the finobt doesn't think this inode is free. * We don't have to check the inobt ourselves because we got the inode via @@ -727,43 +661,33 @@ xfs_scrub_inode( struct xfs_scrub_context *sc) { struct xfs_dinode di; - struct xfs_buf *bp = NULL; - struct xfs_dinode *dip; - xfs_ino_t ino; int error = 0; - /* Did we get the in-core inode, or are we doing this manually? */ - if (sc->ip) { - ino = sc->ip->i_ino; - xfs_inode_to_disk(sc->ip, &di, 0); - dip = &di; - } else { - /* Map & read inode. */ - ino = sc->sm->sm_ino; - error = xfs_scrub_inode_map_raw(sc, ino, &bp, &dip); - if (error || !bp) - goto out; + /* + * If sc->ip is NULL, that means that the setup function called + * xfs_iget to look up the inode. xfs_iget returned a EFSCORRUPTED + * and a NULL inode, so flag the corruption error and return. + */ + if (!sc->ip) { + xfs_scrub_ino_set_corrupt(sc, sc->sm->sm_ino, NULL); + return 0; } - xfs_scrub_dinode(sc, bp, dip, ino); + /* Scrub the inode core. */ + xfs_inode_to_disk(sc->ip, &di, 0); + xfs_scrub_dinode(sc, NULL, &di, sc->ip->i_ino); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out; - /* Now let's do the things that require a live inode. */ - if (!sc->ip) - goto out; - /* * Look for discrepancies between file's data blocks and the reflink * iflag. We already checked the iflag against the file mode when * we scrubbed the dinode. */ if (S_ISREG(VFS_I(sc->ip)->i_mode)) - xfs_scrub_inode_check_reflink_iflag(sc, ino, bp); + xfs_scrub_inode_check_reflink_iflag(sc, sc->ip->i_ino, NULL); - xfs_scrub_inode_xref(sc, ino, dip); + xfs_scrub_inode_xref(sc, sc->ip->i_ino, &di); out: - if (bp) - xfs_trans_brelse(sc->tp, bp); return error; } -- cgit v1.2.3 From 7e56d9eaea1397efbac7e6813cbb74066586fdd4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Mar 2018 10:06:54 -0700 Subject: xfs: remove xfs_buf parameter from inode scrub methods Now that we no longer do raw inode buffer scrubbing, the bp parameter is no longer used anywhere we're dealing with an inode, so remove it and all the useless NULL parameters that go with it. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/attr.c | 2 +- fs/xfs/scrub/bmap.c | 4 +- fs/xfs/scrub/common.c | 22 ++++------ fs/xfs/scrub/common.h | 13 +++--- fs/xfs/scrub/dir.c | 2 +- fs/xfs/scrub/inode.c | 106 +++++++++++++++++++++++------------------------- fs/xfs/scrub/quota.c | 2 +- fs/xfs/scrub/rtbitmap.c | 3 +- fs/xfs/scrub/trace.h | 31 +++----------- 9 files changed, 74 insertions(+), 111 deletions(-) (limited to 'fs') diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 4ed80474f545..127575f0abfb 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -98,7 +98,7 @@ xfs_scrub_xattr_listent( if (flags & XFS_ATTR_INCOMPLETE) { /* Incomplete attr key, just mark the inode for preening. */ - xfs_scrub_ino_set_preen(sx->sc, context->dp->i_ino, NULL); + xfs_scrub_ino_set_preen(sx->sc, context->dp->i_ino); return; } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 75ea2d63429c..639d14b51e90 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -621,7 +621,7 @@ xfs_scrub_bmap( goto out; /* No CoW forks on non-reflink inodes/filesystems. */ if (!xfs_is_reflink_inode(ip)) { - xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); goto out; } break; @@ -630,7 +630,7 @@ xfs_scrub_bmap( goto out_check_rmap; if (!xfs_sb_version_hasattr(&mp->m_sb) && !xfs_sb_version_hasattr2(&mp->m_sb)) - xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); break; default: ASSERT(whichfork == XFS_DATA_FORK); diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index ddcdda336402..8ed91d5c868d 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -213,12 +213,10 @@ xfs_scrub_block_set_preen( void xfs_scrub_ino_set_preen( struct xfs_scrub_context *sc, - xfs_ino_t ino, - struct xfs_buf *bp) + xfs_ino_t ino) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; - trace_xfs_scrub_ino_preen(sc, ino, bp ? bp->b_bn : 0, - __return_address); + trace_xfs_scrub_ino_preen(sc, ino, __return_address); } /* Record a corrupt block. */ @@ -249,22 +247,20 @@ xfs_scrub_block_xref_set_corrupt( void xfs_scrub_ino_set_corrupt( struct xfs_scrub_context *sc, - xfs_ino_t ino, - struct xfs_buf *bp) + xfs_ino_t ino) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; - trace_xfs_scrub_ino_error(sc, ino, bp ? bp->b_bn : 0, __return_address); + trace_xfs_scrub_ino_error(sc, ino, __return_address); } /* Record a corruption while cross-referencing with an inode. */ void xfs_scrub_ino_xref_set_corrupt( struct xfs_scrub_context *sc, - xfs_ino_t ino, - struct xfs_buf *bp) + xfs_ino_t ino) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT; - trace_xfs_scrub_ino_error(sc, ino, bp ? bp->b_bn : 0, __return_address); + trace_xfs_scrub_ino_error(sc, ino, __return_address); } /* Record corruption in a block indexed by a file fork. */ @@ -296,12 +292,10 @@ xfs_scrub_fblock_xref_set_corrupt( void xfs_scrub_ino_set_warning( struct xfs_scrub_context *sc, - xfs_ino_t ino, - struct xfs_buf *bp) + xfs_ino_t ino) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_WARNING; - trace_xfs_scrub_ino_warning(sc, ino, bp ? bp->b_bn : 0, - __return_address); + trace_xfs_scrub_ino_warning(sc, ino, __return_address); } /* Warn about a block indexed by a file fork that needs review. */ diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index ddb65d22c76a..deaf60400981 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -63,25 +63,22 @@ bool xfs_scrub_fblock_xref_process_error(struct xfs_scrub_context *sc, void xfs_scrub_block_set_preen(struct xfs_scrub_context *sc, struct xfs_buf *bp); -void xfs_scrub_ino_set_preen(struct xfs_scrub_context *sc, xfs_ino_t ino, - struct xfs_buf *bp); +void xfs_scrub_ino_set_preen(struct xfs_scrub_context *sc, xfs_ino_t ino); void xfs_scrub_block_set_corrupt(struct xfs_scrub_context *sc, struct xfs_buf *bp); -void xfs_scrub_ino_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino, - struct xfs_buf *bp); +void xfs_scrub_ino_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino); void xfs_scrub_fblock_set_corrupt(struct xfs_scrub_context *sc, int whichfork, xfs_fileoff_t offset); void xfs_scrub_block_xref_set_corrupt(struct xfs_scrub_context *sc, struct xfs_buf *bp); -void xfs_scrub_ino_xref_set_corrupt(struct xfs_scrub_context *sc, xfs_ino_t ino, - struct xfs_buf *bp); +void xfs_scrub_ino_xref_set_corrupt(struct xfs_scrub_context *sc, + xfs_ino_t ino); void xfs_scrub_fblock_xref_set_corrupt(struct xfs_scrub_context *sc, int whichfork, xfs_fileoff_t offset); -void xfs_scrub_ino_set_warning(struct xfs_scrub_context *sc, xfs_ino_t ino, - struct xfs_buf *bp); +void xfs_scrub_ino_set_warning(struct xfs_scrub_context *sc, xfs_ino_t ino); void xfs_scrub_fblock_set_warning(struct xfs_scrub_context *sc, int whichfork, xfs_fileoff_t offset); diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 50b6a26b0299..38f29806eb54 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -781,7 +781,7 @@ xfs_scrub_directory( /* Plausible size? */ if (sc->ip->i_d.di_size < xfs_dir2_sf_hdr_size(0)) { - xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); goto out; } diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 177e9788030d..9be1655e7e9f 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -98,7 +98,6 @@ out: STATIC void xfs_scrub_inode_extsize( struct xfs_scrub_context *sc, - struct xfs_buf *bp, struct xfs_dinode *dip, xfs_ino_t ino, uint16_t mode, @@ -149,7 +148,7 @@ xfs_scrub_inode_extsize( return; bad: - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); } /* @@ -161,7 +160,6 @@ bad: STATIC void xfs_scrub_inode_cowextsize( struct xfs_scrub_context *sc, - struct xfs_buf *bp, struct xfs_dinode *dip, xfs_ino_t ino, uint16_t mode, @@ -205,14 +203,13 @@ xfs_scrub_inode_cowextsize( return; bad: - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); } /* Make sure the di_flags make sense for the inode. */ STATIC void xfs_scrub_inode_flags( struct xfs_scrub_context *sc, - struct xfs_buf *bp, struct xfs_dinode *dip, xfs_ino_t ino, uint16_t mode, @@ -251,14 +248,13 @@ xfs_scrub_inode_flags( return; bad: - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); } /* Make sure the di_flags2 make sense for the inode. */ STATIC void xfs_scrub_inode_flags2( struct xfs_scrub_context *sc, - struct xfs_buf *bp, struct xfs_dinode *dip, xfs_ino_t ino, uint16_t mode, @@ -295,14 +291,13 @@ xfs_scrub_inode_flags2( return; bad: - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); } /* Scrub all the ondisk inode fields. */ STATIC void xfs_scrub_dinode( struct xfs_scrub_context *sc, - struct xfs_buf *bp, struct xfs_dinode *dip, xfs_ino_t ino) { @@ -333,7 +328,7 @@ xfs_scrub_dinode( /* mode is recognized */ break; default: - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; } @@ -344,22 +339,22 @@ xfs_scrub_dinode( * We autoconvert v1 inodes into v2 inodes on writeout, * so just mark this inode for preening. */ - xfs_scrub_ino_set_preen(sc, ino, bp); + xfs_scrub_ino_set_preen(sc, ino); break; case 2: case 3: if (dip->di_onlink != 0) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); if (dip->di_mode == 0 && sc->ip) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); if (dip->di_projid_hi != 0 && !xfs_sb_version_hasprojid32bit(&mp->m_sb)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; default: - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); return; } @@ -369,40 +364,40 @@ xfs_scrub_dinode( */ if (dip->di_uid == cpu_to_be32(-1U) || dip->di_gid == cpu_to_be32(-1U)) - xfs_scrub_ino_set_warning(sc, ino, bp); + xfs_scrub_ino_set_warning(sc, ino); /* di_format */ switch (dip->di_format) { case XFS_DINODE_FMT_DEV: if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode) && !S_ISSOCK(mode)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_LOCAL: if (!S_ISDIR(mode) && !S_ISLNK(mode)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_EXTENTS: if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_BTREE: if (!S_ISREG(mode) && !S_ISDIR(mode)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_UUID: default: - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; } /* di_[amc]time.nsec */ if (be32_to_cpu(dip->di_atime.t_nsec) >= NSEC_PER_SEC) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); if (be32_to_cpu(dip->di_mtime.t_nsec) >= NSEC_PER_SEC) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); if (be32_to_cpu(dip->di_ctime.t_nsec) >= NSEC_PER_SEC) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); /* * di_size. xfs_dinode_verify checks for things that screw up @@ -411,19 +406,19 @@ xfs_scrub_dinode( */ isize = be64_to_cpu(dip->di_size); if (isize & (1ULL << 63)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); /* Devices, fifos, and sockets must have zero size */ if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); /* Directories can't be larger than the data section size (32G) */ if (S_ISDIR(mode) && (isize == 0 || isize >= XFS_DIR2_SPACE_SIZE)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); /* Symlinks can't be larger than SYMLINK_MAXLEN */ if (S_ISLNK(mode) && (isize == 0 || isize >= XFS_SYMLINK_MAXLEN)) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); /* * Warn if the running kernel can't handle the kinds of offsets @@ -432,7 +427,7 @@ xfs_scrub_dinode( * overly large offsets, flag the inode for admin review. */ if (isize >= mp->m_super->s_maxbytes) - xfs_scrub_ino_set_warning(sc, ino, bp); + xfs_scrub_ino_set_warning(sc, ino); /* di_nblocks */ if (flags2 & XFS_DIFLAG2_REFLINK) { @@ -447,15 +442,15 @@ xfs_scrub_dinode( */ if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); } else { if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); } - xfs_scrub_inode_flags(sc, bp, dip, ino, mode, flags); + xfs_scrub_inode_flags(sc, dip, ino, mode, flags); - xfs_scrub_inode_extsize(sc, bp, dip, ino, mode, flags); + xfs_scrub_inode_extsize(sc, dip, ino, mode, flags); /* di_nextents */ nextents = be32_to_cpu(dip->di_nextents); @@ -463,31 +458,31 @@ xfs_scrub_dinode( switch (dip->di_format) { case XFS_DINODE_FMT_EXTENTS: if (nextents > fork_recs) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_BTREE: if (nextents <= fork_recs) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; default: if (nextents != 0) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; } /* di_forkoff */ if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); if (dip->di_anextents != 0 && dip->di_forkoff == 0) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); /* di_aformat */ if (dip->di_aformat != XFS_DINODE_FMT_LOCAL && dip->di_aformat != XFS_DINODE_FMT_EXTENTS && dip->di_aformat != XFS_DINODE_FMT_BTREE) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); /* di_anextents */ nextents = be16_to_cpu(dip->di_anextents); @@ -495,22 +490,22 @@ xfs_scrub_dinode( switch (dip->di_aformat) { case XFS_DINODE_FMT_EXTENTS: if (nextents > fork_recs) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; case XFS_DINODE_FMT_BTREE: if (nextents <= fork_recs) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); break; default: if (nextents != 0) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); } if (dip->di_version >= 3) { if (be32_to_cpu(dip->di_crtime.t_nsec) >= NSEC_PER_SEC) - xfs_scrub_ino_set_corrupt(sc, ino, bp); - xfs_scrub_inode_flags2(sc, bp, dip, ino, mode, flags, flags2); - xfs_scrub_inode_cowextsize(sc, bp, dip, ino, mode, flags, + xfs_scrub_ino_set_corrupt(sc, ino); + xfs_scrub_inode_flags2(sc, dip, ino, mode, flags, flags2); + xfs_scrub_inode_cowextsize(sc, dip, ino, mode, flags, flags2); } } @@ -579,18 +574,18 @@ xfs_scrub_inode_xref_bmap( if (!xfs_scrub_should_check_xref(sc, &error, NULL)) return; if (nextents < be32_to_cpu(dip->di_nextents)) - xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL); + xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino); error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, &nextents, &acount); if (!xfs_scrub_should_check_xref(sc, &error, NULL)) return; if (nextents != be16_to_cpu(dip->di_anextents)) - xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL); + xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino); /* Check nblocks against the inode. */ if (count + acount != be64_to_cpu(dip->di_nblocks)) - xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino, NULL); + xfs_scrub_ino_xref_set_corrupt(sc, sc->ip->i_ino); } /* Cross-reference with the other btrees. */ @@ -634,8 +629,7 @@ xfs_scrub_inode_xref( static void xfs_scrub_inode_check_reflink_iflag( struct xfs_scrub_context *sc, - xfs_ino_t ino, - struct xfs_buf *bp) + xfs_ino_t ino) { struct xfs_mount *mp = sc->mp; bool has_shared; @@ -650,9 +644,9 @@ xfs_scrub_inode_check_reflink_iflag( XFS_INO_TO_AGBNO(mp, ino), &error)) return; if (xfs_is_reflink_inode(sc->ip) && !has_shared) - xfs_scrub_ino_set_preen(sc, ino, bp); + xfs_scrub_ino_set_preen(sc, ino); else if (!xfs_is_reflink_inode(sc->ip) && has_shared) - xfs_scrub_ino_set_corrupt(sc, ino, bp); + xfs_scrub_ino_set_corrupt(sc, ino); } /* Scrub an inode. */ @@ -669,13 +663,13 @@ xfs_scrub_inode( * and a NULL inode, so flag the corruption error and return. */ if (!sc->ip) { - xfs_scrub_ino_set_corrupt(sc, sc->sm->sm_ino, NULL); + xfs_scrub_ino_set_corrupt(sc, sc->sm->sm_ino); return 0; } /* Scrub the inode core. */ xfs_inode_to_disk(sc->ip, &di, 0); - xfs_scrub_dinode(sc, NULL, &di, sc->ip->i_ino); + xfs_scrub_dinode(sc, &di, sc->ip->i_ino); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) goto out; @@ -685,7 +679,7 @@ xfs_scrub_inode( * we scrubbed the dinode. */ if (S_ISREG(VFS_I(sc->ip)->i_mode)) - xfs_scrub_inode_check_reflink_iflag(sc, sc->ip->i_ino, NULL); + xfs_scrub_inode_check_reflink_iflag(sc, sc->ip->i_ino); xfs_scrub_inode_xref(sc, sc->ip->i_ino, &di); out: diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 51daa4ae2627..6ba465e6c885 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -219,7 +219,7 @@ xfs_scrub_quota( /* Look for problem extents. */ xfs_ilock(ip, XFS_ILOCK_EXCL); if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) { - xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino, NULL); + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); goto out_unlock_inode; } max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 26390991369a..39c41dfe08ee 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -116,8 +116,7 @@ xfs_scrub_xref_is_used_rt_space( if (!xfs_scrub_should_check_xref(sc, &error, NULL)) goto out_unlock; if (is_free) - xfs_scrub_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino, - NULL); + xfs_scrub_ino_xref_set_corrupt(sc, sc->mp->m_rbmip->i_ino); out_unlock: xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); } diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 4dc896852bf0..5d2b1c241be5 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -174,53 +174,32 @@ DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_error); DEFINE_SCRUB_BLOCK_ERROR_EVENT(xfs_scrub_block_preen); DECLARE_EVENT_CLASS(xfs_scrub_ino_error_class, - TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, xfs_daddr_t daddr, - void *ret_ip), - TP_ARGS(sc, ino, daddr, ret_ip), + TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, void *ret_ip), + TP_ARGS(sc, ino, ret_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(unsigned int, type) - __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, bno) __field(void *, ret_ip) ), TP_fast_assign( - xfs_fsblock_t fsbno; - xfs_agnumber_t agno; - xfs_agblock_t bno; - - if (daddr) { - fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr); - agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); - bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); - } else { - agno = XFS_INO_TO_AGNO(sc->mp, ino); - bno = XFS_AGINO_TO_AGBNO(sc->mp, - XFS_INO_TO_AGINO(sc->mp, ino)); - } - __entry->dev = sc->mp->m_super->s_dev; __entry->ino = ino; __entry->type = sc->sm->sm_type; - __entry->agno = agno; - __entry->bno = bno; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx type %u agno %u agbno %u ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx type %u ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->type, - __entry->agno, - __entry->bno, __entry->ret_ip) ) #define DEFINE_SCRUB_INO_ERROR_EVENT(name) \ DEFINE_EVENT(xfs_scrub_ino_error_class, name, \ TP_PROTO(struct xfs_scrub_context *sc, xfs_ino_t ino, \ - xfs_daddr_t daddr, void *ret_ip), \ - TP_ARGS(sc, ino, daddr, ret_ip)) + void *ret_ip), \ + TP_ARGS(sc, ino, ret_ip)) DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_error); DEFINE_SCRUB_INO_ERROR_EVENT(xfs_scrub_ino_preen); -- cgit v1.2.3 From 1b44a6aecc1193336f228204d2f3ae9f7737e78c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Mar 2018 10:06:55 -0700 Subject: xfs: record inode buf errors as a xref error in inobt scrubber During the inode btree scrubs we try to confirm the freemask bits against the inode records. If the inode buffer read fails, this is a cross-referencing error, not a corruption of the inode btree itself. Use the xref_process_error call here. Found via core.version middlebit fuzz in xfs/415. The userspace xfs_scrub program will try to repair outright corruptions in the agi/inobt prior to phase 3 so that the inode scan will proceed. If only a cross-referencing error is noted, the repair program defers the repair attempt until it can check the other space metadata at least once. It is therefore essential that the inobt scrubber can correctly distinguish between corruptions and "unable to cross-reference something else with this inobt". Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/ialloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 63ab3f98430d..32e0d1ae0056 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -259,7 +259,8 @@ xfs_scrub_iallocbt_check_freemask( error = xfs_imap_to_bp(mp, bs->cur->bc_tp, &imap, &dip, &bp, 0, 0); - if (!xfs_scrub_btree_process_error(bs->sc, bs->cur, 0, &error)) + if (!xfs_scrub_btree_xref_process_error(bs->sc, bs->cur, 0, + &error)) continue; /* Which inodes are free? */ -- cgit v1.2.3 From 8bb82bc12a9e75dd47047d9a2e53135cc5e5787b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Mar 2018 10:06:55 -0700 Subject: xfs: move inode extent size hint validation to libxfs Extent size hint validation is used by scrub to decide if there's an error, and it will be used by repair to decide to remove the hint. Since these use the same validation functions, move them to libxfs. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_inode_buf.c | 105 ++++++++++++++++++++++++++++++++++++++++++ fs/xfs/libxfs/xfs_inode_buf.h | 5 ++ fs/xfs/scrub/inode.c | 100 +++++----------------------------------- 3 files changed, 122 insertions(+), 88 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 51019e5dc16d..cdd4c1d58e80 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -651,3 +651,108 @@ xfs_iread( xfs_trans_brelse(tp, bp); return error; } + +/* + * Validate di_extsize hint. + * + * The rules are documented at xfs_ioctl_setattr_check_extsize(). + * These functions must be kept in sync with each other. + */ +xfs_failaddr_t +xfs_inode_validate_extsize( + struct xfs_mount *mp, + uint32_t extsize, + uint16_t mode, + uint16_t flags) +{ + bool rt_flag; + bool hint_flag; + bool inherit_flag; + uint32_t extsize_bytes; + uint32_t blocksize_bytes; + + rt_flag = (flags & XFS_DIFLAG_REALTIME); + hint_flag = (flags & XFS_DIFLAG_EXTSIZE); + inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT); + extsize_bytes = XFS_FSB_TO_B(mp, extsize); + + if (rt_flag) + blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; + else + blocksize_bytes = mp->m_sb.sb_blocksize; + + if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode))) + return __this_address; + + if (hint_flag && !S_ISREG(mode)) + return __this_address; + + if (inherit_flag && !S_ISDIR(mode)) + return __this_address; + + if ((hint_flag || inherit_flag) && extsize == 0) + return __this_address; + + if (!(hint_flag || inherit_flag) && extsize != 0) + return __this_address; + + if (extsize_bytes % blocksize_bytes) + return __this_address; + + if (extsize > MAXEXTLEN) + return __this_address; + + if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2) + return __this_address; + + return NULL; +} + +/* + * Validate di_cowextsize hint. + * + * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). + * These functions must be kept in sync with each other. + */ +xfs_failaddr_t +xfs_inode_validate_cowextsize( + struct xfs_mount *mp, + uint32_t cowextsize, + uint16_t mode, + uint16_t flags, + uint64_t flags2) +{ + bool rt_flag; + bool hint_flag; + uint32_t cowextsize_bytes; + + rt_flag = (flags & XFS_DIFLAG_REALTIME); + hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); + cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); + + if (hint_flag && !xfs_sb_version_hasreflink(&mp->m_sb)) + return __this_address; + + if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode))) + return __this_address; + + if (hint_flag && cowextsize == 0) + return __this_address; + + if (!hint_flag && cowextsize != 0) + return __this_address; + + if (hint_flag && rt_flag) + return __this_address; + + if (cowextsize_bytes % mp->m_sb.sb_blocksize) + return __this_address; + + if (cowextsize > MAXEXTLEN) + return __this_address; + + if (cowextsize > mp->m_sb.sb_agblocks / 2) + return __this_address; + + return NULL; +} diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 8a5e1da52d74..d9a376a78ee2 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -84,5 +84,10 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); xfs_failaddr_t xfs_dinode_verify(struct xfs_mount *mp, xfs_ino_t ino, struct xfs_dinode *dip); +xfs_failaddr_t xfs_inode_validate_extsize(struct xfs_mount *mp, + uint32_t extsize, uint16_t mode, uint16_t flags); +xfs_failaddr_t xfs_inode_validate_cowextsize(struct xfs_mount *mp, + uint32_t cowextsize, uint16_t mode, uint16_t flags, + uint64_t flags2); #endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 9be1655e7e9f..df14930e4fc5 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -89,12 +89,7 @@ out: /* Inode core */ -/* - * Validate di_extsize hint. - * - * The rules are documented at xfs_ioctl_setattr_check_extsize(). - * These functions must be kept in sync with each other. - */ +/* Validate di_extsize hint. */ STATIC void xfs_scrub_inode_extsize( struct xfs_scrub_context *sc, @@ -103,52 +98,12 @@ xfs_scrub_inode_extsize( uint16_t mode, uint16_t flags) { - struct xfs_mount *mp = sc->mp; - bool rt_flag; - bool hint_flag; - bool inherit_flag; - uint32_t extsize; - uint32_t extsize_bytes; - uint32_t blocksize_bytes; - - rt_flag = (flags & XFS_DIFLAG_REALTIME); - hint_flag = (flags & XFS_DIFLAG_EXTSIZE); - inherit_flag = (flags & XFS_DIFLAG_EXTSZINHERIT); - extsize = be32_to_cpu(dip->di_extsize); - extsize_bytes = XFS_FSB_TO_B(sc->mp, extsize); - - if (rt_flag) - blocksize_bytes = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; - else - blocksize_bytes = mp->m_sb.sb_blocksize; - - if ((hint_flag || inherit_flag) && !(S_ISDIR(mode) || S_ISREG(mode))) - goto bad; - - if (hint_flag && !S_ISREG(mode)) - goto bad; - - if (inherit_flag && !S_ISDIR(mode)) - goto bad; + xfs_failaddr_t fa; - if ((hint_flag || inherit_flag) && extsize == 0) - goto bad; - - if (!(hint_flag || inherit_flag) && extsize != 0) - goto bad; - - if (extsize_bytes % blocksize_bytes) - goto bad; - - if (extsize > MAXEXTLEN) - goto bad; - - if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2) - goto bad; - - return; -bad: - xfs_scrub_ino_set_corrupt(sc, ino); + fa = xfs_inode_validate_extsize(sc->mp, be32_to_cpu(dip->di_extsize), + mode, flags); + if (fa) + xfs_scrub_ino_set_corrupt(sc, ino); } /* @@ -166,44 +121,13 @@ xfs_scrub_inode_cowextsize( uint16_t flags, uint64_t flags2) { - struct xfs_mount *mp = sc->mp; - bool rt_flag; - bool hint_flag; - uint32_t extsize; - uint32_t extsize_bytes; - - rt_flag = (flags & XFS_DIFLAG_REALTIME); - hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); - extsize = be32_to_cpu(dip->di_cowextsize); - extsize_bytes = XFS_FSB_TO_B(sc->mp, extsize); - - if (hint_flag && !xfs_sb_version_hasreflink(&mp->m_sb)) - goto bad; - - if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode))) - goto bad; - - if (hint_flag && extsize == 0) - goto bad; - - if (!hint_flag && extsize != 0) - goto bad; + xfs_failaddr_t fa; - if (hint_flag && rt_flag) - goto bad; - - if (extsize_bytes % mp->m_sb.sb_blocksize) - goto bad; - - if (extsize > MAXEXTLEN) - goto bad; - - if (extsize > mp->m_sb.sb_agblocks / 2) - goto bad; - - return; -bad: - xfs_scrub_ino_set_corrupt(sc, ino); + fa = xfs_inode_validate_cowextsize(sc->mp, + be32_to_cpu(dip->di_cowextsize), mode, flags, + flags2); + if (fa) + xfs_scrub_ino_set_corrupt(sc, ino); } /* Make sure the di_flags make sense for the inode. */ -- cgit v1.2.3 From 6a96c5650568a2218712d43ec16f3f82296a6c53 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Mar 2018 10:06:56 -0700 Subject: xfs: don't accept inode buffers with suspicious unlinked chains When we're verifying inode buffers, sanity-check the unlinked pointer. We don't want to run the risk of trying to purge something that's obviously broken. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_inode_buf.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index cdd4c1d58e80..f0cfcd79ac39 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -93,20 +93,26 @@ xfs_inode_buf_verify( bool readahead) { struct xfs_mount *mp = bp->b_target->bt_mount; + xfs_agnumber_t agno; int i; int ni; /* * Validate the magic number and version of every inode in the buffer */ + agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; for (i = 0; i < ni; i++) { int di_ok; xfs_dinode_t *dip; + xfs_agino_t unlinked_ino; dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); + unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && - xfs_dinode_good_version(mp, dip->di_version); + xfs_dinode_good_version(mp, dip->di_version) && + (unlinked_ino == NULLAGINO || + xfs_verify_agino(mp, agno, unlinked_ino)); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { if (readahead) { @@ -124,9 +130,9 @@ xfs_inode_buf_verify( xfs_buf_verifier_error(bp, -EFSCORRUPTED, __func__, dip, sizeof(*dip), NULL); + return; } } - xfs_inobp_check(mp, bp); } -- cgit v1.2.3 From 5927268f5a04c3b25683e52a0017aa5a9e8eee70 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Mar 2018 10:06:56 -0700 Subject: xfs: flag inode corruption if parent ptr doesn't get us a real inode If a directory's parent inode pointer doesn't point to an inode, the directory should be flagged as corrupt. Enable IGET_UNTRUSTED here so that _iget will return -EINVAL if the inobt does not confirm that the inode is present and allocated and we can flag the directory corruption. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/parent.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 0d3851410c74..1fb88c18d455 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -167,8 +167,18 @@ xfs_scrub_parent_validate( * if the parent pointer erroneously points to a file, we * can't use DONTCACHE here because DONTCACHE inodes can trigger * immediate inactive cleanup of the inode. + * + * If _iget returns -EINVAL then the parent inode number is garbage + * and the directory is corrupt. If the _iget returns -EFSCORRUPTED + * or -EFSBADCRC then the parent is corrupt which is a cross + * referencing error. Any other error is an operational error. */ - error = xfs_iget(mp, sc->tp, dnum, 0, 0, &dp); + error = xfs_iget(mp, sc->tp, dnum, XFS_IGET_UNTRUSTED, 0, &dp); + if (error == -EINVAL) { + error = -EFSCORRUPTED; + xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error); + goto out; + } if (!xfs_scrub_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; if (dp == sc->ip || !S_ISDIR(VFS_I(dp)->i_mode)) { -- cgit v1.2.3 From b83e4c3ced2bc08a5e0d3b796de4e795e342d8b6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 23 Mar 2018 10:06:57 -0700 Subject: xfs: xfs_scrub_iallocbt_xref_rmap_inodes should use xref_set_corrupt In xfs_scrub_iallocbt_xref_rmap_inodes we're checking inodes against rmap records, so we should use xfs_scrub_btree_xref_set_corrupt if we encounter discrepancies here so that we know that it's a cross referencing error, not necessarily a corruption in the inobt itself. The userspace xfs_scrub program will try to repair outright corruptions in the agi/inobt prior to phase 3 so that the inode scan will proceed. If only a cross-referencing error is noted, the repair program defers the repair attempt until it can check the other space metadata at least once. It is therefore essential that the inobt scrubber can correctly distinguish between corruptions and "unable to cross-reference something else with this inobt". The same reasoning applies to "xfs: record inode buf errors as a xref error in inobt scrubber". Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/scrub/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 32e0d1ae0056..106ca4bd753f 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -434,7 +434,7 @@ xfs_scrub_iallocbt_xref_rmap_inodes( if (!xfs_scrub_should_check_xref(sc, &error, &sc->sa.rmap_cur)) return; if (blocks != inode_blocks) - xfs_scrub_btree_set_corrupt(sc, sc->sa.ino_cur, 0); + xfs_scrub_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0); } /* Scrub the inode btrees for some AG. */ -- cgit v1.2.3 From ee457001ed6c6f31ddad69c24c1da8f377d8472d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 23 Mar 2018 10:22:53 -0700 Subject: xfs: catch inode allocation state mismatch corruption We recently came across a V4 filesystem causing memory corruption due to a newly allocated inode being setup twice and being added to the superblock inode list twice. From code inspection, the only way this could happen is if a newly allocated inode was not marked as free on disk (i.e. di_mode wasn't zero). Running the metadump on an upstream debug kernel fails during inode allocation like so: XFS: Assertion failed: ip->i_d.di_nblocks == 0, file: fs/xfs/xfs_inod= e.c, line: 838 ------------[ cut here ]------------ kernel BUG at fs/xfs/xfs_message.c:114! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 11 PID: 3496 Comm: mkdir Not tainted 4.16.0-rc5-dgc #442 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/0= 1/2014 RIP: 0010:assfail+0x28/0x30 RSP: 0018:ffffc9000236fc80 EFLAGS: 00010202 RAX: 00000000ffffffea RBX: 0000000000004000 RCX: 0000000000000000 RDX: 00000000ffffffc0 RSI: 000000000000000a RDI: ffffffff8227211b RBP: ffffc9000236fce8 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000bec R11: f000000000000000 R12: ffffc9000236fd30 R13: ffff8805c76bab80 R14: ffff8805c77ac800 R15: ffff88083fb12e10 FS: 00007fac8cbff040(0000) GS:ffff88083fd00000(0000) knlGS:0000000000000= 000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fffa6783ff8 CR3: 00000005c6e2b003 CR4: 00000000000606e0 Call Trace: xfs_ialloc+0x383/0x570 xfs_dir_ialloc+0x6a/0x2a0 xfs_create+0x412/0x670 xfs_generic_create+0x1f7/0x2c0 ? capable_wrt_inode_uidgid+0x3f/0x50 vfs_mkdir+0xfb/0x1b0 SyS_mkdir+0xcf/0xf0 do_syscall_64+0x73/0x1a0 entry_SYSCALL_64_after_hwframe+0x42/0xb7 Extracting the inode number we crashed on from an event trace and looking at it with xfs_db: xfs_db> inode 184452204 xfs_db> p core.magic = 0x494e core.mode = 0100644 core.version = 2 core.format = 2 (extents) core.nlinkv2 = 1 core.onlink = 0 ..... Confirms that it is not a free inode on disk. xfs_repair also trips over this inode: ..... zero length extent (off = 0, fsbno = 0) in ino 184452204 correcting nextents for inode 184452204 bad attribute fork in inode 184452204, would clear attr fork bad nblocks 1 for inode 184452204, would reset to 0 bad anextents 1 for inode 184452204, would reset to 0 imap claims in-use inode 184452204 is free, would correct imap would have cleared inode 184452204 ..... disconnected inode 184452204, would move to lost+found And so we have a situation where the directory structure and the inobt thinks the inode is free, but the inode on disk thinks it is still in use. Where this corruption came from is not possible to diagnose, but we can detect it and prevent the kernel from oopsing on lookup. The reproducer now results in: $ sudo mkdir /mnt/scratch/{0,1,2,3,4,5}{0,1,2,3,4,5} mkdir: cannot create directory =E2=80=98/mnt/scratch/00=E2=80=99: File ex= ists mkdir: cannot create directory =E2=80=98/mnt/scratch/01=E2=80=99: File ex= ists mkdir: cannot create directory =E2=80=98/mnt/scratch/03=E2=80=99: Structu= re needs cleaning mkdir: cannot create directory =E2=80=98/mnt/scratch/04=E2=80=99: Input/o= utput error mkdir: cannot create directory =E2=80=98/mnt/scratch/05=E2=80=99: Input/o= utput error .... And this corruption shutdown: [ 54.843517] XFS (loop0): Corruption detected! Free inode 0xafe846c not= marked free on disk [ 54.845885] XFS (loop0): Internal error xfs_trans_cancel at line 1023 = of file fs/xfs/xfs_trans.c. Caller xfs_create+0x425/0x670 [ 54.848994] CPU: 10 PID: 3541 Comm: mkdir Not tainted 4.16.0-rc5-dgc #= 443 [ 54.850753] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIO= S 1.10.2-1 04/01/2014 [ 54.852859] Call Trace: [ 54.853531] dump_stack+0x85/0xc5 [ 54.854385] xfs_trans_cancel+0x197/0x1c0 [ 54.855421] xfs_create+0x425/0x670 [ 54.856314] xfs_generic_create+0x1f7/0x2c0 [ 54.857390] ? capable_wrt_inode_uidgid+0x3f/0x50 [ 54.858586] vfs_mkdir+0xfb/0x1b0 [ 54.859458] SyS_mkdir+0xcf/0xf0 [ 54.860254] do_syscall_64+0x73/0x1a0 [ 54.861193] entry_SYSCALL_64_after_hwframe+0x42/0xb7 [ 54.862492] RIP: 0033:0x7fb73bddf547 [ 54.863358] RSP: 002b:00007ffdaa553338 EFLAGS: 00000246 ORIG_RAX: 0000= 000000000053 [ 54.865133] RAX: ffffffffffffffda RBX: 00007ffdaa55449a RCX: 00007fb73= bddf547 [ 54.866766] RDX: 0000000000000001 RSI: 00000000000001ff RDI: 00007ffda= a55449a [ 54.868432] RBP: 00007ffdaa55449a R08: 00000000000001ff R09: 00005623a= 8670dd0 [ 54.870110] R10: 00007fb73be72d5b R11: 0000000000000246 R12: 000000000= 00001ff [ 54.871752] R13: 00007ffdaa5534b0 R14: 0000000000000000 R15: 00007ffda= a553500 [ 54.873429] XFS (loop0): xfs_do_force_shutdown(0x8) called from line 1= 024 of file fs/xfs/xfs_trans.c. Return address = ffffffff814cd050 [ 54.882790] XFS (loop0): Corruption of in-memory data detected. Shutt= ing down filesystem [ 54.884597] XFS (loop0): Please umount the filesystem and rectify the = problem(s) Note that this crash is only possible on v4 filesystemsi or v5 filesystems mounted with the ikeep mount option. For all other V5 filesystems, this problem cannot occur because we don't read inodes we are allocating from disk - we simply overwrite them with the new inode information. Signed-Off-By: Dave Chinner Reviewed-by: Carlos Maiolino Tested-by: Carlos Maiolino Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_icache.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index d53a316162d6..9a18f69f6e96 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -483,7 +483,28 @@ xfs_iget_cache_miss( trace_xfs_iget_miss(ip); - if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) { + + /* + * If we are allocating a new inode, then check what was returned is + * actually a free, empty inode. If we are not allocating an inode, + * the check we didn't find a free inode. + */ + if (flags & XFS_IGET_CREATE) { + if (VFS_I(ip)->i_mode != 0) { + xfs_warn(mp, +"Corruption detected! Free inode 0x%llx not marked free on disk", + ino); + error = -EFSCORRUPTED; + goto out_destroy; + } + if (ip->i_d.di_nblocks != 0) { + xfs_warn(mp, +"Corruption detected! Free inode 0x%llx has blocks allocated!", + ino); + error = -EFSCORRUPTED; + goto out_destroy; + } + } else if (VFS_I(ip)->i_mode == 0) { error = -ENOENT; goto out_destroy; } -- cgit v1.2.3 From fa4493f0d9b3ac8f36743f1a26e2318b449ee4c8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 23 Mar 2018 10:22:54 -0700 Subject: xfs: remove dead inode version setting code We can only get into the branch if CRCs are enabled, so there's no need to check inside the branch for CRCs being enabled.... Signed-Off-By: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_inode_buf.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index f0cfcd79ac39..ef68b1de006a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -572,10 +572,7 @@ xfs_iread( /* initialise the on-disk inode core */ memset(&ip->i_d, 0, sizeof(ip->i_d)); VFS_I(ip)->i_generation = prandom_u32(); - if (xfs_sb_version_hascrc(&mp->m_sb)) - ip->i_d.di_version = 3; - else - ip->i_d.di_version = 2; + ip->i_d.di_version = 3; return 0; } -- cgit v1.2.3 From 72c44e35f02a1cb4032e476c398a7234badcf49f Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 23 Mar 2018 17:54:32 -0700 Subject: xfs: clean up xfs_mount allocation and dynamic initializers Most of the generic data structures embedded in xfs_mount are dynamically initialized immediately after mp is allocated. A few fields are left out and initialized during the xfs_mountfs() sequence, after mp has been attached to the superblock. To clean this up and help prevent premature access of associated fields, refactor xfs_mount allocation and all dependent init calls into a new helper. This self-documents that all low level data structures (i.e., locks, trees, etc.) should be initialized before xfs_mount is attached to the superblock. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_sb.c | 1 - fs/xfs/xfs_mount.c | 2 -- fs/xfs/xfs_super.c | 39 +++++++++++++++++++++++++++++---------- 3 files changed, 29 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a55f7a45fa78..53433cc024fd 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -731,7 +731,6 @@ xfs_sb_mount_common( struct xfs_sb *sbp) { mp->m_agfrotor = mp->m_agirotor = 0; - spin_lock_init(&mp->m_agirotor_lock); mp->m_maxagi = mp->m_sb.sb_agcount; mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 6f5a5e6764d8..a901b86772f8 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -817,8 +817,6 @@ xfs_mountfs( /* * Allocate and initialize the per-ag data. */ - spin_lock_init(&mp->m_perag_lock); - INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); if (error) { xfs_warn(mp, "Failed per-ag init: %d", error); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 951271f57d00..612c1d5348b3 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1579,29 +1579,48 @@ xfs_destroy_percpu_counters( percpu_counter_destroy(&mp->m_fdblocks); } -STATIC int -xfs_fs_fill_super( - struct super_block *sb, - void *data, - int silent) +static struct xfs_mount * +xfs_mount_alloc( + struct super_block *sb) { - struct inode *root; - struct xfs_mount *mp = NULL; - int flags = 0, error = -ENOMEM; + struct xfs_mount *mp; mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); if (!mp) - goto out; + return NULL; + mp->m_super = sb; spin_lock_init(&mp->m_sb_lock); + spin_lock_init(&mp->m_agirotor_lock); + INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); + spin_lock_init(&mp->m_perag_lock); mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker); mp->m_kobj.kobject.kset = xfs_kset; + return mp; +} - mp->m_super = sb; + +STATIC int +xfs_fs_fill_super( + struct super_block *sb, + void *data, + int silent) +{ + struct inode *root; + struct xfs_mount *mp = NULL; + int flags = 0, error = -ENOMEM; + + /* + * allocate mp and do all low-level struct initializations before we + * attach it to the super + */ + mp = xfs_mount_alloc(sb); + if (!mp) + goto out; sb->s_fs_info = mp; error = xfs_parseargs(mp, (char *)data); -- cgit v1.2.3 From dc1baa715bbfbb1902da942d06497e79b40e7bc7 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 28 Mar 2018 17:48:08 -0700 Subject: xfs: do not log/recover swapext extent owner changes for deleted inodes Today if we run xfs_fsr and crash[1], log replay can fail because the recovery code tries to instantiate the donor inode from disk to replay the swapext, but it's been deleted and we get verifier failures when we try to read the inode off disk with i_mode == 0. This fixes both sides: We don't log the swapext change if the inode has been deleted, and we don't try to recover it either. [1] or if systemd doesn't cleanly unmount root, as it is wont to do ... Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 4 ++++ fs/xfs/xfs_log_recover.c | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 09ba970c07b1..3e3aab3888fa 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2475,6 +2475,10 @@ xfs_ifree( ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + + /* Don't attempt to replay owner changes for a deleted inode */ + ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER); + /* * Bump the generation count so no one will be confused * by reincarnations of this inode. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 59134f626927..2b2383f1895e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3245,7 +3245,9 @@ xlog_recover_inode_pass2( } out_owner_change: - if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) + /* Recover the swapext owner change unless inode has been deleted */ + if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) && + (dip->di_mode != 0)) error = xfs_recover_inode_owner_change(mp, dip, in_f, buffer_list); /* re-generate the checksum. */ -- cgit v1.2.3