From 6b10b23ca94451fae153a5cc8d62fd721bec2019 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:31:06 +1100 Subject: xfs: set AGI buffer type in xlog_recover_clear_agi_bucket xlog_recover_clear_agi_bucket didn't set the type to XFS_BLFT_AGI_BUF, so we got a warning during log replay (or an ASSERT on a debug build). XFS (md0): Unknown buffer type 0! XFS (md0): _xfs_buf_ioapply: no ops on block 0xaea8802/0x1 Fix this, as was done in f19b872b for 2 other locations with the same problem. cc: # 3.10 to current Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9b3d7c76915d..2d91f5ab7538 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4929,6 +4929,7 @@ xlog_recover_clear_agi_bucket( agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket); + xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); -- cgit v1.2.3 From 200237d6746faaeaf7f4ff4abbf13f3917cee60a Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:31:31 +1100 Subject: xfs: Move AGI buffer type setting to xfs_read_agi We've missed properly setting the buffer type for an AGI transaction in 3 spots now, so just move it into xfs_read_agi() and set it if we are in a transaction to avoid the problem in the future. This is similar to how it is done in i.e. the dir3 and attr3 read functions. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_ialloc.c | 4 ++-- fs/xfs/xfs_inode.c | 2 -- fs/xfs/xfs_log_recover.c | 1 - 3 files changed, 2 insertions(+), 5 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 51b4e0de1fdc..c482b9716347 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2450,8 +2450,6 @@ xfs_ialloc_log_agi( ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); - /* * Compute byte offsets for the first and last fields in the first * region and log the agi buffer. This only logs up through @@ -2592,6 +2590,8 @@ xfs_read_agi( XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; + if (tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF); xfs_buf_set_ref(*bpp, XFS_AGI_REF); return 0; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4e560e6a12c1..512ff13ed66a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2041,7 +2041,6 @@ xfs_iunlink( agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); return 0; @@ -2133,7 +2132,6 @@ xfs_iunlink_remove( agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); } else { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 2d91f5ab7538..9b3d7c76915d 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4929,7 +4929,6 @@ xlog_recover_clear_agi_bucket( agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket); - xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); -- cgit v1.2.3 From 7710517fc37b1899722707883b54694ea710b3c0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:31:50 +1100 Subject: xfs: pass state not whichfork to trace_xfs_extlist When xfs_bmap_trace_exlist called trace_xfs_extlist, it sent in the "whichfork" var instead of the bmap "state" as expected (even though state was already set up for this purpose). As a result, the xfs_bmap_class in tracing code used "whichfork" not state in xfs_iext_state_to_fork(), and got the wrong ifork pointer. It all goes downhill from there, including an ASSERT when ifp_bytes is empty by the time it reaches xfs_iext_get_ext(): XFS: Assertion failed: idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 6b7e6eb29414..e4120fcefcc8 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -531,7 +531,7 @@ xfs_bmap_trace_exlist( ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(cnt == xfs_iext_count(ifp)); for (idx = 0; idx < cnt; idx++) - trace_xfs_extlist(ip, idx, whichfork, caller_ip); + trace_xfs_extlist(ip, idx, state, caller_ip); } /* -- cgit v1.2.3 From c44a1f22626c153976289e1cd67bdcdfefc16e1f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:32:00 +1100 Subject: xfs: handle cow fork in xfs_bmap_trace_exlist By inspection, xfs_bmap_trace_exlist isn't handling cow forks, and will trace the data fork instead. Fix this by setting state appropriately if whichfork == XFS_COW_FORK. ()___() < @ @ > | | {o_o} (|) Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index e4120fcefcc8..23aa70b2790c 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -518,7 +518,7 @@ void xfs_bmap_trace_exlist( xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork, /* data or attr fork */ + int whichfork, /* data or attr or cow fork */ unsigned long caller_ip) { xfs_extnum_t idx; /* extent record index */ @@ -527,6 +527,8 @@ xfs_bmap_trace_exlist( if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + else if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(cnt == xfs_iext_count(ifp)); -- cgit v1.2.3 From f7a136aee3c1c3f7daf87197b3b3c361744a2812 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 5 Dec 2016 12:32:14 +1100 Subject: xfs: several xattr functions can be void There are a handful of xattr functions which now return nothing but zero. They can be made void, chased through calling functions, and error handling etc can be removed. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_leaf.h | 2 +- fs/xfs/xfs_attr.h | 4 +-- fs/xfs/xfs_attr_list.c | 59 ++++++++++++++++--------------------------- fs/xfs/xfs_xattr.c | 23 ++++++++--------- 4 files changed, 35 insertions(+), 53 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 4f2aed04f827..91f51637f8af 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -77,7 +77,7 @@ int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr3_leaf_list_int(struct xfs_buf *bp, +void xfs_attr3_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index e3da5d448bcf..d14691aa02b4 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -112,8 +112,8 @@ typedef struct attrlist_cursor_kern { *========================================================================*/ -/* Return 0 on success, or -errno; other state communicated via *context */ -typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, +/* void; state communicated via *context */ +typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, unsigned char *, int, int); typedef struct xfs_attr_list_context { diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 25e76cd6c053..97c45b6eb91e 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -74,7 +74,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) xfs_attr_sf_entry_t *sfe; xfs_inode_t *dp; int sbsize, nsbuf, count, i; - int error; ASSERT(context != NULL); dp = context->dp; @@ -102,13 +101,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) (XFS_ISRESET_CURSOR(cursor) && (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { - error = context->put_listent(context, - sfe->flags, - sfe->nameval, - (int)sfe->namelen, - (int)sfe->valuelen); - if (error) - return error; + context->put_listent(context, + sfe->flags, + sfe->nameval, + (int)sfe->namelen, + (int)sfe->valuelen); /* * Either search callback finished early or * didn't fit it all in the buffer after all. @@ -193,15 +190,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) cursor->hashval = sbp->hash; cursor->offset = 0; } - error = context->put_listent(context, - sbp->flags, - sbp->name, - sbp->namelen, - sbp->valuelen); - if (error) { - kmem_free(sbuf); - return error; - } + context->put_listent(context, + sbp->flags, + sbp->name, + sbp->namelen, + sbp->valuelen); if (context->seen_enough) break; cursor->offset++; @@ -335,11 +328,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ for (;;) { leaf = bp->b_addr; - error = xfs_attr3_leaf_list_int(bp, context); - if (error) { - xfs_trans_brelse(NULL, bp); - return error; - } + xfs_attr3_leaf_list_int(bp, context); xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); if (context->seen_enough || leafhdr.forw == 0) break; @@ -356,7 +345,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) /* * Copy out attribute list entries for attr_list(), for leaf attribute lists. */ -int +void xfs_attr3_leaf_list_int( struct xfs_buf *bp, struct xfs_attr_list_context *context) @@ -366,7 +355,6 @@ xfs_attr3_leaf_list_int( struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; struct xfs_attr_leaf_entry *entry; - int retval; int i; struct xfs_mount *mp = context->dp->i_mount; @@ -399,7 +387,7 @@ xfs_attr3_leaf_list_int( } if (i == ichdr.count) { trace_xfs_attr_list_notfound(context); - return 0; + return; } } else { entry = &entries[0]; @@ -410,7 +398,6 @@ xfs_attr3_leaf_list_int( /* * We have found our place, start copying out the new attributes. */ - retval = 0; for (; i < ichdr.count; entry++, i++) { char *name; int namelen, valuelen; @@ -439,16 +426,14 @@ xfs_attr3_leaf_list_int( valuelen = be32_to_cpu(name_rmt->valuelen); } - retval = context->put_listent(context, entry->flags, + context->put_listent(context, entry->flags, name, namelen, valuelen); - if (retval) - break; if (context->seen_enough) break; cursor->offset++; } trace_xfs_attr_list_leaf_end(context); - return retval; + return; } /* @@ -467,9 +452,9 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) if (error) return error; - error = xfs_attr3_leaf_list_int(bp, context); + xfs_attr3_leaf_list_int(bp, context); xfs_trans_brelse(NULL, bp); - return error; + return 0; } int @@ -513,7 +498,7 @@ xfs_attr_list_int( * Take care to check values and protect against them changing later, * we may be reading them directly out of a user buffer. */ -STATIC int +STATIC void xfs_attr_put_listent( xfs_attr_list_context_t *context, int flags, @@ -536,10 +521,10 @@ xfs_attr_put_listent( */ if (((context->flags & ATTR_SECURE) == 0) != ((flags & XFS_ATTR_SECURE) == 0)) - return 0; + return; if (((context->flags & ATTR_ROOT) == 0) != ((flags & XFS_ATTR_ROOT) == 0)) - return 0; + return; arraytop = sizeof(*alist) + context->count * sizeof(alist->al_offset[0]); @@ -548,7 +533,7 @@ xfs_attr_put_listent( trace_xfs_attr_list_full(context); alist->al_more = 1; context->seen_enough = 1; - return 0; + return; } aep = (attrlist_ent_t *)&context->alist[context->firstu]; @@ -558,7 +543,7 @@ xfs_attr_put_listent( alist->al_offset[context->count++] = context->firstu; alist->al_count = context->count; trace_xfs_attr_list_add(context); - return 0; + return; } /* diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 62900938f26d..0594db435972 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -130,7 +130,7 @@ const struct xattr_handler *xfs_xattr_handlers[] = { NULL }; -static int +static void __xfs_xattr_put_listent( struct xfs_attr_list_context *context, char *prefix, @@ -148,7 +148,7 @@ __xfs_xattr_put_listent( if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ context->seen_enough = 1; - return 0; + return; } offset = (char *)context->alist + context->count; strncpy(offset, prefix, prefix_len); @@ -159,10 +159,10 @@ __xfs_xattr_put_listent( compute_size: context->count += prefix_len + namelen + 1; - return 0; + return; } -static int +static void xfs_xattr_put_listent( struct xfs_attr_list_context *context, int flags, @@ -180,23 +180,19 @@ xfs_xattr_put_listent( if (namelen == SGI_ACL_FILE_SIZE && strncmp(name, SGI_ACL_FILE, SGI_ACL_FILE_SIZE) == 0) { - int ret = __xfs_xattr_put_listent( + __xfs_xattr_put_listent( context, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN, XATTR_POSIX_ACL_ACCESS, strlen(XATTR_POSIX_ACL_ACCESS)); - if (ret) - return ret; } else if (namelen == SGI_ACL_DEFAULT_SIZE && strncmp(name, SGI_ACL_DEFAULT, SGI_ACL_DEFAULT_SIZE) == 0) { - int ret = __xfs_xattr_put_listent( + __xfs_xattr_put_listent( context, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN, XATTR_POSIX_ACL_DEFAULT, strlen(XATTR_POSIX_ACL_DEFAULT)); - if (ret) - return ret; } #endif @@ -205,7 +201,7 @@ xfs_xattr_put_listent( * see them. */ if (!capable(CAP_SYS_ADMIN)) - return 0; + return; prefix = XATTR_TRUSTED_PREFIX; prefix_len = XATTR_TRUSTED_PREFIX_LEN; @@ -217,8 +213,9 @@ xfs_xattr_put_listent( prefix_len = XATTR_USER_PREFIX_LEN; } - return __xfs_xattr_put_listent(context, prefix, prefix_len, name, - namelen); + __xfs_xattr_put_listent(context, prefix, prefix_len, name, + namelen); + return; } ssize_t -- cgit v1.2.3 From d2a047f31e86941fa896e0e3271536d50aba415e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:32:50 +1100 Subject: xfs: forbid AG btrees with level == 0 There is no such thing as a zero-level AG btree since even a single-node zero-records btree has one level. Btree cursor constructors read cur_nlevels straight from disk and then access things like cur_bufs[cur_nlevels - 1] which is /really/ bad if cur_nlevels is zero! Therefore, strengthen the verifiers to prevent this possibility. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc.c | 10 +++++++--- fs/xfs/libxfs/xfs_ialloc.c | 9 ++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index effb64cf714f..5050056a0b06 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2455,12 +2455,15 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) return false; - if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) return false; if (xfs_sb_version_hasrmapbt(&mp->m_sb) && - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS) + (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) return false; /* @@ -2477,7 +2480,8 @@ xfs_agf_verify( return false; if (xfs_sb_version_hasreflink(&mp->m_sb) && - be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS) + (be32_to_cpu(agf->agf_refcount_level) < 1 || + be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) return false; return true;; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index c482b9716347..d45c03779dae 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2510,8 +2510,15 @@ xfs_agi_verify( if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return false; - if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + if (be32_to_cpu(agi->agi_level) < 1 || + be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) return false; + + if (xfs_sb_version_hasfinobt(&mp->m_sb) && + (be32_to_cpu(agi->agi_free_level) < 1 || + be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS)) + return false; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. growfs ensures we can't -- cgit v1.2.3 From bb3be7e7c1c18e1b141d4cadeb98cc89ecf78099 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:33:54 +1100 Subject: xfs: check for bogus values in btree block headers When we're reading a btree block, make sure that what we retrieved matches the owner and level; and has a plausible number of records. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_btree.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 0e80993c8a59..21e6a6ab6b9a 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1769,8 +1769,28 @@ xfs_btree_lookup_get_block( if (error) return error; + /* Check the inode owner since the verifiers don't. */ + if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + (cur->bc_flags & XFS_BTREE_LONG_PTRS) && + be64_to_cpu((*blkp)->bb_u.l.bb_owner) != + cur->bc_private.b.ip->i_ino) + goto out_bad; + + /* Did we get the level we were looking for? */ + if (be16_to_cpu((*blkp)->bb_level) != level) + goto out_bad; + + /* Check that internal nodes have at least one record. */ + if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) + goto out_bad; + xfs_btree_setbuf(cur, level, bp); return 0; + +out_bad: + *blkp = NULL; + xfs_trans_brelse(cur->bc_tp, bp); + return -EFSCORRUPTED; } /* -- cgit v1.2.3 From 356a3225222e5bc4df88aef3419fb6424f18ab69 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:36:56 +1100 Subject: xfs: complain if we don't get nextents bmap records When reading into memory all extents of a btree-format inode fork, complain if the number of extents we find is not the same as the number of extents reported in the inode core. This is needed to stop an IO action from accessing the garbage areas of the in-core fork. [dchinner: removed redundant assert] Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 23aa70b2790c..829ad632533b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1377,8 +1377,9 @@ xfs_bmap_read_extents( return error; block = XFS_BUF_TO_BLOCK(bp); } + if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) + return -EFSCORRUPTED; ASSERT(i == xfs_iext_count(ifp)); - ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); return 0; error0: -- cgit v1.2.3 From 96a3aefb8ffde23180130460b0b2407b328eb727 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:37:47 +1100 Subject: xfs: don't crash if reading a directory results in an unexpected hole In xfs_dir3_data_read, we can encounter the situation where err == 0 and *bpp == NULL if the given bno offset happens to be a hole; this leads to a crash if we try to set the buffer type after the _da_read_buf call. Holes can happen due to corrupt or malicious entries in the bmbt data, so be a little more careful when we're handling buffers. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_dir2_data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 725fc7841fde..e526f5a5f0be 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -329,7 +329,7 @@ xfs_dir3_data_read( err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, XFS_DATA_FORK, &xfs_dir3_data_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); return err; } -- cgit v1.2.3 From 0f352f8ee8412bd9d34fb2a6411241da61175c0e Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:38:11 +1100 Subject: xfs: error out if trying to add attrs and anextents > 0 We shouldn't assert if somehow we end up trying to add an attr fork to an inode that apparently already has attr extents because this is an indication of on-disk corruption. Instead, return an error code to userspace. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 829ad632533b..29ffc0569ce1 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1153,6 +1153,10 @@ xfs_bmap_add_attrfork( goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel; + if (ip->i_d.di_anextents != 0) { + error = -EFSCORRUPTED; + goto trans_cancel; + } if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { /* * For inodes coming from pre-6.2 filesystems. @@ -1160,7 +1164,6 @@ xfs_bmap_add_attrfork( ASSERT(ip->i_d.di_aformat == 0); ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; } - ASSERT(ip->i_d.di_anextents == 0); xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); -- cgit v1.2.3 From ef388e2054feedaeb05399ed654bdb06f385d294 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:38:38 +1100 Subject: xfs: don't allow di_size with high bit set The on-disk field di_size is used to set i_size, which is a signed integer of loff_t. If the high bit of di_size is set, we'll end up with a negative i_size, which will cause all sorts of problems. Since the VFS won't let us create a file with such length, we should catch them here in the verifier too. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_inode_buf.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 134424fac434..c906e50515f0 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -392,6 +392,14 @@ xfs_dinode_verify( if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return false; + /* don't allow invalid i_size */ + if (be64_to_cpu(dip->di_size) & (1ULL << 63)) + return false; + + /* No zero-length symlinks. */ + if (S_ISLNK(be16_to_cpu(dip->di_mode)) && dip->di_size == 0) + return false; + /* only version 3 or greater inodes are extensively verified here */ if (dip->di_version < 3) return true; -- cgit v1.2.3 From 1bb33a98702d8360947f18a44349df75ba555d5d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 5 Dec 2016 12:38:57 +1100 Subject: xfs: don't cap maximum dedupe request length After various discussions on linux-fsdevel, it has been decided that it is not necessary to cap the length of a dedupe request, and that correctly-written userspace client programs will be able to absorb the change. Therefore, remove the length clamping behavior. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 6e4f7f900fea..9a5d64b5f35a 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -939,7 +939,6 @@ xfs_file_clone_range( len, false); } -#define XFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) STATIC ssize_t xfs_file_dedupe_range( struct file *src_file, @@ -950,14 +949,6 @@ xfs_file_dedupe_range( { int error; - /* - * Limit the total length we will dedupe for each operation. - * This is intended to bound the total time spent in this - * ioctl to something sane. - */ - if (len > XFS_MAX_DEDUPE_LEN) - len = XFS_MAX_DEDUPE_LEN; - error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff, len, true); if (error) -- cgit v1.2.3 From 11ef38afe98cc7ad1a46ef24945232ec1760d5e2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 5 Dec 2016 14:38:58 +1100 Subject: xfs: make xfs btree stats less huge Embedding a switch statement in every btree stats inc/add adds a lot of code overhead to the core btree infrastructure paths. Stats are supposed to be small and lightweight, but the btree stats have become big and bloated as we've added more btrees. It needs fixing because the reflink code will just add more overhead again. Convert the v2 btree stats to arrays instead of independent variables, and instead use the type to index the specific btree array via an enum. This allows us to use array based indexing to update the stats, rather than having to derefence variables specific to the btree type. If we then wrap the xfsstats structure in a union and place uint32_t array beside it, and calculate the correct btree stats array base array index when creating a btree cursor, we can easily access entries in the stats structure without having to switch names based on the btree type. We then replace with the switch statement with a simple set of stats wrapper macros, resulting in a significant simplification of the btree stats code, and: text data bss dec hex filename 48905 144 8 49057 bfa1 fs/xfs/libxfs/xfs_btree.o.old 36793 144 8 36945 9051 fs/xfs/libxfs/xfs_btree.o it reduces the core btree infrastructure code size by close to 25%! Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc_btree.c | 4 + fs/xfs/libxfs/xfs_bmap_btree.c | 1 + fs/xfs/libxfs/xfs_btree.h | 43 +------- fs/xfs/libxfs/xfs_ialloc_btree.c | 2 + fs/xfs/libxfs/xfs_refcount_btree.c | 1 + fs/xfs/libxfs/xfs_rmap_btree.c | 1 + fs/xfs/xfs_stats.c | 10 +- fs/xfs/xfs_stats.h | 200 +++++++++++++++---------------------- 8 files changed, 99 insertions(+), 163 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 5ba2dac5e67c..44cfcd03c451 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -428,6 +428,10 @@ xfs_allocbt_init_cursor( cur->bc_btnum = btnum; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_allocbt_ops; + if (btnum == XFS_BTNUM_BNO) + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); + else + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); if (btnum == XFS_BTNUM_CNT) { cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 8007d2ba9aef..94ad31d372ab 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -803,6 +803,7 @@ xfs_bmbt_init_cursor( cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; cur->bc_btnum = XFS_BTNUM_BMAP; cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index c2b01d1c79ee..b69b947c4c1b 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -96,46 +96,10 @@ union xfs_btree_rec { /* * Generic stats interface */ -#define __XFS_BTREE_STATS_INC(mp, type, stat) \ - XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat) #define XFS_BTREE_STATS_INC(cur, stat) \ -do { \ - struct xfs_mount *__mp = cur->bc_mp; \ - switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \ - case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \ - case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \ - case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ - case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ - case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \ - case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ - } \ -} while (0) - -#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \ - XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val) -#define XFS_BTREE_STATS_ADD(cur, stat, val) \ -do { \ - struct xfs_mount *__mp = cur->bc_mp; \ - switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: \ - __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \ - case XFS_BTNUM_CNT: \ - __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \ - case XFS_BTNUM_BMAP: \ - __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \ - case XFS_BTNUM_INO: \ - __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \ - case XFS_BTNUM_FINO: \ - __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ - case XFS_BTNUM_RMAP: \ - __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \ - case XFS_BTNUM_REFC: \ - __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ - } \ -} while (0) + XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat) +#define XFS_BTREE_STATS_ADD(cur, stat, val) \ + XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) #define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */ @@ -253,6 +217,7 @@ typedef struct xfs_btree_cur __uint8_t bc_nlevels; /* number of levels in the tree */ __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ + int bc_statoff; /* offset of btre stats array */ union { struct { /* needed for BNO, CNT, INO */ struct xfs_buf *agbp; /* agf/agi buffer pointer */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index eab68ae2e011..e7ff8ef0e5a7 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -365,9 +365,11 @@ xfs_inobt_init_cursor( if (btnum == XFS_BTNUM_INO) { cur->bc_nlevels = be32_to_cpu(agi->agi_level); cur->bc_ops = &xfs_inobt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); } else { cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); cur->bc_ops = &xfs_finobt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); } cur->bc_blocklog = mp->m_sb.sb_blocklog; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 453bb2757ec2..6fb2215f8ff7 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -354,6 +354,7 @@ xfs_refcountbt_init_cursor( cur->bc_btnum = XFS_BTNUM_REFC; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_refcountbt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 83e672ff7577..de25771764ba 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -484,6 +484,7 @@ xfs_rmapbt_init_cursor( cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_rmapbt_ops; cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 12d48cd8f8a4..f11282c96887 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -80,9 +80,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) } /* extra precision counters */ for_each_possible_cpu(i) { - xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes; - xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes; - xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes; + xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes; + xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes; + xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; } len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", @@ -106,9 +106,9 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats) for_each_possible_cpu(c) { preempt_disable(); /* save vn_active, it's a universal truth! */ - vn_active = per_cpu_ptr(stats, c)->vn_active; + vn_active = per_cpu_ptr(stats, c)->s.vn_active; memset(per_cpu_ptr(stats, c), 0, sizeof(*stats)); - per_cpu_ptr(stats, c)->vn_active = vn_active; + per_cpu_ptr(stats, c)->s.vn_active = vn_active; preempt_enable(); } } diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 79ad2e69fc33..375840f5a99a 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -21,10 +21,38 @@ #include +/* + * The btree stats arrays have fixed offsets for the different stats. We + * store the base index in the btree cursor via XFS_STATS_CALC_INDEX() and + * that allows us to use fixed offsets into the stats array for each btree + * stat. These index offsets are defined in the order they will be emitted + * in the stats files, so it is possible to add new btree stat types by + * appending to the enum list below. + */ +enum { + __XBTS_lookup = 0, + __XBTS_compare = 1, + __XBTS_insrec = 2, + __XBTS_delrec = 3, + __XBTS_newroot = 4, + __XBTS_killroot = 5, + __XBTS_increment = 6, + __XBTS_decrement = 7, + __XBTS_lshift = 8, + __XBTS_rshift = 9, + __XBTS_split = 10, + __XBTS_join = 11, + __XBTS_alloc = 12, + __XBTS_free = 13, + __XBTS_moves = 14, + + __XBTS_MAX = 15, +}; + /* * XFS global statistics */ -struct xfsstats { +struct __xfsstats { # define XFSSTAT_END_EXTENT_ALLOC 4 __uint32_t xs_allocx; __uint32_t xs_allocb; @@ -117,118 +145,20 @@ struct xfsstats { __uint32_t xb_page_found; __uint32_t xb_get_read; /* Version 2 btree counters */ -#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15) - __uint32_t xs_abtb_2_lookup; - __uint32_t xs_abtb_2_compare; - __uint32_t xs_abtb_2_insrec; - __uint32_t xs_abtb_2_delrec; - __uint32_t xs_abtb_2_newroot; - __uint32_t xs_abtb_2_killroot; - __uint32_t xs_abtb_2_increment; - __uint32_t xs_abtb_2_decrement; - __uint32_t xs_abtb_2_lshift; - __uint32_t xs_abtb_2_rshift; - __uint32_t xs_abtb_2_split; - __uint32_t xs_abtb_2_join; - __uint32_t xs_abtb_2_alloc; - __uint32_t xs_abtb_2_free; - __uint32_t xs_abtb_2_moves; -#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15) - __uint32_t xs_abtc_2_lookup; - __uint32_t xs_abtc_2_compare; - __uint32_t xs_abtc_2_insrec; - __uint32_t xs_abtc_2_delrec; - __uint32_t xs_abtc_2_newroot; - __uint32_t xs_abtc_2_killroot; - __uint32_t xs_abtc_2_increment; - __uint32_t xs_abtc_2_decrement; - __uint32_t xs_abtc_2_lshift; - __uint32_t xs_abtc_2_rshift; - __uint32_t xs_abtc_2_split; - __uint32_t xs_abtc_2_join; - __uint32_t xs_abtc_2_alloc; - __uint32_t xs_abtc_2_free; - __uint32_t xs_abtc_2_moves; -#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15) - __uint32_t xs_bmbt_2_lookup; - __uint32_t xs_bmbt_2_compare; - __uint32_t xs_bmbt_2_insrec; - __uint32_t xs_bmbt_2_delrec; - __uint32_t xs_bmbt_2_newroot; - __uint32_t xs_bmbt_2_killroot; - __uint32_t xs_bmbt_2_increment; - __uint32_t xs_bmbt_2_decrement; - __uint32_t xs_bmbt_2_lshift; - __uint32_t xs_bmbt_2_rshift; - __uint32_t xs_bmbt_2_split; - __uint32_t xs_bmbt_2_join; - __uint32_t xs_bmbt_2_alloc; - __uint32_t xs_bmbt_2_free; - __uint32_t xs_bmbt_2_moves; -#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15) - __uint32_t xs_ibt_2_lookup; - __uint32_t xs_ibt_2_compare; - __uint32_t xs_ibt_2_insrec; - __uint32_t xs_ibt_2_delrec; - __uint32_t xs_ibt_2_newroot; - __uint32_t xs_ibt_2_killroot; - __uint32_t xs_ibt_2_increment; - __uint32_t xs_ibt_2_decrement; - __uint32_t xs_ibt_2_lshift; - __uint32_t xs_ibt_2_rshift; - __uint32_t xs_ibt_2_split; - __uint32_t xs_ibt_2_join; - __uint32_t xs_ibt_2_alloc; - __uint32_t xs_ibt_2_free; - __uint32_t xs_ibt_2_moves; -#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15) - __uint32_t xs_fibt_2_lookup; - __uint32_t xs_fibt_2_compare; - __uint32_t xs_fibt_2_insrec; - __uint32_t xs_fibt_2_delrec; - __uint32_t xs_fibt_2_newroot; - __uint32_t xs_fibt_2_killroot; - __uint32_t xs_fibt_2_increment; - __uint32_t xs_fibt_2_decrement; - __uint32_t xs_fibt_2_lshift; - __uint32_t xs_fibt_2_rshift; - __uint32_t xs_fibt_2_split; - __uint32_t xs_fibt_2_join; - __uint32_t xs_fibt_2_alloc; - __uint32_t xs_fibt_2_free; - __uint32_t xs_fibt_2_moves; -#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2+15) - __uint32_t xs_rmap_2_lookup; - __uint32_t xs_rmap_2_compare; - __uint32_t xs_rmap_2_insrec; - __uint32_t xs_rmap_2_delrec; - __uint32_t xs_rmap_2_newroot; - __uint32_t xs_rmap_2_killroot; - __uint32_t xs_rmap_2_increment; - __uint32_t xs_rmap_2_decrement; - __uint32_t xs_rmap_2_lshift; - __uint32_t xs_rmap_2_rshift; - __uint32_t xs_rmap_2_split; - __uint32_t xs_rmap_2_join; - __uint32_t xs_rmap_2_alloc; - __uint32_t xs_rmap_2_free; - __uint32_t xs_rmap_2_moves; -#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + 15) - __uint32_t xs_refcbt_2_lookup; - __uint32_t xs_refcbt_2_compare; - __uint32_t xs_refcbt_2_insrec; - __uint32_t xs_refcbt_2_delrec; - __uint32_t xs_refcbt_2_newroot; - __uint32_t xs_refcbt_2_killroot; - __uint32_t xs_refcbt_2_increment; - __uint32_t xs_refcbt_2_decrement; - __uint32_t xs_refcbt_2_lshift; - __uint32_t xs_refcbt_2_rshift; - __uint32_t xs_refcbt_2_split; - __uint32_t xs_refcbt_2_join; - __uint32_t xs_refcbt_2_alloc; - __uint32_t xs_refcbt_2_free; - __uint32_t xs_refcbt_2_moves; +#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX) + __uint32_t xs_abtb_2[__XBTS_MAX]; +#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX) + __uint32_t xs_abtc_2[__XBTS_MAX]; +#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX) + __uint32_t xs_bmbt_2[__XBTS_MAX]; +#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX) + __uint32_t xs_ibt_2[__XBTS_MAX]; +#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX) + __uint32_t xs_fibt_2[__XBTS_MAX]; +#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX) + __uint32_t xs_rmap_2[__XBTS_MAX]; +#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX) + __uint32_t xs_refcbt_2[__XBTS_MAX]; #define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6) __uint32_t xs_qm_dqreclaims; __uint32_t xs_qm_dqreclaim_misses; @@ -245,26 +175,58 @@ struct xfsstats { __uint64_t xs_read_bytes; }; +struct xfsstats { + union { + struct __xfsstats s; + uint32_t a[XFSSTAT_END_XQMSTAT]; + }; +}; + +/* + * simple wrapper for getting the array index of s struct member offset + */ +#define XFS_STATS_CALC_INDEX(member) \ + (offsetof(struct __xfsstats, member) / (int)sizeof(__uint32_t)) + + int xfs_stats_format(struct xfsstats __percpu *stats, char *buf); void xfs_stats_clearall(struct xfsstats __percpu *stats); extern struct xstats xfsstats; #define XFS_STATS_INC(mp, v) \ do { \ - per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++; \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++; \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v++; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v++; \ } while (0) #define XFS_STATS_DEC(mp, v) \ do { \ - per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--; \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--; \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v--; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v--; \ } while (0) #define XFS_STATS_ADD(mp, v, inc) \ do { \ - per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc); \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc); \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v += (inc); \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v += (inc); \ +} while (0) + +#define XFS_STATS_INC_OFF(mp, off) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]++; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]++; \ +} while (0) + +#define XFS_STATS_DEC_OFF(mp, off) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]; \ +} while (0) + +#define XFS_STATS_ADD_OFF(mp, off, inc) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off] += (inc); \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off] += (inc); \ } while (0) #if defined(CONFIG_PROC_FS) -- cgit v1.2.3 From cae028df53449905c944603df624ac94bc619661 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 5 Dec 2016 14:40:32 +1100 Subject: xfs: optimise CRC updates Nick Piggin reported that the CRC overhead in an fsync heavy workload was higher than expected on a Power8 machine. Part of this was to do with the fact that the power8 CRC implementation is not efficient for CRC lengths of less than 512 bytes, and so the way we split the CRCs over the CRC field means a lot of the CRCs are reduced to being less than than optimal size. To optimise this, change the CRC update mechanism to zero the CRC field first, and then compute the CRC in one pass over the buffer and write the result back into the buffer. We can do this safely because anything writing a CRC has exclusive access to the buffer the CRC is being calculated over. We leave the CRC verify code the same - it still splits the CRC calculation - because we do not want read-only operations modifying the underlying buffer. This is because read-only operations may not have an exclusive access to the buffer guaranteed, and so temporary modifications could leak out to to other processes accessing the buffer concurrently. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_cksum.h | 26 ++++++++++++++++++++++---- fs/xfs/libxfs/xfs_inode_buf.c | 2 +- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log_recover.c | 12 +++++++----- 4 files changed, 31 insertions(+), 11 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h index fad1676ad8cd..a416c7cb23ea 100644 --- a/fs/xfs/libxfs/xfs_cksum.h +++ b/fs/xfs/libxfs/xfs_cksum.h @@ -6,10 +6,11 @@ /* * Calculate the intermediate checksum for a buffer that has the CRC field * inside it. The offset of the 32bit crc fields is passed as the - * cksum_offset parameter. + * cksum_offset parameter. We do not modify the buffer during verification, + * hence we have to split the CRC calculation across the cksum_offset. */ static inline __uint32_t -xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset) { __uint32_t zero = 0; __uint32_t crc; @@ -25,6 +26,20 @@ xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) length - (cksum_offset + sizeof(__be32))); } +/* + * Fast CRC method where the buffer is modified. Callers must have exclusive + * access to the buffer while the calculation takes place. + */ +static inline __uint32_t +xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset) +{ + /* zero the CRC field */ + *(__le32 *)(buffer + cksum_offset) = 0; + + /* single pass CRC calculation for the entire buffer */ + return crc32c(XFS_CRC_SEED, buffer, length); +} + /* * Convert the intermediate checksum to the final ondisk format. * @@ -40,11 +55,14 @@ xfs_end_cksum(__uint32_t crc) /* * Helper to generate the checksum for a buffer. + * + * This modifies the buffer temporarily - callers must have exclusive + * access to the buffer while the calculation takes place. */ static inline void xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + __uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset); *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); } @@ -55,7 +73,7 @@ xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) static inline int xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + __uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset); return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index c906e50515f0..9004665d679a 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -444,7 +444,7 @@ xfs_dinode_calc_crc( return; ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); - crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF); dip->di_crc = xfs_end_cksum(crc); } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3b74fa011bb1..3ebe444eb60f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1668,7 +1668,7 @@ xlog_cksum( __uint32_t crc; /* first generate the crc for the record header ... */ - crc = xfs_start_cksum((char *)rhead, + crc = xfs_start_cksum_update((char *)rhead, sizeof(struct xlog_rec_header), offsetof(struct xlog_rec_header, h_crc)); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9b3d7c76915d..56b7a2f6aaf2 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -5113,19 +5113,21 @@ xlog_recover_process( struct list_head *buffer_list) { int error; + __le32 old_crc = rhead->h_crc; __le32 crc; + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); /* * Nothing else to do if this is a CRC verification pass. Just return * if this a record with a non-zero crc. Unfortunately, mkfs always - * sets h_crc to 0 so we must consider this valid even on v5 supers. + * sets old_crc to 0 so we must consider this valid even on v5 supers. * Otherwise, return EFSBADCRC on failure so the callers up the stack * know precisely what failed. */ if (pass == XLOG_RECOVER_CRCPASS) { - if (rhead->h_crc && crc != rhead->h_crc) + if (old_crc && crc != old_crc) return -EFSBADCRC; return 0; } @@ -5136,11 +5138,11 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. */ - if (crc != rhead->h_crc) { - if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + if (crc != old_crc) { + if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", - le32_to_cpu(rhead->h_crc), + le32_to_cpu(old_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } -- cgit v1.2.3 From 6031e73a5b3f85ec45cac08ef90995b2d3f941c7 Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Wed, 7 Dec 2016 17:36:36 +1100 Subject: xfs: use rhashtable to track buffer cache On filesystems with a lot of metadata and in metadata intensive workloads xfs_buf_find() is showing up at the top of the CPU cycles trace. Most of the CPU time is spent on CPU cache misses while traversing the rbtree. As the buffer cache does not need any kind of ordering, but fast lookups a hashtable is the natural data structure to use. The rhashtable infrastructure provides a self-scaling hashtable implementation and allows lookups to proceed while the table is going through a resize operation. This reduces the CPU-time spent for the lookups to 1/3 even for small filesystems with a relatively small number of cached buffers, with possibly much larger gains on higher loaded filesystems. [dchinner: reduce minimum hash size to an acceptable size for large filesystems with many AGs with no active use.] [dchinner: remove stale rbtree asserts.] [dchinner: use xfs_buf_map for compare function argument.] [dchinner: make functions static.] [dchinner: remove redundant comments.] Signed-off-by: Lucas Stach Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_buf.c | 120 ++++++++++++++++++++++++++++++++--------------------- fs/xfs/xfs_buf.h | 2 +- fs/xfs/xfs_linux.h | 1 + fs/xfs/xfs_mount.c | 7 +++- fs/xfs/xfs_mount.h | 7 +++- 5 files changed, 85 insertions(+), 52 deletions(-) (limited to 'fs/xfs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b5b9bffe3520..516109424c96 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -219,7 +219,6 @@ _xfs_buf_alloc( init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); - RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ spin_lock_init(&bp->b_lock); XB_SET_OWNER(bp); @@ -473,6 +472,62 @@ _xfs_buf_map_pages( /* * Finding and Reading Buffers */ +static int +_xfs_buf_obj_cmp( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct xfs_buf_map *map = arg->key; + const struct xfs_buf *bp = obj; + + /* + * The key hashing in the lookup path depends on the key being the + * first element of the compare_arg, make sure to assert this. + */ + BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); + + if (bp->b_bn != map->bm_bn) + return 1; + + if (unlikely(bp->b_length != map->bm_len)) { + /* + * found a block number match. If the range doesn't + * match, the only way this is allowed is if the buffer + * in the cache is stale and the transaction that made + * it stale has not yet committed. i.e. we are + * reallocating a busy extent. Skip this buffer and + * continue searching for an exact match. + */ + ASSERT(bp->b_flags & XBF_STALE); + return 1; + } + return 0; +} + +static const struct rhashtable_params xfs_buf_hash_params = { + .min_size = 32, /* empty AGs have minimal footprint */ + .nelem_hint = 16, + .key_len = sizeof(xfs_daddr_t), + .key_offset = offsetof(struct xfs_buf, b_bn), + .head_offset = offsetof(struct xfs_buf, b_rhash_head), + .automatic_shrinking = true, + .obj_cmpfn = _xfs_buf_obj_cmp, +}; + +int +xfs_buf_hash_init( + struct xfs_perag *pag) +{ + spin_lock_init(&pag->pag_buf_lock); + return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params); +} + +void +xfs_buf_hash_destroy( + struct xfs_perag *pag) +{ + rhashtable_destroy(&pag->pag_buf_hash); +} /* * Look up, and creates if absent, a lockable buffer for @@ -488,27 +543,24 @@ _xfs_buf_find( xfs_buf_t *new_bp) { struct xfs_perag *pag; - struct rb_node **rbp; - struct rb_node *parent; xfs_buf_t *bp; - xfs_daddr_t blkno = map[0].bm_bn; + struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; xfs_daddr_t eofs; - int numblks = 0; int i; for (i = 0; i < nmaps; i++) - numblks += map[i].bm_len; + cmap.bm_len += map[i].bm_len; /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize)); - ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); + ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. */ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (blkno < 0 || blkno >= eofs) { + if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) { /* * XXX (dgc): we should really be returning -EFSCORRUPTED here, * but none of the higher level infrastructure supports @@ -516,53 +568,29 @@ _xfs_buf_find( */ xfs_alert(btp->bt_mount, "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", - __func__, blkno, eofs); + __func__, cmap.bm_bn, eofs); WARN_ON(1); return NULL; } - /* get tree root */ pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, blkno)); + xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); - /* walk tree */ spin_lock(&pag->pag_buf_lock); - rbp = &pag->pag_buf_tree.rb_node; - parent = NULL; - bp = NULL; - while (*rbp) { - parent = *rbp; - bp = rb_entry(parent, struct xfs_buf, b_rbnode); - - if (blkno < bp->b_bn) - rbp = &(*rbp)->rb_left; - else if (blkno > bp->b_bn) - rbp = &(*rbp)->rb_right; - else { - /* - * found a block number match. If the range doesn't - * match, the only way this is allowed is if the buffer - * in the cache is stale and the transaction that made - * it stale has not yet committed. i.e. we are - * reallocating a busy extent. Skip this buffer and - * continue searching to the right for an exact match. - */ - if (bp->b_length != numblks) { - ASSERT(bp->b_flags & XBF_STALE); - rbp = &(*rbp)->rb_right; - continue; - } - atomic_inc(&bp->b_hold); - goto found; - } + bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap, + xfs_buf_hash_params); + if (bp) { + atomic_inc(&bp->b_hold); + goto found; } /* No match found */ if (new_bp) { - rb_link_node(&new_bp->b_rbnode, parent, rbp); - rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); /* the buffer keeps the perag reference until it is freed */ new_bp->b_pag = pag; + rhashtable_insert_fast(&pag->pag_buf_hash, + &new_bp->b_rhash_head, + xfs_buf_hash_params); spin_unlock(&pag->pag_buf_lock); } else { XFS_STATS_INC(btp->bt_mount, xb_miss_locked); @@ -930,7 +958,6 @@ xfs_buf_rele( if (!pag) { ASSERT(list_empty(&bp->b_lru)); - ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) { xfs_buf_ioacct_dec(bp); xfs_buf_free(bp); @@ -938,8 +965,6 @@ xfs_buf_rele( return; } - ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); - ASSERT(atomic_read(&bp->b_hold) > 0); release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); @@ -983,7 +1008,8 @@ xfs_buf_rele( } ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head, + xfs_buf_hash_params); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); freebuf = true; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 38c9a95bda7d..8a9d3a9599f0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -151,7 +151,7 @@ typedef struct xfs_buf { * which is the only bit that is touched if we hit the semaphore * fast-path on locking. */ - struct rb_node b_rbnode; /* rbtree node */ + struct rhash_head b_rhash_head; /* pag buffer hash node */ xfs_daddr_t b_bn; /* block number of buffer */ int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 68640fb63a54..a415f822f2c1 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -78,6 +78,7 @@ typedef __u32 xfs_nlink_t; #include #include #include +#include #include #include diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b341f10cf481..9b9540db17a6 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -157,6 +157,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); } } @@ -212,8 +213,8 @@ xfs_initialize_perag( spin_lock_init(&pag->pag_ici_lock); mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - spin_lock_init(&pag->pag_buf_lock); - pag->pag_buf_tree = RB_ROOT; + if (xfs_buf_hash_init(pag)) + goto out_unwind; if (radix_tree_preload(GFP_NOFS)) goto out_unwind; @@ -239,9 +240,11 @@ xfs_initialize_perag( return 0; out_unwind: + xfs_buf_hash_destroy(pag); kmem_free(pag); for (; index > first_initialised; index--) { pag = radix_tree_delete(&mp->m_perag_tree, index); + xfs_buf_hash_destroy(pag); kmem_free(pag); } return error; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 819b80b15bfb..84f785218907 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -393,8 +393,8 @@ typedef struct xfs_perag { unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ /* buffer cache index */ - spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ - struct rb_root pag_buf_tree; /* ordered tree of active buffers */ + spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ + struct rhashtable pag_buf_hash; /* for rcu-safe freeing */ struct rcu_head rcu_head; @@ -424,6 +424,9 @@ xfs_perag_resv( } } +int xfs_buf_hash_init(xfs_perag_t *pag); +void xfs_buf_hash_destroy(xfs_perag_t *pag); + extern void xfs_uuid_table_free(void); extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); -- cgit v1.2.3