diff options
Diffstat (limited to 'fs/xfs/scrub')
-rw-r--r-- | fs/xfs/scrub/agheader.c | 20 | ||||
-rw-r--r-- | fs/xfs/scrub/common.c | 47 | ||||
-rw-r--r-- | fs/xfs/scrub/common.h | 4 | ||||
-rw-r--r-- | fs/xfs/scrub/fscounters.c | 366 | ||||
-rw-r--r-- | fs/xfs/scrub/health.c | 237 | ||||
-rw-r--r-- | fs/xfs/scrub/health.h | 14 | ||||
-rw-r--r-- | fs/xfs/scrub/ialloc.c | 4 | ||||
-rw-r--r-- | fs/xfs/scrub/parent.c | 2 | ||||
-rw-r--r-- | fs/xfs/scrub/quota.c | 2 | ||||
-rw-r--r-- | fs/xfs/scrub/repair.c | 34 | ||||
-rw-r--r-- | fs/xfs/scrub/repair.h | 5 | ||||
-rw-r--r-- | fs/xfs/scrub/scrub.c | 49 | ||||
-rw-r--r-- | fs/xfs/scrub/scrub.h | 27 | ||||
-rw-r--r-- | fs/xfs/scrub/trace.h | 63 |
14 files changed, 821 insertions, 53 deletions
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index ddf06bfaa29d..adaeabdefdd3 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -514,6 +514,7 @@ xchk_agf( { struct xfs_mount *mp = sc->mp; struct xfs_agf *agf; + struct xfs_perag *pag; xfs_agnumber_t agno; xfs_agblock_t agbno; xfs_agblock_t eoag; @@ -586,6 +587,16 @@ xchk_agf( if (agfl_count != 0 && fl_count != agfl_count) xchk_block_set_corrupt(sc, sc->sa.agf_bp); + /* Do the incore counters match? */ + pag = xfs_perag_get(mp, agno); + if (pag->pagf_freeblks != be32_to_cpu(agf->agf_freeblks)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + if (pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks)) + xchk_block_set_corrupt(sc, sc->sa.agf_bp); + xfs_perag_put(pag); + xchk_agf_xref(sc); out: return error; @@ -811,6 +822,7 @@ xchk_agi( { struct xfs_mount *mp = sc->mp; struct xfs_agi *agi; + struct xfs_perag *pag; xfs_agnumber_t agno; xfs_agblock_t agbno; xfs_agblock_t eoag; @@ -881,6 +893,14 @@ xchk_agi( if (agi->agi_pad32 != cpu_to_be32(0)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); + /* Do the incore counters match? */ + pag = xfs_perag_get(mp, agno); + if (pag->pagi_count != be32_to_cpu(agi->agi_count)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount)) + xchk_block_set_corrupt(sc, sc->sa.agi_bp); + xfs_perag_put(pag); + xchk_agi_xref(sc); out: return error; diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 0c54ff55b901..973aa59975e3 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -38,6 +38,7 @@ #include "scrub/trace.h" #include "scrub/btree.h" #include "scrub/repair.h" +#include "scrub/health.h" /* Common code for the metadata scrubbers. */ @@ -208,6 +209,15 @@ xchk_ino_set_preen( trace_xchk_ino_preen(sc, ino, __return_address); } +/* Record something being wrong with the filesystem primary superblock. */ +void +xchk_set_corrupt( + struct xfs_scrub *sc) +{ + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + trace_xchk_fs_error(sc, 0, __return_address); +} + /* Record a corrupt block. */ void xchk_block_set_corrupt( @@ -458,13 +468,18 @@ xchk_ag_btcur_init( struct xfs_mount *mp = sc->mp; xfs_agnumber_t agno = sa->agno; - if (sa->agf_bp) { + xchk_perag_get(sc->mp, sa); + if (sa->agf_bp && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) { /* Set up a bnobt cursor for cross-referencing. */ sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, agno, XFS_BTNUM_BNO); if (!sa->bno_cur) goto err; + } + if (sa->agf_bp && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) { /* Set up a cntbt cursor for cross-referencing. */ sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp, agno, XFS_BTNUM_CNT); @@ -473,7 +488,8 @@ xchk_ag_btcur_init( } /* Set up a inobt cursor for cross-referencing. */ - if (sa->agi_bp) { + if (sa->agi_bp && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) { sa->ino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, agno, XFS_BTNUM_INO); if (!sa->ino_cur) @@ -481,7 +497,8 @@ xchk_ag_btcur_init( } /* Set up a finobt cursor for cross-referencing. */ - if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb)) { + if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb) && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, agno, XFS_BTNUM_FINO); if (!sa->fino_cur) @@ -489,7 +506,8 @@ xchk_ag_btcur_init( } /* Set up a rmapbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb) && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) { sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, agno); if (!sa->rmap_cur) @@ -497,7 +515,8 @@ xchk_ag_btcur_init( } /* Set up a refcountbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb)) { + if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb) && + xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) { sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, sa->agf_bp, agno); if (!sa->refc_cur) @@ -884,3 +903,21 @@ xchk_ilock_inverted( } return -EDEADLOCK; } + +/* Pause background reaping of resources. */ +void +xchk_stop_reaping( + struct xfs_scrub *sc) +{ + sc->flags |= XCHK_REAPING_DISABLED; + xfs_stop_block_reaping(sc->mp); +} + +/* Restart background reaping of resources. */ +void +xchk_start_reaping( + struct xfs_scrub *sc) +{ + xfs_start_block_reaping(sc->mp); + sc->flags &= ~XCHK_REAPING_DISABLED; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index e26a430bd466..003a772cd26c 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -39,6 +39,7 @@ void xchk_block_set_preen(struct xfs_scrub *sc, struct xfs_buf *bp); void xchk_ino_set_preen(struct xfs_scrub *sc, xfs_ino_t ino); +void xchk_set_corrupt(struct xfs_scrub *sc); void xchk_block_set_corrupt(struct xfs_scrub *sc, struct xfs_buf *bp); void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino); @@ -105,6 +106,7 @@ xchk_setup_quota(struct xfs_scrub *sc, struct xfs_inode *ip) return -ENOENT; } #endif +int xchk_setup_fscounters(struct xfs_scrub *sc, struct xfs_inode *ip); void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno, @@ -137,5 +139,7 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm) int xchk_metadata_inode_forks(struct xfs_scrub *sc); int xchk_ilock_inverted(struct xfs_inode *ip, uint lock_mode); +void xchk_stop_reaping(struct xfs_scrub *sc); +void xchk_start_reaping(struct xfs_scrub *sc); #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c new file mode 100644 index 000000000000..07c11e3e6437 --- /dev/null +++ b/fs/xfs/scrub/fscounters.c @@ -0,0 +1,366 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_error.h" +#include "xfs_errortag.h" +#include "xfs_icache.h" +#include "xfs_health.h" +#include "xfs_bmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* + * FS Summary Counters + * =================== + * + * The basics of filesystem summary counter checking are that we iterate the + * AGs counting the number of free blocks, free space btree blocks, per-AG + * reservations, inodes, delayed allocation reservations, and free inodes. + * Then we compare what we computed against the in-core counters. + * + * However, the reality is that summary counters are a tricky beast to check. + * While we /could/ freeze the filesystem and scramble around the AGs counting + * the free blocks, in practice we prefer not do that for a scan because + * freezing is costly. To get around this, we added a per-cpu counter of the + * delalloc reservations so that we can rotor around the AGs relatively + * quickly, and we allow the counts to be slightly off because we're not taking + * any locks while we do this. + * + * So the first thing we do is warm up the buffer cache in the setup routine by + * walking all the AGs to make sure the incore per-AG structure has been + * initialized. The expected value calculation then iterates the incore per-AG + * structures as quickly as it can. We snapshot the percpu counters before and + * after this operation and use the difference in counter values to guess at + * our tolerance for mismatch between expected and actual counter values. + */ + +/* + * Since the expected value computation is lockless but only browses incore + * values, the percpu counters should be fairly close to each other. However, + * we'll allow ourselves to be off by at least this (arbitrary) amount. + */ +#define XCHK_FSCOUNT_MIN_VARIANCE (512) + +/* + * Make sure the per-AG structure has been initialized from the on-disk header + * contents and trust that the incore counters match the ondisk counters. (The + * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the + * summary counters after checking all AG headers). Do this from the setup + * function so that the inner AG aggregation loop runs as quickly as possible. + * + * This function runs during the setup phase /before/ we start checking any + * metadata. + */ +STATIC int +xchk_fscount_warmup( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *agi_bp = NULL; + struct xfs_buf *agf_bp = NULL; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno; + int error = 0; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + + if (pag->pagi_init && pag->pagf_init) + goto next_loop_perag; + + /* Lock both AG headers. */ + error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp); + if (error) + break; + error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp); + if (error) + break; + error = -ENOMEM; + if (!agf_bp || !agi_bp) + break; + + /* + * These are supposed to be initialized by the header read + * function. + */ + error = -EFSCORRUPTED; + if (!pag->pagi_init || !pag->pagf_init) + break; + + xfs_buf_relse(agf_bp); + agf_bp = NULL; + xfs_buf_relse(agi_bp); + agi_bp = NULL; +next_loop_perag: + xfs_perag_put(pag); + pag = NULL; + error = 0; + + if (fatal_signal_pending(current)) + break; + } + + if (agf_bp) + xfs_buf_relse(agf_bp); + if (agi_bp) + xfs_buf_relse(agi_bp); + if (pag) + xfs_perag_put(pag); + return error; +} + +int +xchk_setup_fscounters( + struct xfs_scrub *sc, + struct xfs_inode *ip) +{ + struct xchk_fscounters *fsc; + int error; + + sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), KM_SLEEP); + if (!sc->buf) + return -ENOMEM; + fsc = sc->buf; + + xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); + + /* We must get the incore counters set up before we can proceed. */ + error = xchk_fscount_warmup(sc); + if (error) + return error; + + /* + * Pause background reclaim while we're scrubbing to reduce the + * likelihood of background perturbations to the counters throwing off + * our calculations. + */ + xchk_stop_reaping(sc); + + return xchk_trans_alloc(sc, 0); +} + +/* + * Calculate what the global in-core counters ought to be from the incore + * per-AG structure. Callers can compare this to the actual in-core counters + * to estimate by how much both in-core and on-disk counters need to be + * adjusted. + */ +STATIC int +xchk_fscount_aggregate_agcounts( + struct xfs_scrub *sc, + struct xchk_fscounters *fsc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_perag *pag; + uint64_t delayed; + xfs_agnumber_t agno; + int tries = 8; + +retry: + fsc->icount = 0; + fsc->ifree = 0; + fsc->fdblocks = 0; + + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + pag = xfs_perag_get(mp, agno); + + /* This somehow got unset since the warmup? */ + if (!pag->pagi_init || !pag->pagf_init) { + xfs_perag_put(pag); + return -EFSCORRUPTED; + } + + /* Count all the inodes */ + fsc->icount += pag->pagi_count; + fsc->ifree += pag->pagi_freecount; + + /* Add up the free/freelist/bnobt/cntbt blocks */ + fsc->fdblocks += pag->pagf_freeblks; + fsc->fdblocks += pag->pagf_flcount; + fsc->fdblocks += pag->pagf_btreeblks; + + /* + * Per-AG reservations are taken out of the incore counters, + * so they must be left out of the free blocks computation. + */ + fsc->fdblocks -= pag->pag_meta_resv.ar_reserved; + fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved; + + xfs_perag_put(pag); + + if (fatal_signal_pending(current)) + break; + } + + /* + * The global incore space reservation is taken from the incore + * counters, so leave that out of the computation. + */ + fsc->fdblocks -= mp->m_resblks_avail; + + /* + * Delayed allocation reservations are taken out of the incore counters + * but not recorded on disk, so leave them and their indlen blocks out + * of the computation. + */ + delayed = percpu_counter_sum(&mp->m_delalloc_blks); + fsc->fdblocks -= delayed; + + trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks, + delayed); + + + /* Bail out if the values we compute are totally nonsense. */ + if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max || + fsc->fdblocks > mp->m_sb.sb_dblocks || + fsc->ifree > fsc->icount_max) + return -EFSCORRUPTED; + + /* + * If ifree > icount then we probably had some perturbation in the + * counters while we were calculating things. We'll try a few times + * to maintain ifree <= icount before giving up. + */ + if (fsc->ifree > fsc->icount) { + if (tries--) + goto retry; + xchk_set_incomplete(sc); + return 0; + } + + return 0; +} + +/* + * Is the @counter reasonably close to the @expected value? + * + * We neither locked nor froze anything in the filesystem while aggregating the + * per-AG data to compute the @expected value, which means that the counter + * could have changed. We know the @old_value of the summation of the counter + * before the aggregation, and we re-sum the counter now. If the expected + * value falls between the two summations, we're ok. + * + * Otherwise, we /might/ have a problem. If the change in the summations is + * more than we want to tolerate, the filesystem is probably busy and we should + * just send back INCOMPLETE and see if userspace will try again. + */ +static inline bool +xchk_fscount_within_range( + struct xfs_scrub *sc, + const int64_t old_value, + struct percpu_counter *counter, + uint64_t expected) +{ + int64_t min_value, max_value; + int64_t curr_value = percpu_counter_sum(counter); + + trace_xchk_fscounters_within_range(sc->mp, expected, curr_value, + old_value); + + /* Negative values are always wrong. */ + if (curr_value < 0) + return false; + + /* Exact matches are always ok. */ + if (curr_value == expected) + return true; + + min_value = min(old_value, curr_value); + max_value = max(old_value, curr_value); + + /* Within the before-and-after range is ok. */ + if (expected >= min_value && expected <= max_value) + return true; + + /* + * If the difference between the two summations is too large, the fs + * might just be busy and so we'll mark the scrub incomplete. Return + * true here so that we don't mark the counter corrupt. + * + * XXX: In the future when userspace can grant scrub permission to + * quiesce the filesystem to solve the outsized variance problem, this + * check should be moved up and the return code changed to signal to + * userspace that we need quiesce permission. + */ + if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) { + xchk_set_incomplete(sc); + return true; + } + + return false; +} + +/* Check the superblock counters. */ +int +xchk_fscounters( + struct xfs_scrub *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xchk_fscounters *fsc = sc->buf; + int64_t icount, ifree, fdblocks; + int error; + + /* Snapshot the percpu counters. */ + icount = percpu_counter_sum(&mp->m_icount); + ifree = percpu_counter_sum(&mp->m_ifree); + fdblocks = percpu_counter_sum(&mp->m_fdblocks); + + /* No negative values, please! */ + if (icount < 0 || ifree < 0 || fdblocks < 0) + xchk_set_corrupt(sc); + + /* See if icount is obviously wrong. */ + if (icount < fsc->icount_min || icount > fsc->icount_max) + xchk_set_corrupt(sc); + + /* See if fdblocks is obviously wrong. */ + if (fdblocks > mp->m_sb.sb_dblocks) + xchk_set_corrupt(sc); + + /* + * If ifree exceeds icount by more than the minimum variance then + * something's probably wrong with the counters. + */ + if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE) + xchk_set_corrupt(sc); + + /* Walk the incore AG headers to calculate the expected counters. */ + error = xchk_fscount_aggregate_agcounts(sc, fsc); + if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) + return error; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) + return 0; + + /* Compare the in-core counters with whatever we counted. */ + if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) + xchk_set_corrupt(sc); + + if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) + xchk_set_corrupt(sc); + + if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, + fsc->fdblocks)) + xchk_set_corrupt(sc); + + return 0; +} diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c new file mode 100644 index 000000000000..23cf8e2f25db --- /dev/null +++ b/fs/xfs/scrub/health.c @@ -0,0 +1,237 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_health.h" +#include "scrub/scrub.h" +#include "scrub/health.h" + +/* + * Scrub and In-Core Filesystem Health Assessments + * =============================================== + * + * Online scrub and repair have the time and the ability to perform stronger + * checks than we can do from the metadata verifiers, because they can + * cross-reference records between data structures. Therefore, scrub is in a + * good position to update the online filesystem health assessments to reflect + * the good/bad state of the data structure. + * + * We therefore extend scrub in the following ways to achieve this: + * + * 1. Create a "sick_mask" field in the scrub context. When we're setting up a + * scrub call, set this to the default XFS_SICK_* flag(s) for the selected + * scrub type (call it A). Scrub and repair functions can override the default + * sick_mask value if they choose. + * + * 2. If the scrubber returns a runtime error code, we exit making no changes + * to the incore sick state. + * + * 3. If the scrubber finds that A is clean, use sick_mask to clear the incore + * sick flags before exiting. + * + * 4. If the scrubber finds that A is corrupt, use sick_mask to set the incore + * sick flags. If the user didn't want to repair then we exit, leaving the + * metadata structure unfixed and the sick flag set. + * + * 5. Now we know that A is corrupt and the user wants to repair, so run the + * repairer. If the repairer returns an error code, we exit with that error + * code, having made no further changes to the incore sick state. + * + * 6. If repair rebuilds A correctly and the subsequent re-scrub of A is clean, + * use sick_mask to clear the incore sick flags. This should have the effect + * that A is no longer marked sick. + * + * 7. If repair rebuilds A incorrectly, the re-scrub will find it corrupt and + * use sick_mask to set the incore sick flags. This should have no externally + * visible effect since we already set them in step (4). + * + * There are some complications to this story, however. For certain types of + * complementary metadata indices (e.g. inobt/finobt), it is easier to rebuild + * both structures at the same time. The following principles apply to this + * type of repair strategy: + * + * 8. Any repair function that rebuilds multiple structures should update + * sick_mask_visible to reflect whatever other structures are rebuilt, and + * verify that all the rebuilt structures can pass a scrub check. The outcomes + * of 5-7 still apply, but with a sick_mask that covers everything being + * rebuilt. + */ + +/* Map our scrub type to a sick mask and a set of health update functions. */ + +enum xchk_health_group { + XHG_FS = 1, + XHG_RT, + XHG_AG, + XHG_INO, +}; + +struct xchk_health_map { + enum xchk_health_group group; + unsigned int sick_mask; +}; + +static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = { + [XFS_SCRUB_TYPE_SB] = { XHG_AG, XFS_SICK_AG_SB }, + [XFS_SCRUB_TYPE_AGF] = { XHG_AG, XFS_SICK_AG_AGF }, + [XFS_SCRUB_TYPE_AGFL] = { XHG_AG, XFS_SICK_AG_AGFL }, + [XFS_SCRUB_TYPE_AGI] = { XHG_AG, XFS_SICK_AG_AGI }, + [XFS_SCRUB_TYPE_BNOBT] = { XHG_AG, XFS_SICK_AG_BNOBT }, + [XFS_SCRUB_TYPE_CNTBT] = { XHG_AG, XFS_SICK_AG_CNTBT }, + [XFS_SCRUB_TYPE_INOBT] = { XHG_AG, XFS_SICK_AG_INOBT }, + [XFS_SCRUB_TYPE_FINOBT] = { XHG_AG, XFS_SICK_AG_FINOBT }, + [XFS_SCRUB_TYPE_RMAPBT] = { XHG_AG, XFS_SICK_AG_RMAPBT }, + [XFS_SCRUB_TYPE_REFCNTBT] = { XHG_AG, XFS_SICK_AG_REFCNTBT }, + [XFS_SCRUB_TYPE_INODE] = { XHG_INO, XFS_SICK_INO_CORE }, + [XFS_SCRUB_TYPE_BMBTD] = { XHG_INO, XFS_SICK_INO_BMBTD }, + [XFS_SCRUB_TYPE_BMBTA] = { XHG_INO, XFS_SICK_INO_BMBTA }, + [XFS_SCRUB_TYPE_BMBTC] = { XHG_INO, XFS_SICK_INO_BMBTC }, + [XFS_SCRUB_TYPE_DIR] = { XHG_INO, XFS_SICK_INO_DIR }, + [XFS_SCRUB_TYPE_XATTR] = { XHG_INO, XFS_SICK_INO_XATTR }, + [XFS_SCRUB_TYPE_SYMLINK] = { XHG_INO, XFS_SICK_INO_SYMLINK }, + [XFS_SCRUB_TYPE_PARENT] = { XHG_INO, XFS_SICK_INO_PARENT }, + [XFS_SCRUB_TYPE_RTBITMAP] = { XHG_RT, XFS_SICK_RT_BITMAP }, + [XFS_SCRUB_TYPE_RTSUM] = { XHG_RT, XFS_SICK_RT_SUMMARY }, + [XFS_SCRUB_TYPE_UQUOTA] = { XHG_FS, XFS_SICK_FS_UQUOTA }, + [XFS_SCRUB_TYPE_GQUOTA] = { XHG_FS, XFS_SICK_FS_GQUOTA }, + [XFS_SCRUB_TYPE_PQUOTA] = { XHG_FS, XFS_SICK_FS_PQUOTA }, + [XFS_SCRUB_TYPE_FSCOUNTERS] = { XHG_FS, XFS_SICK_FS_COUNTERS }, +}; + +/* Return the health status mask for this scrub type. */ +unsigned int +xchk_health_mask_for_scrub_type( + __u32 scrub_type) +{ + return type_to_health_flag[scrub_type].sick_mask; +} + +/* + * Update filesystem health assessments based on what we found and did. + * + * If the scrubber finds errors, we mark sick whatever's mentioned in + * sick_mask, no matter whether this is a first scan or an + * evaluation of repair effectiveness. + * + * Otherwise, no direct corruption was found, so mark whatever's in + * sick_mask as healthy. + */ +void +xchk_update_health( + struct xfs_scrub *sc) +{ + struct xfs_perag *pag; + bool bad; + + if (!sc->sick_mask) + return; + + bad = (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT); + switch (type_to_health_flag[sc->sm->sm_type].group) { + case XHG_AG: + pag = xfs_perag_get(sc->mp, sc->sm->sm_agno); + if (bad) + xfs_ag_mark_sick(pag, sc->sick_mask); + else + xfs_ag_mark_healthy(pag, sc->sick_mask); + xfs_perag_put(pag); + break; + case XHG_INO: + if (!sc->ip) + return; + if (bad) + xfs_inode_mark_sick(sc->ip, sc->sick_mask); + else + xfs_inode_mark_healthy(sc->ip, sc->sick_mask); + break; + case XHG_FS: + if (bad) + xfs_fs_mark_sick(sc->mp, sc->sick_mask); + else + xfs_fs_mark_healthy(sc->mp, sc->sick_mask); + break; + case XHG_RT: + if (bad) + xfs_rt_mark_sick(sc->mp, sc->sick_mask); + else + xfs_rt_mark_healthy(sc->mp, sc->sick_mask); + break; + default: + ASSERT(0); + break; + } +} + +/* Is the given per-AG btree healthy enough for scanning? */ +bool +xchk_ag_btree_healthy_enough( + struct xfs_scrub *sc, + struct xfs_perag *pag, + xfs_btnum_t btnum) +{ + unsigned int mask = 0; + + /* + * We always want the cursor if it's the same type as whatever we're + * scrubbing, even if we already know the structure is corrupt. + * + * Otherwise, we're only interested in the btree for cross-referencing. + * If we know the btree is bad then don't bother, just set XFAIL. + */ + switch (btnum) { + case XFS_BTNUM_BNO: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT) + return true; + mask = XFS_SICK_AG_BNOBT; + break; + case XFS_BTNUM_CNT: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT) + return true; + mask = XFS_SICK_AG_CNTBT; + break; + case XFS_BTNUM_INO: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT) + return true; + mask = XFS_SICK_AG_INOBT; + break; + case XFS_BTNUM_FINO: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT) + return true; + mask = XFS_SICK_AG_FINOBT; + break; + case XFS_BTNUM_RMAP: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT) + return true; + mask = XFS_SICK_AG_RMAPBT; + break; + case XFS_BTNUM_REFC: + if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT) + return true; + mask = XFS_SICK_AG_REFCNTBT; + break; + default: + ASSERT(0); + return true; + } + + if (xfs_ag_has_sickness(pag, mask)) { + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL; + return false; + } + + return true; +} diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h new file mode 100644 index 000000000000..d0b938d3d028 --- /dev/null +++ b/fs/xfs/scrub/health.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef __XFS_SCRUB_HEALTH_H__ +#define __XFS_SCRUB_HEALTH_H__ + +unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type); +void xchk_update_health(struct xfs_scrub *sc); +bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag, + xfs_btnum_t btnum); + +#endif /* __XFS_SCRUB_HEALTH_H__ */ diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 700114f79a7d..693eb51f5efb 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -39,7 +39,7 @@ xchk_setup_ag_iallocbt( struct xfs_scrub *sc, struct xfs_inode *ip) { - return xchk_setup_ag_btree(sc, ip, sc->try_harder); + return xchk_setup_ag_btree(sc, ip, sc->flags & XCHK_TRY_HARDER); } /* Inode btree scrubber. */ @@ -185,7 +185,7 @@ xchk_iallocbt_check_cluster_ifree( if (error == -ENODATA) { /* Not cached, just read the disk buffer */ freemask_ok = irec_free ^ !!(dip->di_mode); - if (!bs->sc->try_harder && !freemask_ok) + if (!(bs->sc->flags & XCHK_TRY_HARDER) && !freemask_ok) return -EDEADLOCK; } else if (error < 0) { /* diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 1c9d7c7f64f5..d5d197f1b80f 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -320,7 +320,7 @@ out: * If we failed to lock the parent inode even after a retry, just mark * this scrub incomplete and return. */ - if (sc->try_harder && error == -EDEADLOCK) { + if ((sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { error = 0; xchk_set_incomplete(sc); } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 782d582d3edd..5dfe2b5924db 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -60,7 +60,7 @@ xchk_setup_quota( dqtype = xchk_quota_to_dqtype(sc); if (dqtype == 0) return -EINVAL; - sc->has_quotaofflock = true; + sc->flags |= XCHK_HAS_QUOTAOFFLOCK; mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); if (!xfs_this_quota_on(sc->mp, dqtype)) return -ENOENT; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index f28f4bad317b..eb358f0f5e0a 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -46,8 +46,7 @@ int xrep_attempt( struct xfs_inode *ip, - struct xfs_scrub *sc, - bool *fixed) + struct xfs_scrub *sc) { int error = 0; @@ -66,13 +65,13 @@ xrep_attempt( * scrub so that we can tell userspace if we fixed the problem. */ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; - *fixed = true; + sc->flags |= XREP_ALREADY_FIXED; return -EAGAIN; case -EDEADLOCK: case -EAGAIN: /* Tell the caller to try again having grabbed all the locks. */ - if (!sc->try_harder) { - sc->try_harder = true; + if (!(sc->flags & XCHK_TRY_HARDER)) { + sc->flags |= XCHK_TRY_HARDER; return -EAGAIN; } /* @@ -137,10 +136,16 @@ xrep_roll_ag_trans( if (sc->sa.agfl_bp) xfs_trans_bhold(sc->tp, sc->sa.agfl_bp); - /* Roll the transaction. */ + /* + * Roll the transaction. We still own the buffer and the buffer lock + * regardless of whether or not the roll succeeds. If the roll fails, + * the buffers will be released during teardown on our way out of the + * kernel. If it succeeds, we join them to the new transaction and + * move on. + */ error = xfs_trans_roll(&sc->tp); if (error) - goto out_release; + return error; /* Join AG headers to the new transaction. */ if (sc->sa.agi_bp) @@ -151,21 +156,6 @@ xrep_roll_ag_trans( xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp); return 0; - -out_release: - /* - * Rolling failed, so release the hold on the buffers. The - * buffers will be released during teardown on our way out - * of the kernel. - */ - if (sc->sa.agi_bp) - xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp); - if (sc->sa.agf_bp) - xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp); - if (sc->sa.agfl_bp) - xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp); - - return error; } /* diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index d990314eb08b..60c61d7052a8 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -15,7 +15,7 @@ static inline int xrep_notsupported(struct xfs_scrub *sc) /* Repair helpers */ -int xrep_attempt(struct xfs_inode *ip, struct xfs_scrub *sc, bool *fixed); +int xrep_attempt(struct xfs_inode *ip, struct xfs_scrub *sc); void xrep_failure(struct xfs_mount *mp); int xrep_roll_ag_trans(struct xfs_scrub *sc); bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, @@ -64,8 +64,7 @@ int xrep_agi(struct xfs_scrub *sc); static inline int xrep_attempt( struct xfs_inode *ip, - struct xfs_scrub *sc, - bool *fixed) + struct xfs_scrub *sc) { return -EOPNOTSUPP; } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 1b2344d00525..f630389ee176 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -40,6 +40,7 @@ #include "scrub/trace.h" #include "scrub/btree.h" #include "scrub/repair.h" +#include "scrub/health.h" /* * Online Scrub and Repair @@ -186,8 +187,12 @@ xchk_teardown( xfs_irele(sc->ip); sc->ip = NULL; } - if (sc->has_quotaofflock) + if (sc->flags & XCHK_REAPING_DISABLED) + xchk_start_reaping(sc); + if (sc->flags & XCHK_HAS_QUOTAOFFLOCK) { mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); + sc->flags &= ~XCHK_HAS_QUOTAOFFLOCK; + } if (sc->buf) { kmem_free(sc->buf); sc->buf = NULL; @@ -347,6 +352,12 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .scrub = xchk_quota, .repair = xrep_notsupported, }, + [XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */ + .type = ST_FS, + .setup = xchk_setup_fscounters, + .scrub = xchk_fscounters, + .repair = xrep_notsupported, + }, }; /* This isn't a stable feature, warn once per day. */ @@ -466,10 +477,14 @@ xfs_scrub_metadata( struct xfs_inode *ip, struct xfs_scrub_metadata *sm) { - struct xfs_scrub sc; + struct xfs_scrub sc = { + .mp = ip->i_mount, + .sm = sm, + .sa = { + .agno = NULLAGNUMBER, + }, + }; struct xfs_mount *mp = ip->i_mount; - bool try_harder = false; - bool already_fixed = false; int error = 0; BUILD_BUG_ON(sizeof(meta_scrub_ops) != @@ -491,21 +506,17 @@ xfs_scrub_metadata( xchk_experimental_warning(mp); + sc.ops = &meta_scrub_ops[sm->sm_type]; + sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); retry_op: /* Set up for the operation. */ - memset(&sc, 0, sizeof(sc)); - sc.mp = ip->i_mount; - sc.sm = sm; - sc.ops = &meta_scrub_ops[sm->sm_type]; - sc.try_harder = try_harder; - sc.sa.agno = NULLAGNUMBER; error = sc.ops->setup(&sc, ip); if (error) goto out_teardown; /* Scrub for errors. */ error = sc.ops->scrub(&sc); - if (!try_harder && error == -EDEADLOCK) { + if (!(sc.flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { /* * Scrubbers return -EDEADLOCK to mean 'try harder'. * Tear down everything we hold, then set up again with @@ -514,12 +525,15 @@ retry_op: error = xchk_teardown(&sc, ip, 0); if (error) goto out; - try_harder = true; + sc.flags |= XCHK_TRY_HARDER; goto retry_op; } else if (error) goto out_teardown; - if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) { + xchk_update_health(&sc); + + if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + !(sc.flags & XREP_ALREADY_FIXED)) { bool needs_fix; /* Let debug users force us into the repair routines. */ @@ -542,10 +556,13 @@ retry_op: * If it's broken, userspace wants us to fix it, and we haven't * already tried to fix it, then attempt a repair. */ - error = xrep_attempt(ip, &sc, &already_fixed); + error = xrep_attempt(ip, &sc); if (error == -EAGAIN) { - if (sc.try_harder) - try_harder = true; + /* + * Either the repair function succeeded or it couldn't + * get all the resources it needs; either way, we go + * back to the beginning and call the scrub function. + */ error = xchk_teardown(&sc, ip, 0); if (error) { xrep_failure(mp); diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 22f754fba8e5..ad1ceb44a628 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -62,13 +62,27 @@ struct xfs_scrub { struct xfs_inode *ip; void *buf; uint ilock_flags; - bool try_harder; - bool has_quotaofflock; + + /* See the XCHK/XREP state flags below. */ + unsigned int flags; + + /* + * The XFS_SICK_* flags that correspond to the metadata being scrubbed + * or repaired. We will use this mask to update the in-core fs health + * status with whatever we find. + */ + unsigned int sick_mask; /* State tracking for single-AG operations. */ struct xchk_ag sa; }; +/* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ +#define XCHK_TRY_HARDER (1 << 0) /* can't get resources, try again */ +#define XCHK_HAS_QUOTAOFFLOCK (1 << 1) /* we hold the quotaoff lock */ +#define XCHK_REAPING_DISABLED (1 << 2) /* background block reaping paused */ +#define XREP_ALREADY_FIXED (1 << 31) /* checking our repair work */ + /* Metadata scrubbers */ int xchk_tester(struct xfs_scrub *sc); int xchk_superblock(struct xfs_scrub *sc); @@ -113,6 +127,7 @@ xchk_quota(struct xfs_scrub *sc) return -ENOENT; } #endif +int xchk_fscounters(struct xfs_scrub *sc); /* cross-referencing helpers */ void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno, @@ -138,4 +153,12 @@ void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno, # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0) #endif +struct xchk_fscounters { + uint64_t icount; + uint64_t ifree; + uint64_t fdblocks; + unsigned long long icount_min; + unsigned long long icount_max; +}; + #endif /* __XFS_SCRUB_SCRUB_H__ */ diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 3c83e8b3b39c..3362bae28b46 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -50,6 +50,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTSUM); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA); TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); #define XFS_SCRUB_TYPE_STRINGS \ { XFS_SCRUB_TYPE_PROBE, "probe" }, \ @@ -75,7 +76,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA); { XFS_SCRUB_TYPE_RTSUM, "rtsummary" }, \ { XFS_SCRUB_TYPE_UQUOTA, "usrquota" }, \ { XFS_SCRUB_TYPE_GQUOTA, "grpquota" }, \ - { XFS_SCRUB_TYPE_PQUOTA, "prjquota" } + { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \ + { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" } DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, @@ -223,6 +225,7 @@ DEFINE_EVENT(xchk_block_error_class, name, \ void *ret_ip), \ TP_ARGS(sc, daddr, ret_ip)) +DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_fs_error); DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_error); DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_preen); @@ -590,6 +593,64 @@ TRACE_EVENT(xchk_iallocbt_check_cluster, __entry->cluster_ino) ) +TRACE_EVENT(xchk_fscounters_calc, + TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree, + uint64_t fdblocks, uint64_t delalloc), + TP_ARGS(mp, icount, ifree, fdblocks, delalloc), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int64_t, icount_sb) + __field(uint64_t, icount_calculated) + __field(int64_t, ifree_sb) + __field(uint64_t, ifree_calculated) + __field(int64_t, fdblocks_sb) + __field(uint64_t, fdblocks_calculated) + __field(uint64_t, delalloc) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->icount_sb = mp->m_sb.sb_icount; + __entry->icount_calculated = icount; + __entry->ifree_sb = mp->m_sb.sb_ifree; + __entry->ifree_calculated = ifree; + __entry->fdblocks_sb = mp->m_sb.sb_fdblocks; + __entry->fdblocks_calculated = fdblocks; + __entry->delalloc = delalloc; + ), + TP_printk("dev %d:%d icount %lld:%llu ifree %lld::%llu fdblocks %lld::%llu delalloc %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->icount_sb, + __entry->icount_calculated, + __entry->ifree_sb, + __entry->ifree_calculated, + __entry->fdblocks_sb, + __entry->fdblocks_calculated, + __entry->delalloc) +) + +TRACE_EVENT(xchk_fscounters_within_range, + TP_PROTO(struct xfs_mount *mp, uint64_t expected, int64_t curr_value, + int64_t old_value), + TP_ARGS(mp, expected, curr_value, old_value), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint64_t, expected) + __field(int64_t, curr_value) + __field(int64_t, old_value) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->expected = expected; + __entry->curr_value = curr_value; + __entry->old_value = old_value; + ), + TP_printk("dev %d:%d expected %llu curr_value %lld old_value %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->expected, + __entry->curr_value, + __entry->old_value) +) + /* repair tracepoints */ #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) |