xfs: create simplified inode walk function

Create a new iterator function to simplify walking inodes in an XFS filesystem. This new iterator will replace the existing open-coded walking that goes on in various places. Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Brian Foster <bfoster@redhat.com>
author: Darrick J. Wong <darrick.wong@oracle.com> 2019-07-02 09:39:38 -0700
committer: Darrick J. Wong <darrick.wong@oracle.com> 2019-07-02 09:40:05 -0700
commit: a211432c27ffa32d9978f6c18f5af0c3f8ad2ad1 (patch)
tree: e9eea5c3d498f02fc9f2ccbaa3aced4169b7e6e8 /fs/xfs/xfs_iwalk.c
parent: 5bb46e3e180d28c7ee6715f47c344be366a3fcc8 (diff)
download: linux-a211432c27ffa32d9978f6c18f5af0c3f8ad2ad1.tar.bz2
1 files changed, 389 insertions, 0 deletions
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
new file mode 100644
index 000000000000..304c41e6ed1d
--- /dev/null
+++ b/fs/xfs/xfs_iwalk.c
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_iwalk.h"
+#include "xfs_itable.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_health.h"
+#include "xfs_trans.h"
+
+/*
+ * Walking Inodes in the Filesystem
+ * ================================
+ *
+ * This iterator function walks a subset of filesystem inodes in increasing
+ * order from @startino until there are no more inodes.  For each allocated
+ * inode it finds, it calls a walk function with the relevant inode number and
+ * a pointer to caller-provided data.  The walk function can return the usual
+ * negative error code to stop the iteration; 0 to continue the iteration; or
+ * XFS_IWALK_ABORT to stop the iteration.  This return value is returned to the
+ * caller.
+ *
+ * Internally, we allow the walk function to do anything, which means that we
+ * cannot maintain the inobt cursor or our lock on the AGI buffer.  We
+ * therefore cache the inobt records in kernel memory and only call the walk
+ * function when our memory buffer is full.  @nr_recs is the number of records
+ * that we've cached, and @sz_recs is the size of our cache.
+ *
+ * It is the responsibility of the walk function to ensure it accesses
+ * allocated inodes, as the inobt records may be stale by the time they are
+ * acted upon.
+ */
+
+struct xfs_iwalk_ag {
+	struct xfs_mount		*mp;
+	struct xfs_trans		*tp;
+
+	/* Where do we start the traversal? */
+	xfs_ino_t			startino;
+
+	/* Array of inobt records we cache. */
+	struct xfs_inobt_rec_incore	*recs;
+
+	/* Number of entries allocated for the @recs array. */
+	unsigned int			sz_recs;
+
+	/* Number of entries in the @recs array that are in use. */
+	unsigned int			nr_recs;
+
+	/* Inode walk function and data pointer. */
+	xfs_iwalk_fn			iwalk_fn;
+	void				*data;
+};
+
+/* Allocate memory for a walk. */
+STATIC int
+xfs_iwalk_alloc(
+	struct xfs_iwalk_ag	*iwag)
+{
+	size_t			size;
+
+	ASSERT(iwag->recs == NULL);
+	iwag->nr_recs = 0;
+
+	/* Allocate a prefetch buffer for inobt records. */
+	size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
+	iwag->recs = kmem_alloc(size, KM_MAYFAIL);
+	if (iwag->recs == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/* Free memory we allocated for a walk. */
+STATIC void
+xfs_iwalk_free(
+	struct xfs_iwalk_ag	*iwag)
+{
+	kmem_free(iwag->recs);
+	iwag->recs = NULL;
+}
+
+/* For each inuse inode in each cached inobt record, call our function. */
+STATIC int
+xfs_iwalk_ag_recs(
+	struct xfs_iwalk_ag		*iwag)
+{
+	struct xfs_mount		*mp = iwag->mp;
+	struct xfs_trans		*tp = iwag->tp;
+	xfs_ino_t			ino;
+	unsigned int			i, j;
+	xfs_agnumber_t			agno;
+	int				error;
+
+	agno = XFS_INO_TO_AGNO(mp, iwag->startino);
+	for (i = 0; i < iwag->nr_recs; i++) {
+		struct xfs_inobt_rec_incore	*irec = &iwag->recs[i];
+
+		trace_xfs_iwalk_ag_rec(mp, agno, irec);
+
+		for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
+			/* Skip if this inode is free */
+			if (XFS_INOBT_MASK(j) & irec->ir_free)
+				continue;
+
+			/* Otherwise call our function. */
+			ino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino + j);
+			error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
+			if (error)
+				return error;
+		}
+	}
+
+	return 0;
+}
+
+/* Delete cursor and let go of AGI. */
+static inline void
+xfs_iwalk_del_inobt(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	**curpp,
+	struct xfs_buf		**agi_bpp,
+	int			error)
+{
+	if (*curpp) {
+		xfs_btree_del_cursor(*curpp, error);
+		*curpp = NULL;
+	}
+	if (*agi_bpp) {
+		xfs_trans_brelse(tp, *agi_bpp);
+		*agi_bpp = NULL;
+	}
+}
+
+/*
+ * Set ourselves up for walking inobt records starting from a given point in
+ * the filesystem.
+ *
+ * If caller passed in a nonzero start inode number, load the record from the
+ * inobt and make the record look like all the inodes before agino are free so
+ * that we skip them, and then move the cursor to the next inobt record.  This
+ * is how we support starting an iwalk in the middle of an inode chunk.
+ *
+ * If the caller passed in a start number of zero, move the cursor to the first
+ * inobt record.
+ *
+ * The caller is responsible for cleaning up the cursor and buffer pointer
+ * regardless of the error status.
+ */
+STATIC int
+xfs_iwalk_ag_start(
+	struct xfs_iwalk_ag	*iwag,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino,
+	struct xfs_btree_cur	**curpp,
+	struct xfs_buf		**agi_bpp,
+	int			*has_more)
+{
+	struct xfs_mount	*mp = iwag->mp;
+	struct xfs_trans	*tp = iwag->tp;
+	int			icount;
+	int			error;
+
+	/* Set up a fresh cursor and empty the inobt cache. */
+	iwag->nr_recs = 0;
+	error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
+	if (error)
+		return error;
+
+	/* Starting at the beginning of the AG?  That's easy! */
+	if (agino == 0)
+		return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more);
+
+	/*
+	 * Otherwise, we have to grab the inobt record where we left off, stuff
+	 * the record into our cache, and then see if there are more records.
+	 * We require a lookup cache of at least two elements so that we don't
+	 * have to deal with tearing down the cursor to walk the records.
+	 */
+	error = xfs_bulkstat_grab_ichunk(*curpp, agino - 1, &icount,
+			&iwag->recs[iwag->nr_recs]);
+	if (error)
+		return error;
+	if (icount)
+		iwag->nr_recs++;
+
+	/*
+	 * The prefetch calculation is supposed to give us a large enough inobt
+	 * record cache that grab_ichunk can stage a partial first record and
+	 * the loop body can cache a record without having to check for cache
+	 * space until after it reads an inobt record.
+	 */
+	ASSERT(iwag->nr_recs < iwag->sz_recs);
+
+	return xfs_btree_increment(*curpp, 0, has_more);
+}
+
+/*
+ * The inobt record cache is full, so preserve the inobt cursor state and
+ * run callbacks on the cached inobt records.  When we're done, restore the
+ * cursor state to wherever the cursor would have been had the cache not been
+ * full (and therefore we could've just incremented the cursor) if *@has_more
+ * is true.  On exit, *@has_more will indicate whether or not the caller should
+ * try for more inode records.
+ */
+STATIC int
+xfs_iwalk_run_callbacks(
+	struct xfs_iwalk_ag		*iwag,
+	xfs_agnumber_t			agno,
+	struct xfs_btree_cur		**curpp,
+	struct xfs_buf			**agi_bpp,
+	int				*has_more)
+{
+	struct xfs_mount		*mp = iwag->mp;
+	struct xfs_trans		*tp = iwag->tp;
+	struct xfs_inobt_rec_incore	*irec;
+	xfs_agino_t			restart;
+	int				error;
+
+	ASSERT(iwag->nr_recs > 0);
+
+	/* Delete cursor but remember the last record we cached... */
+	xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
+	irec = &iwag->recs[iwag->nr_recs - 1];
+	restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;
+
+	error = xfs_iwalk_ag_recs(iwag);
+	if (error)
+		return error;
+
+	/* ...empty the cache... */
+	iwag->nr_recs = 0;
+
+	if (!has_more)
+		return 0;
+
+	/* ...and recreate the cursor just past where we left off. */
+	error = xfs_inobt_cur(mp, tp, agno, XFS_BTNUM_INO, curpp, agi_bpp);
+	if (error)
+		return error;
+
+	return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
+}
+
+/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
+STATIC int
+xfs_iwalk_ag(
+	struct xfs_iwalk_ag		*iwag)
+{
+	struct xfs_mount		*mp = iwag->mp;
+	struct xfs_trans		*tp = iwag->tp;
+	struct xfs_buf			*agi_bp = NULL;
+	struct xfs_btree_cur		*cur = NULL;
+	xfs_agnumber_t			agno;
+	xfs_agino_t			agino;
+	int				has_more;
+	int				error = 0;
+
+	/* Set up our cursor at the right place in the inode btree. */
+	agno = XFS_INO_TO_AGNO(mp, iwag->startino);
+	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
+	error = xfs_iwalk_ag_start(iwag, agno, agino, &cur, &agi_bp, &has_more);
+
+	while (!error && has_more) {
+		struct xfs_inobt_rec_incore	*irec;
+
+		cond_resched();
+
+		/* Fetch the inobt record. */
+		irec = &iwag->recs[iwag->nr_recs];
+		error = xfs_inobt_get_rec(cur, irec, &has_more);
+		if (error || !has_more)
+			break;
+
+		/* No allocated inodes in this chunk; skip it. */
+		if (irec->ir_freecount == irec->ir_count) {
+			error = xfs_btree_increment(cur, 0, &has_more);
+			if (error)
+				break;
+			continue;
+		}
+
+		/*
+		 * Start readahead for this inode chunk in anticipation of
+		 * walking the inodes.
+		 */
+		xfs_bulkstat_ichunk_ra(mp, agno, irec);
+
+		/*
+		 * If there's space in the buffer for more records, increment
+		 * the btree cursor and grab more.
+		 */
+		if (++iwag->nr_recs < iwag->sz_recs) {
+			error = xfs_btree_increment(cur, 0, &has_more);
+			if (error || !has_more)
+				break;
+			continue;
+		}
+
+		/*
+		 * Otherwise, we need to save cursor state and run the callback
+		 * function on the cached records.  The run_callbacks function
+		 * is supposed to return a cursor pointing to the record where
+		 * we would be if we had been able to increment like above.
+		 */
+		ASSERT(has_more);
+		error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp,
+				&has_more);
+	}
+
+	if (iwag->nr_recs == 0 || error)
+		goto out;
+
+	/* Walk the unprocessed records in the cache. */
+	error = xfs_iwalk_run_callbacks(iwag, agno, &cur, &agi_bp, &has_more);
+
+out:
+	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
+	return error;
+}
+
+/*
+ * Given the number of inodes to prefetch, set the number of inobt records that
+ * we cache in memory, which controls the number of inodes we try to read
+ * ahead.
+ */
+static inline unsigned int
+xfs_iwalk_prefetch(
+	unsigned int		inode_records)
+{
+	return PAGE_SIZE * 4 / sizeof(struct xfs_inobt_rec_incore);
+}
+
+/*
+ * Walk all inodes in the filesystem starting from @startino.  The @iwalk_fn
+ * will be called for each allocated inode, being passed the inode's number and
+ * @data.  @max_prefetch controls how many inobt records' worth of inodes we
+ * try to readahead.
+ */
+int
+xfs_iwalk(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	xfs_ino_t		startino,
+	xfs_iwalk_fn		iwalk_fn,
+	unsigned int		inode_records,
+	void			*data)
+{
+	struct xfs_iwalk_ag	iwag = {
+		.mp		= mp,
+		.tp		= tp,
+		.iwalk_fn	= iwalk_fn,
+		.data		= data,
+		.startino	= startino,
+		.sz_recs	= xfs_iwalk_prefetch(inode_records),
+	};
+	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
+	int			error;
+
+	ASSERT(agno < mp->m_sb.sb_agcount);
+
+	error = xfs_iwalk_alloc(&iwag);
+	if (error)
+		return error;
+
+	for (; agno < mp->m_sb.sb_agcount; agno++) {
+		error = xfs_iwalk_ag(&iwag);
+		if (error)
+			break;
+		iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+	}
+
+	xfs_iwalk_free(&iwag);
+	return error;
+}
author	Darrick J. Wong <darrick.wong@oracle.com>	2019-07-02 09:39:38 -0700
committer	Darrick J. Wong <darrick.wong@oracle.com>	2019-07-02 09:40:05 -0700
commit	a211432c27ffa32d9978f6c18f5af0c3f8ad2ad1 (patch)
tree	e9eea5c3d498f02fc9f2ccbaa3aced4169b7e6e8 /fs/xfs/xfs_iwalk.c
parent	5bb46e3e180d28c7ee6715f47c344be366a3fcc8 (diff)
download	linux-a211432c27ffa32d9978f6c18f5af0c3f8ad2ad1.tar.bz2