summaryrefslogtreecommitdiffstats
path: root/fs/xfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c12
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h2
-rw-r--r--fs/xfs/libxfs/xfs_attr.c1642
-rw-r--r--fs/xfs/libxfs/xfs_attr.h198
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c64
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c37
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.h6
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c167
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h58
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c9
-rw-r--r--fs/xfs/libxfs/xfs_btree.c150
-rw-r--r--fs/xfs/libxfs/xfs_btree.h26
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c4
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h25
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h9
-rw-r--r--fs/xfs/libxfs/xfs_defer.c54
-rw-r--r--fs/xfs/libxfs/xfs_defer.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c8
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h8
-rw-r--r--fs/xfs/libxfs/xfs_format.h189
-rw-r--r--fs/xfs/libxfs/xfs_fs.h41
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c8
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c118
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c51
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h76
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h79
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h2
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c75
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h50
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c14
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h13
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c161
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h7
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c9
-rw-r--r--fs/xfs/libxfs/xfs_sb.c80
-rw-r--r--fs/xfs/libxfs/xfs_shared.h24
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c225
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h16
-rw-r--r--fs/xfs/libxfs/xfs_types.h11
-rw-r--r--fs/xfs/scrub/bmap.c26
-rw-r--r--fs/xfs/scrub/common.c2
-rw-r--r--fs/xfs/scrub/inode.c20
-rw-r--r--fs/xfs/scrub/rtbitmap.c9
-rw-r--r--fs/xfs/xfs_acl.c4
-rw-r--r--fs/xfs/xfs_acl.h8
-rw-r--r--fs/xfs/xfs_attr_item.c824
-rw-r--r--fs/xfs/xfs_attr_item.h46
-rw-r--r--fs/xfs/xfs_attr_list.c1
-rw-r--r--fs/xfs/xfs_bmap_item.c27
-rw-r--r--fs/xfs/xfs_bmap_util.c27
-rw-r--r--fs/xfs/xfs_buf_item.h24
-rw-r--r--fs/xfs/xfs_dquot.c18
-rw-r--r--fs/xfs/xfs_dquot.h8
-rw-r--r--fs/xfs/xfs_error.c9
-rw-r--r--fs/xfs/xfs_error.h20
-rw-r--r--fs/xfs/xfs_extfree_item.c23
-rw-r--r--fs/xfs/xfs_file.c24
-rw-r--r--fs/xfs/xfs_filestream.c7
-rw-r--r--fs/xfs/xfs_fsmap.c6
-rw-r--r--fs/xfs/xfs_fsops.c7
-rw-r--r--fs/xfs/xfs_globals.c1
-rw-r--r--fs/xfs/xfs_icache.c9
-rw-r--r--fs/xfs/xfs_icreate_item.c1
-rw-r--r--fs/xfs/xfs_inode.c80
-rw-r--r--fs/xfs/xfs_inode.h29
-rw-r--r--fs/xfs/xfs_inode_item.c48
-rw-r--r--fs/xfs/xfs_inode_item_recover.c145
-rw-r--r--fs/xfs/xfs_ioctl.c7
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c33
-rw-r--r--fs/xfs/xfs_iops.c4
-rw-r--r--fs/xfs/xfs_itable.c15
-rw-r--r--fs/xfs/xfs_itable.h5
-rw-r--r--fs/xfs/xfs_iwalk.h2
-rw-r--r--fs/xfs/xfs_log.c807
-rw-r--r--fs/xfs/xfs_log.h90
-rw-r--r--fs/xfs/xfs_log_cil.c391
-rw-r--r--fs/xfs/xfs_log_priv.h89
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_message.c58
-rw-r--r--fs/xfs/xfs_message.h55
-rw-r--r--fs/xfs/xfs_mount.c91
-rw-r--r--fs/xfs/xfs_mount.h32
-rw-r--r--fs/xfs/xfs_ondisk.h2
-rw-r--r--fs/xfs/xfs_qm.c9
-rw-r--r--fs/xfs/xfs_qm.h5
-rw-r--r--fs/xfs/xfs_qm_syscalls.c26
-rw-r--r--fs/xfs/xfs_quotaops.c8
-rw-r--r--fs/xfs/xfs_refcount_item.c25
-rw-r--r--fs/xfs/xfs_reflink.c100
-rw-r--r--fs/xfs/xfs_rmap_item.c25
-rw-r--r--fs/xfs/xfs_rtalloc.c41
-rw-r--r--fs/xfs/xfs_rtalloc.h9
-rw-r--r--fs/xfs/xfs_super.c18
-rw-r--r--fs/xfs/xfs_symlink.c5
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_sysfs.c24
-rw-r--r--fs/xfs/xfs_trace.h100
-rw-r--r--fs/xfs/xfs_trans.c52
-rw-r--r--fs/xfs/xfs_trans.h38
-rw-r--r--fs/xfs/xfs_trans_dquot.c4
-rw-r--r--fs/xfs/xfs_xattr.c2
104 files changed, 4689 insertions, 2675 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 04611a1068b4..b056cfc6398e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -102,6 +102,7 @@ xfs-y += xfs_log.o \
xfs_buf_item_recover.o \
xfs_dquot_item_recover.o \
xfs_extfree_item.o \
+ xfs_attr_item.o \
xfs_icreate_item.o \
xfs_inode_item.o \
xfs_inode_item_recover.o \
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index b52ed339727f..d3f2886fdc08 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2511,7 +2511,7 @@ __xfs_free_extent_later(
ASSERT(bno != NULLFSBLOCK);
ASSERT(len > 0);
- ASSERT(len <= MAXEXTLEN);
+ ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
ASSERT(!isnullstartblock(bno));
agno = XFS_FSB_TO_AGNO(mp, bno);
agbno = XFS_FSB_TO_AGBNO(mp, bno);
@@ -2777,7 +2777,7 @@ xfs_alloc_get_freelist(
xfs_agblock_t bno;
__be32 *agfl_bno;
int error;
- int logflags;
+ uint32_t logflags;
struct xfs_mount *mp = tp->t_mountp;
struct xfs_perag *pag;
@@ -2830,9 +2830,9 @@ xfs_alloc_get_freelist(
*/
void
xfs_alloc_log_agf(
- xfs_trans_t *tp, /* transaction pointer */
- struct xfs_buf *bp, /* buffer for a.g. freelist header */
- int fields) /* mask of fields to be logged (XFS_AGF_...) */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint32_t fields)
{
int first; /* first byte offset */
int last; /* last byte offset */
@@ -2902,7 +2902,7 @@ xfs_alloc_put_freelist(
struct xfs_perag *pag;
__be32 *blockp;
int error;
- int logflags;
+ uint32_t logflags;
__be32 *agfl_bno;
int startoff;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index d4c057b764f9..84ca09b2223f 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -121,7 +121,7 @@ void
xfs_alloc_log_agf(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *bp, /* buffer for a.g. freelist header */
- int fields);/* mask of fields to be logged (XFS_AGF_...) */
+ uint32_t fields);/* mask of fields to be logged (XFS_AGF_...) */
/*
* Interface for inode allocation to force the pag data to be initialized.
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 23523b802539..14ae0826bc15 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -24,6 +24,11 @@
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
+#include "xfs_attr_item.h"
+#include "xfs_log.h"
+
+struct kmem_cache *xfs_attri_cache;
+struct kmem_cache *xfs_attrd_cache;
/*
* xfs_attr.c
@@ -53,26 +58,22 @@ STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args, struct xfs_buf *bp);
*/
STATIC int xfs_attr_node_get(xfs_da_args_t *args);
STATIC void xfs_attr_restore_rmt_blk(struct xfs_da_args *args);
-STATIC int xfs_attr_node_addname(struct xfs_delattr_context *dac);
-STATIC int xfs_attr_node_addname_find_attr(struct xfs_delattr_context *dac);
-STATIC int xfs_attr_node_addname_clear_incomplete(
- struct xfs_delattr_context *dac);
+static int xfs_attr_node_try_addname(struct xfs_attr_item *attr);
+STATIC int xfs_attr_node_addname_find_attr(struct xfs_attr_item *attr);
+STATIC int xfs_attr_node_remove_attr(struct xfs_attr_item *attr);
STATIC int xfs_attr_node_hasname(xfs_da_args_t *args,
struct xfs_da_state **state);
-STATIC int xfs_attr_fillstate(xfs_da_state_t *state);
-STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
-STATIC int xfs_attr_set_iter(struct xfs_delattr_context *dac,
- struct xfs_buf **leaf_bp);
-STATIC int xfs_attr_node_removename(struct xfs_da_args *args,
- struct xfs_da_state *state);
int
xfs_inode_hasattr(
struct xfs_inode *ip)
{
- if (!XFS_IFORK_Q(ip) ||
- (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_afp->if_nextents == 0))
+ if (!XFS_IFORK_Q(ip))
+ return 0;
+ if (!ip->i_afp)
+ return 0;
+ if (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_afp->if_nextents == 0)
return 0;
return 1;
}
@@ -97,6 +98,123 @@ xfs_attr_is_leaf(
return imap.br_startoff == 0 && imap.br_blockcount == 1;
}
+/*
+ * XXX (dchinner): name path state saving and refilling is an optimisation to
+ * avoid needing to look up name entries after rolling transactions removing
+ * remote xattr blocks between the name entry lookup and name entry removal.
+ * This optimisation got sidelined when combining the set and remove state
+ * machines, but the code has been left in place because it is worthwhile to
+ * restore the optimisation once the combined state machine paths have settled.
+ *
+ * This comment is a public service announcement to remind Future Dave that he
+ * still needs to restore this code to working order.
+ */
+#if 0
+/*
+ * Fill in the disk block numbers in the state structure for the buffers
+ * that are attached to the state structure.
+ * This is done so that we can quickly reattach ourselves to those buffers
+ * after some set of transaction commits have released these buffers.
+ */
+static int
+xfs_attr_fillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level;
+
+ trace_xfs_attr_fillstate(state->args);
+
+ /*
+ * Roll down the "path" in the state structure, storing the on-disk
+ * block number for those buffers in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, storing the on-disk
+ * block number for those buffers in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->bp) {
+ blk->disk_blkno = xfs_buf_daddr(blk->bp);
+ blk->bp = NULL;
+ } else {
+ blk->disk_blkno = 0;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Reattach the buffers to the state structure based on the disk block
+ * numbers stored in the state structure.
+ * This is done after some set of transaction commits have released those
+ * buffers from our grip.
+ */
+static int
+xfs_attr_refillstate(xfs_da_state_t *state)
+{
+ xfs_da_state_path_t *path;
+ xfs_da_state_blk_t *blk;
+ int level, error;
+
+ trace_xfs_attr_refillstate(state->args);
+
+ /*
+ * Roll down the "path" in the state structure, re-reading each buffer
+ * from the on-disk block number stored for it in the "path".
+ */
+ path = &state->path;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ /*
+ * Roll down the "altpath" in the state structure, re-reading each buffer
+ * from the on-disk block number stored for it in the "altpath".
+ */
+ path = &state->altpath;
+ ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
+ for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
+ if (blk->disk_blkno) {
+ error = xfs_da3_node_read_mapped(state->args->trans,
+ state->args->dp, blk->disk_blkno,
+ &blk->bp, XFS_ATTR_FORK);
+ if (error)
+ return error;
+ } else {
+ blk->bp = NULL;
+ }
+ }
+
+ return 0;
+}
+#else
+static int xfs_attr_fillstate(xfs_da_state_t *state) { return 0; }
+#endif
+
/*========================================================================
* Overall external interface routines.
*========================================================================*/
@@ -166,7 +284,7 @@ xfs_attr_get(
/*
* Calculate how many blocks we need for the new attribute,
*/
-STATIC int
+int
xfs_attr_calc_size(
struct xfs_da_args *args,
int *local)
@@ -199,6 +317,33 @@ xfs_attr_calc_size(
return nblks;
}
+/* Initialize transaction reservation for attr operations */
+void
+xfs_init_attr_trans(
+ struct xfs_da_args *args,
+ struct xfs_trans_res *tres,
+ unsigned int *total)
+{
+ struct xfs_mount *mp = args->dp->i_mount;
+
+ if (args->value) {
+ tres->tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
+ M_RES(mp)->tr_attrsetrt.tr_logres *
+ args->total;
+ tres->tr_logcount = XFS_ATTRSET_LOG_COUNT;
+ tres->tr_logflags = XFS_TRANS_PERM_LOG_RES;
+ *total = args->total;
+ } else {
+ *tres = M_RES(mp)->tr_attrrm;
+ *total = XFS_ATTRRM_SPACE_RES(mp);
+ }
+}
+
+/*
+ * Add an attr to a shortform fork. If there is no space,
+ * xfs_attr_shortform_addname() will convert to leaf format and return
+ * -ENOSPC.
+ */
STATIC int
xfs_attr_try_sf_addname(
struct xfs_inode *dp,
@@ -230,411 +375,470 @@ xfs_attr_try_sf_addname(
return error;
}
-/*
- * Check to see if the attr should be upgraded from non-existent or shortform to
- * single-leaf-block attribute list.
- */
-static inline bool
-xfs_attr_is_shortform(
- struct xfs_inode *ip)
+static int
+xfs_attr_sf_addname(
+ struct xfs_attr_item *attr)
{
- return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
- (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
- ip->i_afp->if_nextents == 0);
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ int error = 0;
+
+ error = xfs_attr_try_sf_addname(dp, args);
+ if (error != -ENOSPC) {
+ ASSERT(!error || error == -EEXIST);
+ attr->xattri_dela_state = XFS_DAS_DONE;
+ goto out;
+ }
+
+ /*
+ * It won't fit in the shortform, transform to a leaf block. GROT:
+ * another possible req'mt for a double-split btree op.
+ */
+ error = xfs_attr_shortform_to_leaf(args, &attr->xattri_leaf_bp);
+ if (error)
+ return error;
+
+ /*
+ * Prevent the leaf buffer from being unlocked so that a concurrent AIL
+ * push cannot grab the half-baked leaf buffer and run into problems
+ * with the write verifier.
+ */
+ xfs_trans_bhold(args->trans, attr->xattri_leaf_bp);
+ attr->xattri_dela_state = XFS_DAS_LEAF_ADD;
+out:
+ trace_xfs_attr_sf_addname_return(attr->xattri_dela_state, args->dp);
+ return error;
}
/*
- * Checks to see if a delayed attribute transaction should be rolled. If so,
- * transaction is finished or rolled as needed.
+ * Handle the state change on completion of a multi-state attr operation.
+ *
+ * If the XFS_DA_OP_REPLACE flag is set, this means the operation was the first
+ * modification in an attr replace operation and we still have to do the second
+ * state, indicated by @replace_state.
+ *
+ * We consume the XFS_DA_OP_REPLACE flag so that when we are called again on
+ * completion of the second half of the attr replace operation we correctly
+ * signal that it is done.
*/
-STATIC int
-xfs_attr_trans_roll(
- struct xfs_delattr_context *dac)
+static enum xfs_delattr_state
+xfs_attr_complete_op(
+ struct xfs_attr_item *attr,
+ enum xfs_delattr_state replace_state)
{
- struct xfs_da_args *args = dac->da_args;
- int error;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
+
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ if (do_replace) {
+ args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ return replace_state;
+ }
+ return XFS_DAS_DONE;
+}
+
+static int
+xfs_attr_leaf_addname(
+ struct xfs_attr_item *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ ASSERT(xfs_attr_is_leaf(args->dp));
+
+ /*
+ * Use the leaf buffer we may already hold locked as a result of
+ * a sf-to-leaf conversion. The held buffer is no longer valid
+ * after this call, regardless of the result.
+ */
+ error = xfs_attr_leaf_try_add(args, attr->xattri_leaf_bp);
+ attr->xattri_leaf_bp = NULL;
+
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
+ return error;
- if (dac->flags & XFS_DAC_DEFER_FINISH) {
/*
- * The caller wants us to finish all the deferred ops so that we
- * avoid pinning the log tail with a large number of deferred
- * ops.
+ * We're not in leaf format anymore, so roll the transaction and
+ * retry the add to the newly allocated node block.
*/
- dac->flags &= ~XFS_DAC_DEFER_FINISH;
- error = xfs_defer_finish(&args->trans);
- } else
- error = xfs_trans_roll_inode(&args->trans, args->dp);
+ attr->xattri_dela_state = XFS_DAS_NODE_ADD;
+ goto out;
+ }
+ if (error)
+ return error;
+ /*
+ * We need to commit and roll if we need to allocate remote xattr blocks
+ * or perform more xattr manipulations. Otherwise there is nothing more
+ * to do and we can return success.
+ */
+ if (args->rmtblkno)
+ attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT;
+ else
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_LEAF_REPLACE);
+out:
+ trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp);
return error;
}
/*
- * Set the attribute specified in @args.
+ * Add an entry to a node format attr tree.
+ *
+ * Note that we might still have a leaf here - xfs_attr_is_leaf() cannot tell
+ * the difference between leaf + remote attr blocks and a node format tree,
+ * so we may still end up having to convert from leaf to node format here.
*/
-int
-xfs_attr_set_args(
- struct xfs_da_args *args)
+static int
+xfs_attr_node_addname(
+ struct xfs_attr_item *attr)
{
- struct xfs_buf *leaf_bp = NULL;
- int error = 0;
- struct xfs_delattr_context dac = {
- .da_args = args,
- };
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
- do {
- error = xfs_attr_set_iter(&dac, &leaf_bp);
- if (error != -EAGAIN)
- break;
+ ASSERT(!attr->xattri_leaf_bp);
+
+ error = xfs_attr_node_addname_find_attr(attr);
+ if (error)
+ return error;
- error = xfs_attr_trans_roll(&dac);
- if (error) {
- if (leaf_bp)
- xfs_trans_brelse(args->trans, leaf_bp);
+ error = xfs_attr_node_try_addname(attr);
+ if (error == -ENOSPC) {
+ error = xfs_attr3_leaf_to_node(args);
+ if (error)
return error;
- }
- } while (true);
+ /*
+ * No state change, we really are in node form now
+ * but we need the transaction rolled to continue.
+ */
+ goto out;
+ }
+ if (error)
+ return error;
+ if (args->rmtblkno)
+ attr->xattri_dela_state = XFS_DAS_NODE_SET_RMT;
+ else
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ XFS_DAS_NODE_REPLACE);
+out:
+ trace_xfs_attr_node_addname_return(attr->xattri_dela_state, args->dp);
return error;
}
-STATIC int
-xfs_attr_sf_addname(
- struct xfs_delattr_context *dac,
- struct xfs_buf **leaf_bp)
+static int
+xfs_attr_rmtval_alloc(
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_inode *dp = args->dp;
+ struct xfs_da_args *args = attr->xattri_da_args;
int error = 0;
/*
- * Try to add the attr to the attribute list in the inode.
+ * If there was an out-of-line value, allocate the blocks we
+ * identified for its storage and copy the value. This is done
+ * after we create the attribute so that we don't overflow the
+ * maximum size of a transaction and/or hit a deadlock.
*/
- error = xfs_attr_try_sf_addname(dp, args);
+ if (attr->xattri_blkcnt > 0) {
+ error = xfs_attr_rmtval_set_blk(attr);
+ if (error)
+ return error;
+ /* Roll the transaction only if there is more to allocate. */
+ if (attr->xattri_blkcnt > 0)
+ goto out;
+ }
- /* Should only be 0, -EEXIST or -ENOSPC */
- if (error != -ENOSPC)
+ error = xfs_attr_rmtval_set_value(args);
+ if (error)
return error;
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ ++attr->xattri_dela_state);
/*
- * It won't fit in the shortform, transform to a leaf block. GROT:
- * another possible req'mt for a double-split btree op.
+ * If we are not doing a rename, we've finished the operation but still
+ * have to clear the incomplete flag protecting the new attr from
+ * exposing partially initialised state if we crash during creation.
*/
- error = xfs_attr_shortform_to_leaf(args, leaf_bp);
- if (error)
- return error;
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
+ error = xfs_attr3_leaf_clearflag(args);
+out:
+ trace_xfs_attr_rmtval_alloc(attr->xattri_dela_state, args->dp);
+ return error;
+}
+
+/*
+ * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
+ * for later deletion of the entry.
+ */
+static int
+xfs_attr_leaf_mark_incomplete(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ int error;
/*
- * Prevent the leaf buffer from being unlocked so that a concurrent AIL
- * push cannot grab the half-baked leaf buffer and run into problems
- * with the write verifier.
+ * Fill in disk block numbers in the state structure
+ * so that we can get the buffers back after we commit
+ * several transactions in the following calls.
*/
- xfs_trans_bhold(args->trans, *leaf_bp);
+ error = xfs_attr_fillstate(state);
+ if (error)
+ return error;
/*
- * We're still in XFS_DAS_UNINIT state here. We've converted
- * the attr fork to leaf format and will restart with the leaf
- * add.
+ * Mark the attribute as INCOMPLETE
*/
- trace_xfs_attr_sf_addname_return(XFS_DAS_UNINIT, args->dp);
- dac->flags |= XFS_DAC_DEFER_FINISH;
- return -EAGAIN;
+ return xfs_attr3_leaf_setflag(args);
}
/*
- * Set the attribute specified in @args.
- * This routine is meant to function as a delayed operation, and may return
- * -EAGAIN when the transaction needs to be rolled. Calling functions will need
- * to handle this, and recall the function until a successful error code is
- * returned.
+ * Initial setup for xfs_attr_node_removename. Make sure the attr is there and
+ * the blocks are valid. Attr keys with remote blocks will be marked
+ * incomplete.
*/
-int
-xfs_attr_set_iter(
- struct xfs_delattr_context *dac,
- struct xfs_buf **leaf_bp)
+static
+int xfs_attr_node_removename_setup(
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_inode *dp = args->dp;
- struct xfs_buf *bp = NULL;
- int forkoff, error = 0;
-
- /* State machine switch */
- switch (dac->dela_state) {
- case XFS_DAS_UNINIT:
- /*
- * If the fork is shortform, attempt to add the attr. If there
- * is no space, this converts to leaf format and returns
- * -EAGAIN with the leaf buffer held across the roll. The caller
- * will deal with a transaction roll error, but otherwise
- * release the hold once we return with a clean transaction.
- */
- if (xfs_attr_is_shortform(dp))
- return xfs_attr_sf_addname(dac, leaf_bp);
- if (*leaf_bp != NULL) {
- xfs_trans_bhold_release(args->trans, *leaf_bp);
- *leaf_bp = NULL;
- }
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_da_state **state = &attr->xattri_da_state;
+ int error;
- if (xfs_attr_is_leaf(dp)) {
- error = xfs_attr_leaf_try_add(args, *leaf_bp);
- if (error == -ENOSPC) {
- error = xfs_attr3_leaf_to_node(args);
- if (error)
- return error;
-
- /*
- * Finish any deferred work items and roll the
- * transaction once more. The goal here is to
- * call node_addname with the inode and
- * transaction in the same state (inode locked
- * and joined, transaction clean) no matter how
- * we got to this step.
- *
- * At this point, we are still in
- * XFS_DAS_UNINIT, but when we come back, we'll
- * be a node, so we'll fall down into the node
- * handling code below
- */
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
- } else if (error) {
- return error;
- }
+ error = xfs_attr_node_hasname(args, state);
+ if (error != -EEXIST)
+ goto out;
+ error = 0;
- dac->dela_state = XFS_DAS_FOUND_LBLK;
- } else {
- error = xfs_attr_node_addname_find_attr(dac);
- if (error)
- return error;
+ ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL);
+ ASSERT((*state)->path.blk[(*state)->path.active - 1].magic ==
+ XFS_ATTR_LEAF_MAGIC);
- error = xfs_attr_node_addname(dac);
- if (error)
- return error;
+ error = xfs_attr_leaf_mark_incomplete(args, *state);
+ if (error)
+ goto out;
+ if (args->rmtblkno > 0)
+ error = xfs_attr_rmtval_invalidate(args);
+out:
+ if (error)
+ xfs_da_state_free(*state);
- dac->dela_state = XFS_DAS_FOUND_NBLK;
- }
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- case XFS_DAS_FOUND_LBLK:
- /*
- * If there was an out-of-line value, allocate the blocks we
- * identified for its storage and copy the value. This is done
- * after we create the attribute so that we don't overflow the
- * maximum size of a transaction and/or hit a deadlock.
- */
+ return error;
+}
- /* Open coded xfs_attr_rmtval_set without trans handling */
- if ((dac->flags & XFS_DAC_LEAF_ADDNAME_INIT) == 0) {
- dac->flags |= XFS_DAC_LEAF_ADDNAME_INIT;
- if (args->rmtblkno > 0) {
- error = xfs_attr_rmtval_find_space(dac);
- if (error)
- return error;
- }
- }
+/*
+ * Remove the original attr we have just replaced. This is dependent on the
+ * original lookup and insert placing the old attr in args->blkno/args->index
+ * and the new attr in args->blkno2/args->index2.
+ */
+static int
+xfs_attr_leaf_remove_attr(
+ struct xfs_attr_item *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp = NULL;
+ int forkoff;
+ int error;
- /*
- * Repeat allocating remote blocks for the attr value until
- * blkcnt drops to zero.
- */
- if (dac->blkcnt > 0) {
- error = xfs_attr_rmtval_set_blk(dac);
- if (error)
- return error;
- trace_xfs_attr_set_iter_return(dac->dela_state,
- args->dp);
- return -EAGAIN;
- }
+ error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
+ &bp);
+ if (error)
+ return error;
- error = xfs_attr_rmtval_set_value(args);
- if (error)
- return error;
+ xfs_attr3_leaf_remove(bp, args);
- /*
- * If this is not a rename, clear the incomplete flag and we're
- * done.
- */
- if (!(args->op_flags & XFS_DA_OP_RENAME)) {
- if (args->rmtblkno > 0)
- error = xfs_attr3_leaf_clearflag(args);
- return error;
- }
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff)
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
- /*
- * If this is an atomic rename operation, we must "flip" the
- * incomplete flags on the "new" and "old" attribute/value pairs
- * so that one disappears and one appears atomically. Then we
- * must remove the "old" attribute/value pair.
- *
- * In a separate transaction, set the incomplete flag on the
- * "old" attr and clear the incomplete flag on the "new" attr.
- */
- error = xfs_attr3_leaf_flipflags(args);
- if (error)
- return error;
- /*
- * Commit the flag value change and start the next trans in
- * series.
- */
- dac->dela_state = XFS_DAS_FLIP_LFLAG;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- case XFS_DAS_FLIP_LFLAG:
- /*
- * Dismantle the "old" attribute/value pair by removing a
- * "remote" value (if it exists).
- */
- xfs_attr_restore_rmt_blk(args);
- error = xfs_attr_rmtval_invalidate(args);
- if (error)
- return error;
+ return error;
+}
- fallthrough;
- case XFS_DAS_RM_LBLK:
- /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
- dac->dela_state = XFS_DAS_RM_LBLK;
- if (args->rmtblkno) {
- error = xfs_attr_rmtval_remove(dac);
- if (error == -EAGAIN)
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
- if (error)
- return error;
+/*
+ * Shrink an attribute from leaf to shortform. Used by the node format remove
+ * path when the node format collapses to a single block and so we have to check
+ * if it can be collapsed further.
+ */
+static int
+xfs_attr_leaf_shrink(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ struct xfs_buf *bp;
+ int forkoff;
+ int error;
- dac->dela_state = XFS_DAS_RD_LEAF;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- }
+ if (!xfs_attr_is_leaf(dp))
+ return 0;
- fallthrough;
- case XFS_DAS_RD_LEAF:
- /*
- * This is the last step for leaf format. Read the block with
- * the old attr, remove the old attr, check for shortform
- * conversion and return.
- */
- error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno,
- &bp);
- if (error)
- return error;
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
- xfs_attr3_leaf_remove(bp, args);
+ forkoff = xfs_attr_shortform_allfit(bp, dp);
+ if (forkoff) {
+ error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
+ /* bp is gone due to xfs_da_shrink_inode */
+ } else {
+ xfs_trans_brelse(args->trans, bp);
+ }
- forkoff = xfs_attr_shortform_allfit(bp, dp);
- if (forkoff)
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
- /* bp is gone due to xfs_da_shrink_inode */
+ return error;
+}
- return error;
+/*
+ * Run the attribute operation specified in @attr.
+ *
+ * This routine is meant to function as a delayed operation and will set the
+ * state to XFS_DAS_DONE when the operation is complete. Calling functions will
+ * need to handle this, and recall the function until either an error or
+ * XFS_DAS_DONE is detected.
+ */
+int
+xfs_attr_set_iter(
+ struct xfs_attr_item *attr)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error = 0;
- case XFS_DAS_FOUND_NBLK:
- /*
- * Find space for remote blocks and fall into the allocation
- * state.
- */
- if (args->rmtblkno > 0) {
- error = xfs_attr_rmtval_find_space(dac);
- if (error)
- return error;
+ /* State machine switch */
+next_state:
+ switch (attr->xattri_dela_state) {
+ case XFS_DAS_UNINIT:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ case XFS_DAS_SF_ADD:
+ return xfs_attr_sf_addname(attr);
+ case XFS_DAS_LEAF_ADD:
+ return xfs_attr_leaf_addname(attr);
+ case XFS_DAS_NODE_ADD:
+ return xfs_attr_node_addname(attr);
+
+ case XFS_DAS_SF_REMOVE:
+ error = xfs_attr_sf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ case XFS_DAS_LEAF_REMOVE:
+ error = xfs_attr_leaf_removename(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+ case XFS_DAS_NODE_REMOVE:
+ error = xfs_attr_node_removename_setup(attr);
+ if (error == -ENOATTR &&
+ (args->op_flags & XFS_DA_OP_RECOVERY)) {
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ error = 0;
+ break;
}
+ if (error)
+ return error;
+ attr->xattri_dela_state = XFS_DAS_NODE_REMOVE_RMT;
+ if (args->rmtblkno == 0)
+ attr->xattri_dela_state++;
+ break;
+ case XFS_DAS_LEAF_SET_RMT:
+ case XFS_DAS_NODE_SET_RMT:
+ error = xfs_attr_rmtval_find_space(attr);
+ if (error)
+ return error;
+ attr->xattri_dela_state++;
fallthrough;
- case XFS_DAS_ALLOC_NODE:
- /*
- * If there was an out-of-line value, allocate the blocks we
- * identified for its storage and copy the value. This is done
- * after we create the attribute so that we don't overflow the
- * maximum size of a transaction and/or hit a deadlock.
- */
- dac->dela_state = XFS_DAS_ALLOC_NODE;
- if (args->rmtblkno > 0) {
- if (dac->blkcnt > 0) {
- error = xfs_attr_rmtval_set_blk(dac);
- if (error)
- return error;
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
- }
-
- error = xfs_attr_rmtval_set_value(args);
- if (error)
- return error;
- }
- /*
- * If this was not a rename, clear the incomplete flag and we're
- * done.
- */
- if (!(args->op_flags & XFS_DA_OP_RENAME)) {
- if (args->rmtblkno > 0)
- error = xfs_attr3_leaf_clearflag(args);
- goto out;
- }
+ case XFS_DAS_LEAF_ALLOC_RMT:
+ case XFS_DAS_NODE_ALLOC_RMT:
+ error = xfs_attr_rmtval_alloc(attr);
+ if (error)
+ return error;
+ if (attr->xattri_dela_state == XFS_DAS_DONE)
+ break;
+ goto next_state;
+ case XFS_DAS_LEAF_REPLACE:
+ case XFS_DAS_NODE_REPLACE:
/*
- * If this is an atomic rename operation, we must "flip" the
- * incomplete flags on the "new" and "old" attribute/value pairs
- * so that one disappears and one appears atomically. Then we
- * must remove the "old" attribute/value pair.
- *
- * In a separate transaction, set the incomplete flag on the
- * "old" attr and clear the incomplete flag on the "new" attr.
+ * We must "flip" the incomplete flags on the "new" and "old"
+ * attribute/value pairs so that one disappears and one appears
+ * atomically.
*/
error = xfs_attr3_leaf_flipflags(args);
if (error)
- goto out;
+ return error;
/*
- * Commit the flag value change and start the next trans in
- * series
+ * We must commit the flag value change now to make it atomic
+ * and then we can start the next trans in series at REMOVE_OLD.
*/
- dac->dela_state = XFS_DAS_FLIP_NFLAG;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
+ attr->xattri_dela_state++;
+ break;
- case XFS_DAS_FLIP_NFLAG:
+ case XFS_DAS_LEAF_REMOVE_OLD:
+ case XFS_DAS_NODE_REMOVE_OLD:
/*
- * Dismantle the "old" attribute/value pair by removing a
- * "remote" value (if it exists).
+ * If we have a remote attr, start the process of removing it
+ * by invalidating any cached buffers.
+ *
+ * If we don't have a remote attr, we skip the remote block
+ * removal state altogether with a second state increment.
*/
xfs_attr_restore_rmt_blk(args);
-
- error = xfs_attr_rmtval_invalidate(args);
- if (error)
- return error;
-
- fallthrough;
- case XFS_DAS_RM_NBLK:
- /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */
- dac->dela_state = XFS_DAS_RM_NBLK;
if (args->rmtblkno) {
- error = xfs_attr_rmtval_remove(dac);
- if (error == -EAGAIN)
- trace_xfs_attr_set_iter_return(
- dac->dela_state, args->dp);
-
+ error = xfs_attr_rmtval_invalidate(args);
if (error)
return error;
+ } else {
+ attr->xattri_dela_state++;
+ }
+
+ attr->xattri_dela_state++;
+ goto next_state;
- dac->dela_state = XFS_DAS_CLR_FLAG;
- trace_xfs_attr_set_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
+ case XFS_DAS_LEAF_REMOVE_RMT:
+ case XFS_DAS_NODE_REMOVE_RMT:
+ error = xfs_attr_rmtval_remove(attr);
+ if (error == -EAGAIN) {
+ error = 0;
+ break;
}
+ if (error)
+ return error;
- fallthrough;
- case XFS_DAS_CLR_FLAG:
/*
- * The last state for node format. Look up the old attr and
- * remove it.
+ * We've finished removing the remote attr blocks, so commit the
+ * transaction and move on to removing the attr name from the
+ * leaf/node block. Removing the attr might require a full
+ * transaction reservation for btree block freeing, so we
+ * can't do that in the same transaction where we removed the
+ * remote attr blocks.
*/
- error = xfs_attr_node_addname_clear_incomplete(dac);
+ attr->xattri_dela_state++;
+ break;
+
+ case XFS_DAS_LEAF_REMOVE_ATTR:
+ error = xfs_attr_leaf_remove_attr(attr);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
+ break;
+
+ case XFS_DAS_NODE_REMOVE_ATTR:
+ error = xfs_attr_node_remove_attr(attr);
+ if (!error)
+ error = xfs_attr_leaf_shrink(args);
+ attr->xattri_dela_state = xfs_attr_complete_op(attr,
+ xfs_attr_init_add_state(args));
break;
default:
ASSERT(0);
break;
}
-out:
+
+ trace_xfs_attr_set_iter_return(attr->xattri_dela_state, args->dp);
return error;
}
@@ -668,30 +872,79 @@ xfs_attr_lookup(
return xfs_attr_node_hasname(args, NULL);
}
-/*
- * Remove the attribute specified in @args.
- */
-int
-xfs_attr_remove_args(
+static int
+xfs_attr_item_init(
+ struct xfs_da_args *args,
+ unsigned int op_flags, /* op flag (set or remove) */
+ struct xfs_attr_item **attr) /* new xfs_attr_item */
+{
+
+ struct xfs_attr_item *new;
+
+ new = kmem_zalloc(sizeof(struct xfs_attr_item), KM_NOFS);
+ new->xattri_op_flags = op_flags;
+ new->xattri_da_args = args;
+
+ *attr = new;
+ return 0;
+}
+
+/* Sets an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_add(
struct xfs_da_args *args)
{
- int error;
- struct xfs_delattr_context dac = {
- .da_args = args,
- };
+ struct xfs_attr_item *new;
+ int error = 0;
- do {
- error = xfs_attr_remove_iter(&dac);
- if (error != -EAGAIN)
- break;
+ error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_SET, &new);
+ if (error)
+ return error;
- error = xfs_attr_trans_roll(&dac);
- if (error)
- return error;
+ new->xattri_dela_state = xfs_attr_init_add_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
- } while (true);
+ return 0;
+}
- return error;
+/* Replaces an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_replace(
+ struct xfs_da_args *args)
+{
+ struct xfs_attr_item *new;
+ int error = 0;
+
+ error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REPLACE, &new);
+ if (error)
+ return error;
+
+ new->xattri_dela_state = xfs_attr_init_replace_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp);
+
+ return 0;
+}
+
+/* Removes an attribute for an inode as a deferred operation */
+static int
+xfs_attr_defer_remove(
+ struct xfs_da_args *args)
+{
+
+ struct xfs_attr_item *new;
+ int error;
+
+ error = xfs_attr_item_init(args, XFS_ATTR_OP_FLAGS_REMOVE, &new);
+ if (error)
+ return error;
+
+ new->xattri_dela_state = xfs_attr_init_remove_state(args);
+ xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp);
+
+ return 0;
}
/*
@@ -709,6 +962,7 @@ xfs_attr_set(
int error, local;
int rmt_blks = 0;
unsigned int total;
+ int delayed = xfs_has_larp(mp);
if (xfs_is_shutdown(dp->i_mount))
return -EIO;
@@ -730,8 +984,6 @@ xfs_attr_set(
if (args->value) {
XFS_STATS_INC(mp, xs_attr_set);
-
- args->op_flags |= XFS_DA_OP_ADDNAME;
args->total = xfs_attr_calc_size(args, &local);
/*
@@ -748,61 +1000,68 @@ xfs_attr_set(
return error;
}
- tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres +
- M_RES(mp)->tr_attrsetrt.tr_logres *
- args->total;
- tres.tr_logcount = XFS_ATTRSET_LOG_COUNT;
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- total = args->total;
-
if (!local)
rmt_blks = xfs_attr3_rmt_blocks(mp, args->valuelen);
} else {
XFS_STATS_INC(mp, xs_attr_remove);
-
- tres = M_RES(mp)->tr_attrrm;
- total = XFS_ATTRRM_SPACE_RES(mp);
rmt_blks = xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX);
}
+ if (delayed) {
+ error = xfs_attr_use_log_assist(mp);
+ if (error)
+ return error;
+ }
+
/*
* Root fork attributes can use reserved data blocks for this
* operation if necessary
*/
+ xfs_init_attr_trans(args, &tres, &total);
error = xfs_trans_alloc_inode(dp, &tres, total, 0, rsvd, &args->trans);
if (error)
- return error;
+ goto drop_incompat;
if (args->value || xfs_inode_hasattr(dp)) {
error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(args->trans, dp,
+ XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
if (error)
goto out_trans_cancel;
}
error = xfs_attr_lookup(args);
- if (args->value) {
- if (error == -EEXIST && (args->attr_flags & XATTR_CREATE))
- goto out_trans_cancel;
- if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- goto out_trans_cancel;
- if (error != -ENOATTR && error != -EEXIST)
+ switch (error) {
+ case -EEXIST:
+ /* if no value, we are performing a remove operation */
+ if (!args->value) {
+ error = xfs_attr_defer_remove(args);
+ break;
+ }
+ /* Pure create fails if the attr already exists */
+ if (args->attr_flags & XATTR_CREATE)
goto out_trans_cancel;
- error = xfs_attr_set_args(args);
- if (error)
- goto out_trans_cancel;
- /* shortform attribute has already been committed */
- if (!args->trans)
- goto out_unlock;
- } else {
- if (error != -EEXIST)
+ error = xfs_attr_defer_replace(args);
+ break;
+ case -ENOATTR:
+ /* Can't remove what isn't there. */
+ if (!args->value)
goto out_trans_cancel;
- error = xfs_attr_remove_args(args);
- if (error)
+ /* Pure replace fails if no existing attr to replace. */
+ if (args->attr_flags & XATTR_REPLACE)
goto out_trans_cancel;
+
+ error = xfs_attr_defer_add(args);
+ break;
+ default:
+ goto out_trans_cancel;
}
+ if (error)
+ goto out_trans_cancel;
/*
* If this is a synchronous mount, make sure that the
@@ -821,6 +1080,9 @@ xfs_attr_set(
error = xfs_trans_commit(args->trans);
out_unlock:
xfs_iunlock(dp, XFS_ILOCK_EXCL);
+drop_incompat:
+ if (delayed)
+ xlog_drop_incompat_feat(mp->m_log);
return error;
out_trans_cancel:
@@ -829,6 +1091,40 @@ out_trans_cancel:
goto out_unlock;
}
+int __init
+xfs_attri_init_cache(void)
+{
+ xfs_attri_cache = kmem_cache_create("xfs_attri",
+ sizeof(struct xfs_attri_log_item),
+ 0, 0, NULL);
+
+ return xfs_attri_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_attri_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_attri_cache);
+ xfs_attri_cache = NULL;
+}
+
+int __init
+xfs_attrd_init_cache(void)
+{
+ xfs_attrd_cache = kmem_cache_create("xfs_attrd",
+ sizeof(struct xfs_attrd_log_item),
+ 0, 0, NULL);
+
+ return xfs_attrd_cache != NULL ? 0 : -ENOMEM;
+}
+
+void
+xfs_attrd_destroy_cache(void)
+{
+ kmem_cache_destroy(xfs_attrd_cache);
+ xfs_attrd_cache = NULL;
+}
+
/*========================================================================
* External routines when attribute list is inside the inode
*========================================================================*/
@@ -845,28 +1141,41 @@ static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
* Add a name to the shortform attribute list structure
* This is the external routine.
*/
-STATIC int
-xfs_attr_shortform_addname(xfs_da_args_t *args)
+static int
+xfs_attr_shortform_addname(
+ struct xfs_da_args *args)
{
- int newsize, forkoff, retval;
+ int newsize, forkoff;
+ int error;
trace_xfs_attr_sf_addname(args);
- retval = xfs_attr_shortform_lookup(args);
- if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- return retval;
- if (retval == -EEXIST) {
- if (args->attr_flags & XATTR_CREATE)
- return retval;
- retval = xfs_attr_sf_removename(args);
- if (retval)
- return retval;
+ error = xfs_attr_shortform_lookup(args);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ return error;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
+ return error;
+
+ error = xfs_attr_sf_removename(args);
+ if (error)
+ return error;
+
/*
- * Since we have removed the old attr, clear ATTR_REPLACE so
- * that the leaf format add routine won't trip over the attr
- * not being around.
+ * Since we have removed the old attr, clear XFS_DA_OP_REPLACE
+ * so that if the new attr doesn't fit in shortform format, the
+ * leaf format add routine won't trip over the attr not being
+ * around.
*/
- args->attr_flags &= ~XATTR_REPLACE;
+ args->op_flags &= ~XFS_DA_OP_REPLACE;
+ break;
+ case 0:
+ break;
+ default:
+ return error;
}
if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
@@ -889,8 +1198,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
* External routines when attribute list is one block
*========================================================================*/
-/* Store info about a remote block */
-STATIC void
+/* Save the current remote block info and clear the current pointers. */
+static void
xfs_attr_save_rmt_blk(
struct xfs_da_args *args)
{
@@ -899,10 +1208,13 @@ xfs_attr_save_rmt_blk(
args->rmtblkno2 = args->rmtblkno;
args->rmtblkcnt2 = args->rmtblkcnt;
args->rmtvaluelen2 = args->rmtvaluelen;
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
}
/* Set stored info about a remote block */
-STATIC void
+static void
xfs_attr_restore_rmt_blk(
struct xfs_da_args *args)
{
@@ -928,45 +1240,54 @@ xfs_attr_leaf_try_add(
struct xfs_da_args *args,
struct xfs_buf *bp)
{
- int retval;
+ int error;
/*
- * Look up the given attribute in the leaf block. Figure out if
- * the given flags produce an error or call for an atomic rename.
+ * If the caller provided a buffer to us, it is locked and held in
+ * the transaction because it just did a shortform to leaf conversion.
+ * Hence we don't need to read it again. Otherwise read in the leaf
+ * buffer.
*/
- retval = xfs_attr_leaf_hasname(args, &bp);
- if (retval != -ENOATTR && retval != -EEXIST)
- return retval;
- if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- goto out_brelse;
- if (retval == -EEXIST) {
- if (args->attr_flags & XATTR_CREATE)
+ if (bp) {
+ xfs_trans_bhold_release(args->trans, bp);
+ } else {
+ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Look up the xattr name to set the insertion point for the new xattr.
+ */
+ error = xfs_attr3_leaf_lookup_int(bp, args);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ goto out_brelse;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
goto out_brelse;
trace_xfs_attr_leaf_replace(args);
-
- /* save the attribute state for later removal*/
- args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
- xfs_attr_save_rmt_blk(args);
-
/*
- * clear the remote attr state now that it is saved so that the
- * values reflect the state of the attribute we are about to
+ * Save the existing remote attr state so that the current
+ * values reflect the state of the new attribute we are about to
* add, not the attribute we just found and will remove later.
*/
- args->rmtblkno = 0;
- args->rmtblkcnt = 0;
- args->rmtvaluelen = 0;
+ xfs_attr_save_rmt_blk(args);
+ break;
+ case 0:
+ break;
+ default:
+ goto out_brelse;
}
- /*
- * Add the attribute to the leaf block
- */
return xfs_attr3_leaf_add(bp, args);
out_brelse:
xfs_trans_brelse(args->trans, bp);
- return retval;
+ return error;
}
/*
@@ -1012,9 +1333,10 @@ xfs_attr_leaf_removename(
dp = args->dp;
error = xfs_attr_leaf_hasname(args, &bp);
-
if (error == -ENOATTR) {
xfs_trans_brelse(args->trans, bp);
+ if (args->op_flags & XFS_DA_OP_RECOVERY)
+ return 0;
return error;
} else if (error != -EEXIST)
return error;
@@ -1098,46 +1420,45 @@ xfs_attr_node_hasname(
STATIC int
xfs_attr_node_addname_find_attr(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
- int retval;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
/*
* Search to see if name already exists, and get back a pointer
* to where it should go.
*/
- retval = xfs_attr_node_hasname(args, &dac->da_state);
- if (retval != -ENOATTR && retval != -EEXIST)
- goto error;
-
- if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE))
- goto error;
- if (retval == -EEXIST) {
- if (args->attr_flags & XATTR_CREATE)
+ error = xfs_attr_node_hasname(args, &attr->xattri_da_state);
+ switch (error) {
+ case -ENOATTR:
+ if (args->op_flags & XFS_DA_OP_REPLACE)
+ goto error;
+ break;
+ case -EEXIST:
+ if (!(args->op_flags & XFS_DA_OP_REPLACE))
goto error;
- trace_xfs_attr_node_replace(args);
-
- /* save the attribute state for later removal*/
- args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
- xfs_attr_save_rmt_blk(args);
+ trace_xfs_attr_node_replace(args);
/*
- * clear the remote attr state now that it is saved so that the
- * values reflect the state of the attribute we are about to
+ * Save the existing remote attr state so that the current
+ * values reflect the state of the new attribute we are about to
* add, not the attribute we just found and will remove later.
*/
- args->rmtblkno = 0;
- args->rmtblkcnt = 0;
- args->rmtvaluelen = 0;
+ xfs_attr_save_rmt_blk(args);
+ break;
+ case 0:
+ break;
+ default:
+ goto error;
}
return 0;
error:
- if (dac->da_state)
- xfs_da_state_free(dac->da_state);
- return retval;
+ if (attr->xattri_da_state)
+ xfs_da_state_free(attr->xattri_da_state);
+ return error;
}
/*
@@ -1146,21 +1467,13 @@ error:
* This will involve walking down the Btree, and may involve splitting
* leaf nodes and even splitting intermediate nodes up to and including
* the root node (a special case of an intermediate node).
- *
- * "Remote" attribute values confuse the issue and atomic rename operations
- * add a whole extra layer of confusion on top of that.
- *
- * This routine is meant to function as a delayed operation, and may return
- * -EAGAIN when the transaction needs to be rolled. Calling functions will need
- * to handle this, and recall the function until a successful error code is
- *returned.
*/
-STATIC int
-xfs_attr_node_addname(
- struct xfs_delattr_context *dac)
+static int
+xfs_attr_node_try_addname(
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_da_state *state = dac->da_state;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_da_state *state = attr->xattri_da_state;
struct xfs_da_state_blk *blk;
int error;
@@ -1175,25 +1488,9 @@ xfs_attr_node_addname(
/*
* Its really a single leaf node, but it had
* out-of-line values so it looked like it *might*
- * have been a b-tree.
+ * have been a b-tree. Let the caller deal with this.
*/
- xfs_da_state_free(state);
- state = NULL;
- error = xfs_attr3_leaf_to_node(args);
- if (error)
- goto out;
-
- /*
- * Now that we have converted the leaf to a node, we can
- * roll the transaction, and try xfs_attr3_leaf_add
- * again on re-entry. No need to set dela_state to do
- * this. dela_state is still unset by this function at
- * this point.
- */
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_node_addname_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
+ goto out;
}
/*
@@ -1205,7 +1502,6 @@ xfs_attr_node_addname(
error = xfs_da3_split(state);
if (error)
goto out;
- dac->flags |= XFS_DAC_DEFER_FINISH;
} else {
/*
* Addition succeeded, update Btree hashvals.
@@ -1214,24 +1510,42 @@ xfs_attr_node_addname(
}
out:
- if (state)
- xfs_da_state_free(state);
+ xfs_da_state_free(state);
return error;
}
+static int
+xfs_attr_node_removename(
+ struct xfs_da_args *args,
+ struct xfs_da_state *state)
+{
+ struct xfs_da_state_blk *blk;
+ int retval;
-STATIC int
-xfs_attr_node_addname_clear_incomplete(
- struct xfs_delattr_context *dac)
+ /*
+ * Remove the name and update the hashvals in the tree.
+ */
+ blk = &state->path.blk[state->path.active-1];
+ ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
+ retval = xfs_attr3_leaf_remove(blk->bp, args);
+ xfs_da3_fixhashpath(state, &state->path);
+
+ return retval;
+}
+
+static int
+xfs_attr_node_remove_attr(
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
+ struct xfs_da_args *args = attr->xattri_da_args;
struct xfs_da_state *state = NULL;
int retval = 0;
int error = 0;
/*
- * Re-find the "old" attribute entry after any split ops. The INCOMPLETE
- * flag means that we will find the "old" attr, not the "new" one.
+ * The attr we are removing has already been marked incomplete, so
+ * we need to set the filter appropriately to re-find the "old"
+ * attribute entry after any split ops.
*/
args->attr_filter |= XFS_ATTR_INCOMPLETE;
state = xfs_da_state_alloc(args);
@@ -1261,362 +1575,6 @@ out:
}
/*
- * Shrink an attribute from leaf to shortform
- */
-STATIC int
-xfs_attr_node_shrink(
- struct xfs_da_args *args,
- struct xfs_da_state *state)
-{
- struct xfs_inode *dp = args->dp;
- int error, forkoff;
- struct xfs_buf *bp;
-
- /*
- * Have to get rid of the copy of this dabuf in the state.
- */
- ASSERT(state->path.active == 1);
- ASSERT(state->path.blk[0].bp);
- state->path.blk[0].bp = NULL;
-
- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp);
- if (error)
- return error;
-
- forkoff = xfs_attr_shortform_allfit(bp, dp);
- if (forkoff) {
- error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
- /* bp is gone due to xfs_da_shrink_inode */
- } else
- xfs_trans_brelse(args->trans, bp);
-
- return error;
-}
-
-/*
- * Mark an attribute entry INCOMPLETE and save pointers to the relevant buffers
- * for later deletion of the entry.
- */
-STATIC int
-xfs_attr_leaf_mark_incomplete(
- struct xfs_da_args *args,
- struct xfs_da_state *state)
-{
- int error;
-
- /*
- * Fill in disk block numbers in the state structure
- * so that we can get the buffers back after we commit
- * several transactions in the following calls.
- */
- error = xfs_attr_fillstate(state);
- if (error)
- return error;
-
- /*
- * Mark the attribute as INCOMPLETE
- */
- return xfs_attr3_leaf_setflag(args);
-}
-
-/*
- * Initial setup for xfs_attr_node_removename. Make sure the attr is there and
- * the blocks are valid. Attr keys with remote blocks will be marked
- * incomplete.
- */
-STATIC
-int xfs_attr_node_removename_setup(
- struct xfs_delattr_context *dac)
-{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_da_state **state = &dac->da_state;
- int error;
-
- error = xfs_attr_node_hasname(args, state);
- if (error != -EEXIST)
- goto out;
- error = 0;
-
- ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL);
- ASSERT((*state)->path.blk[(*state)->path.active - 1].magic ==
- XFS_ATTR_LEAF_MAGIC);
-
- if (args->rmtblkno > 0) {
- error = xfs_attr_leaf_mark_incomplete(args, *state);
- if (error)
- goto out;
-
- error = xfs_attr_rmtval_invalidate(args);
- }
-out:
- if (error)
- xfs_da_state_free(*state);
-
- return error;
-}
-
-STATIC int
-xfs_attr_node_removename(
- struct xfs_da_args *args,
- struct xfs_da_state *state)
-{
- struct xfs_da_state_blk *blk;
- int retval;
-
- /*
- * Remove the name and update the hashvals in the tree.
- */
- blk = &state->path.blk[state->path.active-1];
- ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
- retval = xfs_attr3_leaf_remove(blk->bp, args);
- xfs_da3_fixhashpath(state, &state->path);
-
- return retval;
-}
-
-/*
- * Remove the attribute specified in @args.
- *
- * This will involve walking down the Btree, and may involve joining
- * leaf nodes and even joining intermediate nodes up to and including
- * the root node (a special case of an intermediate node).
- *
- * This routine is meant to function as either an in-line or delayed operation,
- * and may return -EAGAIN when the transaction needs to be rolled. Calling
- * functions will need to handle this, and call the function until a
- * successful error code is returned.
- */
-int
-xfs_attr_remove_iter(
- struct xfs_delattr_context *dac)
-{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_da_state *state = dac->da_state;
- int retval, error = 0;
- struct xfs_inode *dp = args->dp;
-
- trace_xfs_attr_node_removename(args);
-
- switch (dac->dela_state) {
- case XFS_DAS_UNINIT:
- if (!xfs_inode_hasattr(dp))
- return -ENOATTR;
-
- /*
- * Shortform or leaf formats don't require transaction rolls and
- * thus state transitions. Call the right helper and return.
- */
- if (dp->i_afp->if_format == XFS_DINODE_FMT_LOCAL)
- return xfs_attr_sf_removename(args);
-
- if (xfs_attr_is_leaf(dp))
- return xfs_attr_leaf_removename(args);
-
- /*
- * Node format may require transaction rolls. Set up the
- * state context and fall into the state machine.
- */
- if (!dac->da_state) {
- error = xfs_attr_node_removename_setup(dac);
- if (error)
- return error;
- state = dac->da_state;
- }
-
- fallthrough;
- case XFS_DAS_RMTBLK:
- dac->dela_state = XFS_DAS_RMTBLK;
-
- /*
- * If there is an out-of-line value, de-allocate the blocks.
- * This is done before we remove the attribute so that we don't
- * overflow the maximum size of a transaction and/or hit a
- * deadlock.
- */
- if (args->rmtblkno > 0) {
- /*
- * May return -EAGAIN. Roll and repeat until all remote
- * blocks are removed.
- */
- error = xfs_attr_rmtval_remove(dac);
- if (error == -EAGAIN) {
- trace_xfs_attr_remove_iter_return(
- dac->dela_state, args->dp);
- return error;
- } else if (error) {
- goto out;
- }
-
- /*
- * Refill the state structure with buffers (the prior
- * calls released our buffers) and close out this
- * transaction before proceeding.
- */
- ASSERT(args->rmtblkno == 0);
- error = xfs_attr_refillstate(state);
- if (error)
- goto out;
- dac->dela_state = XFS_DAS_RM_NAME;
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_remove_iter_return(dac->dela_state, args->dp);
- return -EAGAIN;
- }
-
- fallthrough;
- case XFS_DAS_RM_NAME:
- /*
- * If we came here fresh from a transaction roll, reattach all
- * the buffers to the current transaction.
- */
- if (dac->dela_state == XFS_DAS_RM_NAME) {
- error = xfs_attr_refillstate(state);
- if (error)
- goto out;
- }
-
- retval = xfs_attr_node_removename(args, state);
-
- /*
- * Check to see if the tree needs to be collapsed. If so, roll
- * the transacton and fall into the shrink state.
- */
- if (retval && (state->path.active > 1)) {
- error = xfs_da3_join(state);
- if (error)
- goto out;
-
- dac->flags |= XFS_DAC_DEFER_FINISH;
- dac->dela_state = XFS_DAS_RM_SHRINK;
- trace_xfs_attr_remove_iter_return(
- dac->dela_state, args->dp);
- return -EAGAIN;
- }
-
- fallthrough;
- case XFS_DAS_RM_SHRINK:
- /*
- * If the result is small enough, push it all into the inode.
- * This is our final state so it's safe to return a dirty
- * transaction.
- */
- if (xfs_attr_is_leaf(dp))
- error = xfs_attr_node_shrink(args, state);
- ASSERT(error != -EAGAIN);
- break;
- default:
- ASSERT(0);
- error = -EINVAL;
- goto out;
- }
-out:
- if (state)
- xfs_da_state_free(state);
- return error;
-}
-
-/*
- * Fill in the disk block numbers in the state structure for the buffers
- * that are attached to the state structure.
- * This is done so that we can quickly reattach ourselves to those buffers
- * after some set of transaction commits have released these buffers.
- */
-STATIC int
-xfs_attr_fillstate(xfs_da_state_t *state)
-{
- xfs_da_state_path_t *path;
- xfs_da_state_blk_t *blk;
- int level;
-
- trace_xfs_attr_fillstate(state->args);
-
- /*
- * Roll down the "path" in the state structure, storing the on-disk
- * block number for those buffers in the "path".
- */
- path = &state->path;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->bp) {
- blk->disk_blkno = xfs_buf_daddr(blk->bp);
- blk->bp = NULL;
- } else {
- blk->disk_blkno = 0;
- }
- }
-
- /*
- * Roll down the "altpath" in the state structure, storing the on-disk
- * block number for those buffers in the "altpath".
- */
- path = &state->altpath;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->bp) {
- blk->disk_blkno = xfs_buf_daddr(blk->bp);
- blk->bp = NULL;
- } else {
- blk->disk_blkno = 0;
- }
- }
-
- return 0;
-}
-
-/*
- * Reattach the buffers to the state structure based on the disk block
- * numbers stored in the state structure.
- * This is done after some set of transaction commits have released those
- * buffers from our grip.
- */
-STATIC int
-xfs_attr_refillstate(xfs_da_state_t *state)
-{
- xfs_da_state_path_t *path;
- xfs_da_state_blk_t *blk;
- int level, error;
-
- trace_xfs_attr_refillstate(state->args);
-
- /*
- * Roll down the "path" in the state structure, storing the on-disk
- * block number for those buffers in the "path".
- */
- path = &state->path;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->disk_blkno) {
- error = xfs_da3_node_read_mapped(state->args->trans,
- state->args->dp, blk->disk_blkno,
- &blk->bp, XFS_ATTR_FORK);
- if (error)
- return error;
- } else {
- blk->bp = NULL;
- }
- }
-
- /*
- * Roll down the "altpath" in the state structure, storing the on-disk
- * block number for those buffers in the "altpath".
- */
- path = &state->altpath;
- ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
- for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
- if (blk->disk_blkno) {
- error = xfs_da3_node_read_mapped(state->args->trans,
- state->args->dp, blk->disk_blkno,
- &blk->bp, XFS_ATTR_FORK);
- if (error)
- return error;
- } else {
- blk->bp = NULL;
- }
- }
-
- return 0;
-}
-
-/*
* Retrieve the attribute data from a node attribute list.
*
* This routine gets called for any attribute fork that has more than one
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 5e71f719bdd5..1af7abe29eef 100644
--- a/fs/xfs/libxfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -28,6 +28,15 @@ struct xfs_attr_list_context;
*/
#define ATTR_MAX_VALUELEN (64*1024) /* max length of a value */
+static inline bool xfs_has_larp(struct xfs_mount *mp)
+{
+#ifdef DEBUG
+ return xfs_globals.larp;
+#else
+ return false;
+#endif
+}
+
/*
* Kernel-internal version of the attrlist cursor.
*/
@@ -425,7 +434,7 @@ struct xfs_attr_list_context {
*/
/*
- * Enum values for xfs_delattr_context.da_state
+ * Enum values for xfs_attr_item.xattri_dela_state
*
* These values are used by delayed attribute operations to keep track of where
* they were before they returned -EAGAIN. A return code of -EAGAIN signals the
@@ -434,46 +443,105 @@ struct xfs_attr_list_context {
* to where it was and resume executing where it left off.
*/
enum xfs_delattr_state {
- XFS_DAS_UNINIT = 0, /* No state has been set yet */
- XFS_DAS_RMTBLK, /* Removing remote blks */
- XFS_DAS_RM_NAME, /* Remove attr name */
- XFS_DAS_RM_SHRINK, /* We are shrinking the tree */
- XFS_DAS_FOUND_LBLK, /* We found leaf blk for attr */
- XFS_DAS_FOUND_NBLK, /* We found node blk for attr */
- XFS_DAS_FLIP_LFLAG, /* Flipped leaf INCOMPLETE attr flag */
- XFS_DAS_RM_LBLK, /* A rename is removing leaf blocks */
- XFS_DAS_RD_LEAF, /* Read in the new leaf */
- XFS_DAS_ALLOC_NODE, /* We are allocating node blocks */
- XFS_DAS_FLIP_NFLAG, /* Flipped node INCOMPLETE attr flag */
- XFS_DAS_RM_NBLK, /* A rename is removing node blocks */
- XFS_DAS_CLR_FLAG, /* Clear incomplete flag */
+ XFS_DAS_UNINIT = 0, /* No state has been set yet */
+
+ /*
+ * Initial sequence states. The replace setup code relies on the
+ * ADD and REMOVE states for a specific format to be sequential so
+ * that we can transform the initial operation to be performed
+ * according to the xfs_has_larp() state easily.
+ */
+ XFS_DAS_SF_ADD, /* Initial sf add state */
+ XFS_DAS_SF_REMOVE, /* Initial sf replace/remove state */
+
+ XFS_DAS_LEAF_ADD, /* Initial leaf add state */
+ XFS_DAS_LEAF_REMOVE, /* Initial leaf replace/remove state */
+
+ XFS_DAS_NODE_ADD, /* Initial node add state */
+ XFS_DAS_NODE_REMOVE, /* Initial node replace/remove state */
+
+ /* Leaf state set/replace/remove sequence */
+ XFS_DAS_LEAF_SET_RMT, /* set a remote xattr from a leaf */
+ XFS_DAS_LEAF_ALLOC_RMT, /* We are allocating remote blocks */
+ XFS_DAS_LEAF_REPLACE, /* Perform replace ops on a leaf */
+ XFS_DAS_LEAF_REMOVE_OLD, /* Start removing old attr from leaf */
+ XFS_DAS_LEAF_REMOVE_RMT, /* A rename is removing remote blocks */
+ XFS_DAS_LEAF_REMOVE_ATTR, /* Remove the old attr from a leaf */
+
+ /* Node state sequence, must match leaf state above */
+ XFS_DAS_NODE_SET_RMT, /* set a remote xattr from a node */
+ XFS_DAS_NODE_ALLOC_RMT, /* We are allocating remote blocks */
+ XFS_DAS_NODE_REPLACE, /* Perform replace ops on a node */
+ XFS_DAS_NODE_REMOVE_OLD, /* Start removing old attr from node */
+ XFS_DAS_NODE_REMOVE_RMT, /* A rename is removing remote blocks */
+ XFS_DAS_NODE_REMOVE_ATTR, /* Remove the old attr from a node */
+
+ XFS_DAS_DONE, /* finished operation */
};
+#define XFS_DAS_STRINGS \
+ { XFS_DAS_UNINIT, "XFS_DAS_UNINIT" }, \
+ { XFS_DAS_SF_ADD, "XFS_DAS_SF_ADD" }, \
+ { XFS_DAS_SF_REMOVE, "XFS_DAS_SF_REMOVE" }, \
+ { XFS_DAS_LEAF_ADD, "XFS_DAS_LEAF_ADD" }, \
+ { XFS_DAS_LEAF_REMOVE, "XFS_DAS_LEAF_REMOVE" }, \
+ { XFS_DAS_NODE_ADD, "XFS_DAS_NODE_ADD" }, \
+ { XFS_DAS_NODE_REMOVE, "XFS_DAS_NODE_REMOVE" }, \
+ { XFS_DAS_LEAF_SET_RMT, "XFS_DAS_LEAF_SET_RMT" }, \
+ { XFS_DAS_LEAF_ALLOC_RMT, "XFS_DAS_LEAF_ALLOC_RMT" }, \
+ { XFS_DAS_LEAF_REPLACE, "XFS_DAS_LEAF_REPLACE" }, \
+ { XFS_DAS_LEAF_REMOVE_OLD, "XFS_DAS_LEAF_REMOVE_OLD" }, \
+ { XFS_DAS_LEAF_REMOVE_RMT, "XFS_DAS_LEAF_REMOVE_RMT" }, \
+ { XFS_DAS_LEAF_REMOVE_ATTR, "XFS_DAS_LEAF_REMOVE_ATTR" }, \
+ { XFS_DAS_NODE_SET_RMT, "XFS_DAS_NODE_SET_RMT" }, \
+ { XFS_DAS_NODE_ALLOC_RMT, "XFS_DAS_NODE_ALLOC_RMT" }, \
+ { XFS_DAS_NODE_REPLACE, "XFS_DAS_NODE_REPLACE" }, \
+ { XFS_DAS_NODE_REMOVE_OLD, "XFS_DAS_NODE_REMOVE_OLD" }, \
+ { XFS_DAS_NODE_REMOVE_RMT, "XFS_DAS_NODE_REMOVE_RMT" }, \
+ { XFS_DAS_NODE_REMOVE_ATTR, "XFS_DAS_NODE_REMOVE_ATTR" }, \
+ { XFS_DAS_DONE, "XFS_DAS_DONE" }
+
/*
- * Defines for xfs_delattr_context.flags
+ * Defines for xfs_attr_item.xattri_flags
*/
-#define XFS_DAC_DEFER_FINISH 0x01 /* finish the transaction */
-#define XFS_DAC_LEAF_ADDNAME_INIT 0x02 /* xfs_attr_leaf_addname init*/
+#define XFS_DAC_LEAF_ADDNAME_INIT 0x01 /* xfs_attr_leaf_addname init*/
/*
* Context used for keeping track of delayed attribute operations
*/
-struct xfs_delattr_context {
- struct xfs_da_args *da_args;
+struct xfs_attr_item {
+ struct xfs_da_args *xattri_da_args;
+
+ /*
+ * Used by xfs_attr_set to hold a leaf buffer across a transaction roll
+ */
+ struct xfs_buf *xattri_leaf_bp;
/* Used in xfs_attr_rmtval_set_blk to roll through allocating blocks */
- struct xfs_bmbt_irec map;
- xfs_dablk_t lblkno;
- int blkcnt;
+ struct xfs_bmbt_irec xattri_map;
+ xfs_dablk_t xattri_lblkno;
+ int xattri_blkcnt;
/* Used in xfs_attr_node_removename to roll through removing blocks */
- struct xfs_da_state *da_state;
+ struct xfs_da_state *xattri_da_state;
/* Used to keep track of current state of delayed operation */
- unsigned int flags;
- enum xfs_delattr_state dela_state;
+ unsigned int xattri_flags;
+ enum xfs_delattr_state xattri_dela_state;
+
+ /*
+ * Attr operation being performed - XFS_ATTR_OP_FLAGS_*
+ */
+ unsigned int xattri_op_flags;
+
+ /*
+ * used to log this item to an intent containing a list of attrs to
+ * commit later
+ */
+ struct list_head xattri_list;
};
+
/*========================================================================
* Function prototypes for the kernel.
*========================================================================*/
@@ -489,11 +557,81 @@ bool xfs_attr_is_leaf(struct xfs_inode *ip);
int xfs_attr_get_ilocked(struct xfs_da_args *args);
int xfs_attr_get(struct xfs_da_args *args);
int xfs_attr_set(struct xfs_da_args *args);
-int xfs_attr_set_args(struct xfs_da_args *args);
-int xfs_attr_remove_args(struct xfs_da_args *args);
-int xfs_attr_remove_iter(struct xfs_delattr_context *dac);
+int xfs_attr_set_iter(struct xfs_attr_item *attr);
+int xfs_attr_remove_iter(struct xfs_attr_item *attr);
bool xfs_attr_namecheck(const void *name, size_t length);
-void xfs_delattr_context_init(struct xfs_delattr_context *dac,
- struct xfs_da_args *args);
+int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
+void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
+ unsigned int *total);
+
+extern struct kmem_cache *xfs_attri_cache;
+extern struct kmem_cache *xfs_attrd_cache;
+
+int __init xfs_attri_init_cache(void);
+void xfs_attri_destroy_cache(void);
+int __init xfs_attrd_init_cache(void);
+void xfs_attrd_destroy_cache(void);
+
+/*
+ * Check to see if the attr should be upgraded from non-existent or shortform to
+ * single-leaf-block attribute list.
+ */
+static inline bool
+xfs_attr_is_shortform(
+ struct xfs_inode *ip)
+{
+ return ip->i_afp->if_format == XFS_DINODE_FMT_LOCAL ||
+ (ip->i_afp->if_format == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_afp->if_nextents == 0);
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_add_state(struct xfs_da_args *args)
+{
+ /*
+ * When called from the completion of an attr remove to determine the
+ * next state, the attribute fork may be null. This can only occur
+ * on a pure remove, but we grab the next state before we check if a
+ * replace operation is being performed. If we are called from any other
+ * context, i_afp is guaranteed to exist. Hence if the attr fork is
+ * null, we were called from a pure remove operation and so we are done.
+ */
+ if (!args->dp->i_afp)
+ return XFS_DAS_DONE;
+
+ args->op_flags |= XFS_DA_OP_ADDNAME;
+ if (xfs_attr_is_shortform(args->dp))
+ return XFS_DAS_SF_ADD;
+ if (xfs_attr_is_leaf(args->dp))
+ return XFS_DAS_LEAF_ADD;
+ return XFS_DAS_NODE_ADD;
+}
+
+static inline enum xfs_delattr_state
+xfs_attr_init_remove_state(struct xfs_da_args *args)
+{
+ args->op_flags |= XFS_DA_OP_REMOVE;
+ if (xfs_attr_is_shortform(args->dp))
+ return XFS_DAS_SF_REMOVE;
+ if (xfs_attr_is_leaf(args->dp))
+ return XFS_DAS_LEAF_REMOVE;
+ return XFS_DAS_NODE_REMOVE;
+}
+
+/*
+ * If we are logging the attributes, then we have to start with removal of the
+ * old attribute so that there is always consistent state that we can recover
+ * from if the system goes down part way through. We always log the new attr
+ * value, so even when we remove the attr first we still have the information in
+ * the log to finish the replace operation atomically.
+ */
+static inline enum xfs_delattr_state
+xfs_attr_init_replace_state(struct xfs_da_args *args)
+{
+ args->op_flags |= XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE;
+ if (xfs_has_larp(args->dp->i_mount))
+ return xfs_attr_init_remove_state(args);
+ return xfs_attr_init_add_state(args);
+}
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 014daa8c542d..15a990409463 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -28,6 +28,7 @@
#include "xfs_dir2.h"
#include "xfs_log.h"
#include "xfs_ag.h"
+#include "xfs_errortag.h"
/*
@@ -310,6 +311,15 @@ xfs_attr3_leaf_verify(
return fa;
/*
+ * Empty leaf blocks should never occur; they imply the existence of a
+ * software bug that needs fixing. xfs_repair also flags them as a
+ * corruption that needs fixing, so we should never let these go to
+ * disk.
+ */
+ if (ichdr.count == 0)
+ return __this_address;
+
+ /*
* firstused is the block offset of the first name info structure.
* Make sure it doesn't go off the block or crash into the header.
*/
@@ -445,6 +455,14 @@ xfs_attr3_leaf_read(
* Namespace helper routines
*========================================================================*/
+/*
+ * If we are in log recovery, then we want the lookup to ignore the INCOMPLETE
+ * flag on disk - if there's an incomplete attr then recovery needs to tear it
+ * down. If there's no incomplete attr, then recovery needs to tear that attr
+ * down to replace it with the attr that has been logged. In this case, the
+ * INCOMPLETE flag will not be set in args->attr_filter, but rather
+ * XFS_DA_OP_RECOVERY will be set in args->op_flags.
+ */
static bool
xfs_attr_match(
struct xfs_da_args *args,
@@ -452,14 +470,18 @@ xfs_attr_match(
unsigned char *name,
int flags)
{
+
if (args->namelen != namelen)
return false;
if (memcmp(args->name, name, namelen) != 0)
return false;
- /*
- * If we are looking for incomplete entries, show only those, else only
- * show complete entries.
- */
+
+ /* Recovery ignores the INCOMPLETE flag. */
+ if ((args->op_flags & XFS_DA_OP_RECOVERY) &&
+ args->attr_filter == (flags & XFS_ATTR_NSP_ONDISK_MASK))
+ return true;
+
+ /* All remaining matches need to be filtered by INCOMPLETE state. */
if (args->attr_filter !=
(flags & (XFS_ATTR_NSP_ONDISK_MASK | XFS_ATTR_INCOMPLETE)))
return false;
@@ -798,6 +820,14 @@ xfs_attr_sf_removename(
sf = (struct xfs_attr_shortform *)dp->i_afp->if_u1.if_data;
error = xfs_attr_sf_findname(args, &sfe, &base);
+
+ /*
+ * If we are recovering an operation, finding nothing to
+ * remove is not an error - it just means there was nothing
+ * to clean up.
+ */
+ if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY))
+ return 0;
if (error != -EEXIST)
return error;
size = xfs_attr_sf_entsize(sfe);
@@ -818,7 +848,7 @@ xfs_attr_sf_removename(
totsize -= size;
if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
- !(args->op_flags & XFS_DA_OP_ADDNAME)) {
+ !(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) {
xfs_attr_fork_remove(dp, args->trans);
} else {
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
@@ -1127,9 +1157,17 @@ xfs_attr3_leaf_to_shortform(
goto out;
if (forkoff == -1) {
- ASSERT(xfs_has_attr2(dp->i_mount));
- ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
- xfs_attr_fork_remove(dp, args->trans);
+ /*
+ * Don't remove the attr fork if this operation is the first
+ * part of an attr replace operation. We're going to add a new
+ * attr immediately, so we need to keep the attr fork around in
+ * this case.
+ */
+ if (!(args->op_flags & XFS_DA_OP_REPLACE)) {
+ ASSERT(xfs_has_attr2(dp->i_mount));
+ ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE);
+ xfs_attr_fork_remove(dp, args->trans);
+ }
goto out;
}
@@ -1189,6 +1227,11 @@ xfs_attr3_leaf_to_node(
trace_xfs_attr_leaf_to_node(args);
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_ATTR_LEAF_TO_NODE)) {
+ error = -EIO;
+ goto out;
+ }
+
error = xfs_da_grow_inode(args, &blkno);
if (error)
goto out;
@@ -1486,8 +1529,9 @@ xfs_attr3_leaf_add_work(
entry->flags = args->attr_filter;
if (tmp)
entry->flags |= XFS_ATTR_LOCAL;
- if (args->op_flags & XFS_DA_OP_RENAME) {
- entry->flags |= XFS_ATTR_INCOMPLETE;
+ if (args->op_flags & XFS_DA_OP_REPLACE) {
+ if (!xfs_has_larp(mp))
+ entry->flags |= XFS_ATTR_INCOMPLETE;
if ((args->blkno2 == args->blkno) &&
(args->index2 <= args->index)) {
args->index2++;
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 83b95be9ded8..4250159ecced 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -568,14 +568,14 @@ xfs_attr_rmtval_stale(
*/
int
xfs_attr_rmtval_find_space(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
- struct xfs_bmbt_irec *map = &dac->map;
+ struct xfs_da_args *args = attr->xattri_da_args;
+ struct xfs_bmbt_irec *map = &attr->xattri_map;
int error;
- dac->lblkno = 0;
- dac->blkcnt = 0;
+ attr->xattri_lblkno = 0;
+ attr->xattri_blkcnt = 0;
args->rmtblkcnt = 0;
args->rmtblkno = 0;
memset(map, 0, sizeof(struct xfs_bmbt_irec));
@@ -584,8 +584,8 @@ xfs_attr_rmtval_find_space(
if (error)
return error;
- dac->blkcnt = args->rmtblkcnt;
- dac->lblkno = args->rmtblkno;
+ attr->xattri_blkcnt = args->rmtblkcnt;
+ attr->xattri_lblkno = args->rmtblkno;
return 0;
}
@@ -598,17 +598,18 @@ xfs_attr_rmtval_find_space(
*/
int
xfs_attr_rmtval_set_blk(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
+ struct xfs_da_args *args = attr->xattri_da_args;
struct xfs_inode *dp = args->dp;
- struct xfs_bmbt_irec *map = &dac->map;
+ struct xfs_bmbt_irec *map = &attr->xattri_map;
int nmap;
int error;
nmap = 1;
- error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)dac->lblkno,
- dac->blkcnt, XFS_BMAPI_ATTRFORK, args->total,
+ error = xfs_bmapi_write(args->trans, dp,
+ (xfs_fileoff_t)attr->xattri_lblkno,
+ attr->xattri_blkcnt, XFS_BMAPI_ATTRFORK, args->total,
map, &nmap);
if (error)
return error;
@@ -618,8 +619,8 @@ xfs_attr_rmtval_set_blk(
(map->br_startblock != HOLESTARTBLOCK));
/* roll attribute extent map forwards */
- dac->lblkno += map->br_blockcount;
- dac->blkcnt -= map->br_blockcount;
+ attr->xattri_lblkno += map->br_blockcount;
+ attr->xattri_blkcnt -= map->br_blockcount;
return 0;
}
@@ -673,9 +674,9 @@ xfs_attr_rmtval_invalidate(
*/
int
xfs_attr_rmtval_remove(
- struct xfs_delattr_context *dac)
+ struct xfs_attr_item *attr)
{
- struct xfs_da_args *args = dac->da_args;
+ struct xfs_da_args *args = attr->xattri_da_args;
int error, done;
/*
@@ -695,8 +696,8 @@ xfs_attr_rmtval_remove(
* the parent
*/
if (!done) {
- dac->flags |= XFS_DAC_DEFER_FINISH;
- trace_xfs_attr_rmtval_remove_return(dac->dela_state, args->dp);
+ trace_xfs_attr_rmtval_remove_return(attr->xattri_dela_state,
+ args->dp);
return -EAGAIN;
}
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index d72eff30ca18..62b398edec3f 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -12,9 +12,9 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args);
int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
xfs_buf_flags_t incore_flags);
int xfs_attr_rmtval_invalidate(struct xfs_da_args *args);
-int xfs_attr_rmtval_remove(struct xfs_delattr_context *dac);
+int xfs_attr_rmtval_remove(struct xfs_attr_item *attr);
int xfs_attr_rmt_find_hole(struct xfs_da_args *args);
int xfs_attr_rmtval_set_value(struct xfs_da_args *args);
-int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac);
-int xfs_attr_rmtval_find_space(struct xfs_delattr_context *dac);
+int xfs_attr_rmtval_set_blk(struct xfs_attr_item *attr);
+int xfs_attr_rmtval_find_space(struct xfs_attr_item *attr);
#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 74198dd82b03..6833110d1bd4 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -52,19 +52,17 @@ xfs_bmap_compute_maxlevels(
xfs_mount_t *mp, /* file system mount structure */
int whichfork) /* data or attr fork */
{
+ uint64_t maxblocks; /* max blocks at this level */
+ xfs_extnum_t maxleafents; /* max leaf entries possible */
int level; /* btree level */
- uint maxblocks; /* max blocks at this level */
- uint maxleafents; /* max leaf entries possible */
int maxrootrecs; /* max records in root block */
int minleafrecs; /* min records in leaf block */
int minnoderecs; /* min records in node block */
int sz; /* root block size */
/*
- * The maximum number of extents in a file, hence the maximum number of
- * leaf entries, is controlled by the size of the on-disk extent count,
- * either a signed 32-bit number for the data fork, or a signed 16-bit
- * number for the attr fork.
+ * The maximum number of extents in a fork, hence the maximum number of
+ * leaf entries, is controlled by the size of the on-disk extent count.
*
* Note that we can no longer assume that if we are in ATTR1 that the
* fork offset of all the inodes will be
@@ -74,22 +72,22 @@ xfs_bmap_compute_maxlevels(
* ATTR2 we have to assume the worst case scenario of a minimum size
* available.
*/
- if (whichfork == XFS_DATA_FORK) {
- maxleafents = MAXEXTNUM;
+ maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
+ whichfork);
+ if (whichfork == XFS_DATA_FORK)
sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
- } else {
- maxleafents = MAXAEXTNUM;
+ else
sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
- }
+
maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
minleafrecs = mp->m_bmap_dmnr[0];
minnoderecs = mp->m_bmap_dmnr[1];
- maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
+ maxblocks = howmany_64(maxleafents, minleafrecs);
for (level = 1; maxblocks > 1; level++) {
if (maxblocks <= maxrootrecs)
maxblocks = 1;
else
- maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
+ maxblocks = howmany_64(maxblocks, minnoderecs);
}
mp->m_bm_maxlevels[whichfork] = level;
ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk());
@@ -468,7 +466,7 @@ error0:
if (bp_release)
xfs_trans_brelse(NULL, bp);
error_norelse:
- xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
+ xfs_warn(mp, "%s: BAD after btree leaves for %llu extents",
__func__, i);
xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -485,7 +483,7 @@ STATIC void
xfs_bmap_validate_ret(
xfs_fileoff_t bno,
xfs_filblks_t len,
- int flags,
+ uint32_t flags,
xfs_bmbt_irec_t *mval,
int nmap,
int ret_nmap)
@@ -1399,7 +1397,7 @@ xfs_bmap_add_extent_delay_real(
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t da_new; /* new count del alloc blocks used */
xfs_filblks_t da_old; /* old count del alloc blocks used */
xfs_filblks_t temp=0; /* value for da_new calculations */
@@ -1452,7 +1450,7 @@ xfs_bmap_add_extent_delay_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
/*
@@ -1470,13 +1468,13 @@ xfs_bmap_add_extent_delay_real(
new_endoff == RIGHT.br_startoff &&
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
new->br_state == RIGHT.br_state &&
- new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING)) !=
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= MAXEXTLEN))
+ <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
error = 0;
@@ -1950,7 +1948,7 @@ xfs_bmap_add_extent_unwritten_real(
xfs_bmbt_irec_t r[3]; /* neighbor extent entries */
/* left is 0, right is 1, prev is 2 */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_mount *mp = ip->i_mount;
struct xfs_bmbt_irec old;
@@ -2000,7 +1998,7 @@ xfs_bmap_add_extent_unwritten_real(
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
LEFT.br_state == new->br_state &&
- LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
/*
@@ -2018,13 +2016,13 @@ xfs_bmap_add_extent_unwritten_real(
new_endoff == RIGHT.br_startoff &&
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
new->br_state == RIGHT.br_state &&
- new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING)) !=
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
BMAP_RIGHT_FILLING) ||
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
- <= MAXEXTLEN))
+ <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
/*
@@ -2479,7 +2477,7 @@ xfs_bmap_add_extent_hole_delay(
xfs_filblks_t newlen=0; /* new indirect size */
xfs_filblks_t oldlen=0; /* old indirect size */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
xfs_filblks_t temp; /* temp for indirect calculations */
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -2510,15 +2508,15 @@ xfs_bmap_add_extent_hole_delay(
*/
if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
left.br_startoff + left.br_blockcount == new->br_startoff &&
- left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
- new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
(left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= MAXEXTLEN)))
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
state |= BMAP_RIGHT_CONTIG;
/*
@@ -2616,7 +2614,7 @@ xfs_bmap_add_extent_hole_real(
struct xfs_btree_cur **curp,
struct xfs_bmbt_irec *new,
int *logflagsp,
- int flags)
+ uint32_t flags)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
@@ -2626,7 +2624,7 @@ xfs_bmap_add_extent_hole_real(
xfs_bmbt_irec_t left; /* left neighbor extent entry */
xfs_bmbt_irec_t right; /* right neighbor extent entry */
int rval=0; /* return value (logging flags) */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_bmbt_irec old;
ASSERT(!isnullstartblock(new->br_startblock));
@@ -2661,17 +2659,17 @@ xfs_bmap_add_extent_hole_real(
left.br_startoff + left.br_blockcount == new->br_startoff &&
left.br_startblock + left.br_blockcount == new->br_startblock &&
left.br_state == new->br_state &&
- left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
state |= BMAP_LEFT_CONTIG;
if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
new->br_startoff + new->br_blockcount == right.br_startoff &&
new->br_startblock + new->br_blockcount == right.br_startblock &&
new->br_state == right.br_state &&
- new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
(!(state & BMAP_LEFT_CONTIG) ||
left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= MAXEXTLEN))
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))
state |= BMAP_RIGHT_CONTIG;
error = 0;
@@ -2906,15 +2904,15 @@ xfs_bmap_extsize_align(
/*
* For large extent hint sizes, the aligned extent might be larger than
- * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls
- * the length back under MAXEXTLEN. The outer allocation loops handle
- * short allocation just fine, so it is safe to do this. We only want to
- * do it when we are forced to, though, because it means more allocation
- * operations are required.
+ * XFS_MAX_BMBT_EXTLEN. In that case, reduce the size by an extsz so
+ * that it pulls the length back under XFS_MAX_BMBT_EXTLEN. The outer
+ * allocation loops handle short allocation just fine, so it is safe to
+ * do this. We only want to do it when we are forced to, though, because
+ * it means more allocation operations are required.
*/
- while (align_alen > MAXEXTLEN)
+ while (align_alen > XFS_MAX_BMBT_EXTLEN)
align_alen -= extsz;
- ASSERT(align_alen <= MAXEXTLEN);
+ ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN);
/*
* If the previous block overlaps with this proposed allocation
@@ -3004,9 +3002,9 @@ xfs_bmap_extsize_align(
return -EINVAL;
} else {
ASSERT(orig_off >= align_off);
- /* see MAXEXTLEN handling above */
+ /* see XFS_MAX_BMBT_EXTLEN handling above */
ASSERT(orig_end <= align_off + align_alen ||
- align_alen + extsz > MAXEXTLEN);
+ align_alen + extsz > XFS_MAX_BMBT_EXTLEN);
}
#ifdef DEBUG
@@ -3766,7 +3764,7 @@ xfs_bmapi_trim_map(
xfs_fileoff_t obno,
xfs_fileoff_t end,
int n,
- int flags)
+ uint32_t flags)
{
if ((flags & XFS_BMAPI_ENTIRE) ||
got->br_startoff + got->br_blockcount <= obno) {
@@ -3811,7 +3809,7 @@ xfs_bmapi_update_map(
xfs_fileoff_t obno,
xfs_fileoff_t end,
int *n,
- int flags)
+ uint32_t flags)
{
xfs_bmbt_irec_t *mval = *map;
@@ -3864,7 +3862,7 @@ xfs_bmapi_read(
xfs_filblks_t len,
struct xfs_bmbt_irec *mval,
int *nmap,
- int flags)
+ uint32_t flags)
{
struct xfs_mount *mp = ip->i_mount;
int whichfork = xfs_bmapi_whichfork(flags);
@@ -3971,7 +3969,7 @@ xfs_bmapi_reserve_delalloc(
* Cap the alloc length. Keep track of prealloc so we know whether to
* tag the inode before we return.
*/
- alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN);
+ alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
if (!eof)
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
if (prealloc && alen >= len)
@@ -4104,7 +4102,7 @@ xfs_bmapi_allocate(
if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
bma->prev.br_startoff = NULLFILEOFF;
} else {
- bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
+ bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN);
if (!bma->eof)
bma->length = XFS_FILBLKS_MIN(bma->length,
bma->got.br_startoff - bma->offset);
@@ -4184,7 +4182,7 @@ xfs_bmapi_convert_unwritten(
struct xfs_bmalloca *bma,
struct xfs_bmbt_irec *mval,
xfs_filblks_t len,
- int flags)
+ uint32_t flags)
{
int whichfork = xfs_bmapi_whichfork(flags);
struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
@@ -4312,7 +4310,7 @@ xfs_bmapi_write(
struct xfs_inode *ip, /* incore inode */
xfs_fileoff_t bno, /* starting file offs. mapped */
xfs_filblks_t len, /* length to map in file */
- int flags, /* XFS_BMAPI_... */
+ uint32_t flags, /* XFS_BMAPI_... */
xfs_extlen_t total, /* total blocks needed */
struct xfs_bmbt_irec *mval, /* output: map values */
int *nmap) /* i/o: mval size/count */
@@ -4424,8 +4422,8 @@ xfs_bmapi_write(
* xfs_extlen_t and therefore 32 bits. Hence we have to
* check for 32-bit overflows and handle them here.
*/
- if (len > (xfs_filblks_t)MAXEXTLEN)
- bma.length = MAXEXTLEN;
+ if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN)
+ bma.length = XFS_MAX_BMBT_EXTLEN;
else
bma.length = len;
@@ -4526,14 +4524,16 @@ xfs_bmapi_convert_delalloc(
return error;
xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
error = xfs_iext_count_may_overflow(ip, whichfork,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
- xfs_trans_ijoin(tp, ip, 0);
-
if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
bma.got.br_startoff > offset_fsb) {
/*
@@ -4560,7 +4560,8 @@ xfs_bmapi_convert_delalloc(
bma.ip = ip;
bma.wasdel = true;
bma.offset = bma.got.br_startoff;
- bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
+ bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount,
+ XFS_MAX_BMBT_EXTLEN);
bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
/*
@@ -4629,7 +4630,7 @@ xfs_bmapi_remap(
xfs_fileoff_t bno,
xfs_filblks_t len,
xfs_fsblock_t startblock,
- int flags)
+ uint32_t flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp;
@@ -4641,7 +4642,7 @@ xfs_bmapi_remap(
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(len > 0);
- ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
+ ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
XFS_BMAPI_NORMAP)));
@@ -4801,7 +4802,7 @@ xfs_bmap_del_extent_delay(
int64_t da_old, da_new, da_diff = 0;
xfs_fileoff_t del_endoff, got_endoff;
xfs_filblks_t got_indlen, new_indlen, stolen;
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
int error = 0;
bool isrt;
@@ -4926,7 +4927,7 @@ xfs_bmap_del_extent_cow(
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
struct xfs_bmbt_irec new;
xfs_fileoff_t del_endoff, got_endoff;
- int state = BMAP_COWFORK;
+ uint32_t state = BMAP_COWFORK;
XFS_STATS_INC(mp, xs_del_exlist);
@@ -4999,7 +5000,7 @@ xfs_bmap_del_extent_real(
xfs_bmbt_irec_t *del, /* data to remove from extents */
int *logflagsp, /* inode logging flags */
int whichfork, /* data or attr fork */
- int bflags) /* bmapi flags */
+ uint32_t bflags) /* bmapi flags */
{
xfs_fsblock_t del_endblock=0; /* first block past del */
xfs_fileoff_t del_endoff; /* first offset past del */
@@ -5015,7 +5016,7 @@ xfs_bmap_del_extent_real(
xfs_bmbt_irec_t new; /* new record to be inserted */
/* REFERENCED */
uint qfield; /* quota field to update */
- int state = xfs_bmap_fork_to_state(whichfork);
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_bmbt_irec old;
mp = ip->i_mount;
@@ -5148,26 +5149,6 @@ xfs_bmap_del_extent_real(
* Deleting the middle of the extent.
*/
- /*
- * For directories, -ENOSPC is returned since a directory entry
- * remove operation must not fail due to low extent count
- * availability. -ENOSPC will be handled by higher layers of XFS
- * by letting the corresponding empty Data/Free blocks to linger
- * until a future remove operation. Dabtree blocks would be
- * swapped with the last block in the leaf space and then the
- * new last block will be unmapped.
- *
- * The above logic also applies to the source directory entry of
- * a rename operation.
- */
- error = xfs_iext_count_may_overflow(ip, whichfork, 1);
- if (error) {
- ASSERT(S_ISDIR(VFS_I(ip)->i_mode) &&
- whichfork == XFS_DATA_FORK);
- error = -ENOSPC;
- goto done;
- }
-
old = got;
got.br_blockcount = del->br_startoff - got.br_startoff;
@@ -5281,7 +5262,7 @@ __xfs_bunmapi(
struct xfs_inode *ip, /* incore inode */
xfs_fileoff_t start, /* first file offset deleted */
xfs_filblks_t *rlen, /* i/o: amount remaining */
- int flags, /* misc flags */
+ uint32_t flags, /* misc flags */
xfs_extnum_t nexts) /* number of extents max */
{
struct xfs_btree_cur *cur; /* bmap btree cursor */
@@ -5299,7 +5280,6 @@ __xfs_bunmapi(
int whichfork; /* data or attribute fork */
xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
- xfs_fileoff_t max_len;
xfs_fileoff_t end;
struct xfs_iext_cursor icur;
bool done = false;
@@ -5318,16 +5298,6 @@ __xfs_bunmapi(
ASSERT(len > 0);
ASSERT(nexts >= 0);
- /*
- * Guesstimate how many blocks we can unmap without running the risk of
- * blowing out the transaction with a mix of EFIs and reflink
- * adjustments.
- */
- if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
- max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
- else
- max_len = len;
-
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
@@ -5366,7 +5336,7 @@ __xfs_bunmapi(
extno = 0;
while (end != (xfs_fileoff_t)-1 && end >= start &&
- (nexts == 0 || extno < nexts) && max_len > 0) {
+ (nexts == 0 || extno < nexts)) {
/*
* Is the found extent after a hole in which end lives?
* Just back up to the previous extent, if so.
@@ -5400,14 +5370,6 @@ __xfs_bunmapi(
if (del.br_startoff + del.br_blockcount > end + 1)
del.br_blockcount = end + 1 - del.br_startoff;
- /* How much can we safely unmap? */
- if (max_len < del.br_blockcount) {
- del.br_startoff += del.br_blockcount - max_len;
- if (!wasdel)
- del.br_startblock += del.br_blockcount - max_len;
- del.br_blockcount = max_len;
- }
-
if (!isrt)
goto delete;
@@ -5543,7 +5505,6 @@ delete:
if (error)
goto error0;
- max_len -= del.br_blockcount;
end = del.br_startoff - 1;
nodelete:
/*
@@ -5609,7 +5570,7 @@ xfs_bunmapi(
struct xfs_inode *ip,
xfs_fileoff_t bno,
xfs_filblks_t len,
- int flags,
+ uint32_t flags,
xfs_extnum_t nexts,
int *done)
{
@@ -5641,7 +5602,7 @@ xfs_bmse_can_merge(
if ((left->br_startoff + left->br_blockcount != startoff) ||
(left->br_startblock + left->br_blockcount != got->br_startblock) ||
(left->br_state != got->br_state) ||
- (left->br_blockcount + got->br_blockcount > MAXEXTLEN))
+ (left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN))
return false;
return true;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 03d9aaf87413..16db95b11589 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -39,7 +39,7 @@ struct xfs_bmalloca {
bool aeof; /* allocated space at eof */
bool conv; /* overwriting unwritten extents */
int datatype;/* data type being allocated */
- int flags;
+ uint32_t flags;
};
#define XFS_BMAP_MAX_NMAP 4
@@ -47,17 +47,17 @@ struct xfs_bmalloca {
/*
* Flags for xfs_bmapi_*
*/
-#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */
-#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */
-#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */
-#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */
-#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */
+#define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */
+#define XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */
+#define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */
+#define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */
+#define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */
/*
* unwritten extent conversion - this needs write cache flushing and no additional
* allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
* from written to unwritten, otherwise convert from unwritten to written.
*/
-#define XFS_BMAPI_CONVERT 0x040
+#define XFS_BMAPI_CONVERT (1u << 5)
/*
* allocate zeroed extents - this requires all newly allocated user data extents
@@ -65,7 +65,7 @@ struct xfs_bmalloca {
* Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
* during the allocation range to zeroed written extents.
*/
-#define XFS_BMAPI_ZERO 0x080
+#define XFS_BMAPI_ZERO (1u << 6)
/*
* Map the inode offset to the block given in ap->firstblock. Primarily
@@ -75,16 +75,16 @@ struct xfs_bmalloca {
* For bunmapi, this flag unmaps the range without adjusting quota, reducing
* refcount, or freeing the blocks.
*/
-#define XFS_BMAPI_REMAP 0x100
+#define XFS_BMAPI_REMAP (1u << 7)
/* Map something in the CoW fork. */
-#define XFS_BMAPI_COWFORK 0x200
+#define XFS_BMAPI_COWFORK (1u << 8)
/* Skip online discard of freed extents */
-#define XFS_BMAPI_NODISCARD 0x1000
+#define XFS_BMAPI_NODISCARD (1u << 9)
/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
-#define XFS_BMAPI_NORMAP 0x2000
+#define XFS_BMAPI_NORMAP (1u << 10)
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
@@ -106,7 +106,7 @@ static inline int xfs_bmapi_aflag(int w)
(w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0));
}
-static inline int xfs_bmapi_whichfork(int bmapi_flags)
+static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags)
{
if (bmapi_flags & XFS_BMAPI_COWFORK)
return XFS_COW_FORK;
@@ -124,16 +124,16 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
/*
* Flags for xfs_bmap_add_extent*.
*/
-#define BMAP_LEFT_CONTIG (1 << 0)
-#define BMAP_RIGHT_CONTIG (1 << 1)
-#define BMAP_LEFT_FILLING (1 << 2)
-#define BMAP_RIGHT_FILLING (1 << 3)
-#define BMAP_LEFT_DELAY (1 << 4)
-#define BMAP_RIGHT_DELAY (1 << 5)
-#define BMAP_LEFT_VALID (1 << 6)
-#define BMAP_RIGHT_VALID (1 << 7)
-#define BMAP_ATTRFORK (1 << 8)
-#define BMAP_COWFORK (1 << 9)
+#define BMAP_LEFT_CONTIG (1u << 0)
+#define BMAP_RIGHT_CONTIG (1u << 1)
+#define BMAP_LEFT_FILLING (1u << 2)
+#define BMAP_RIGHT_FILLING (1u << 3)
+#define BMAP_LEFT_DELAY (1u << 4)
+#define BMAP_RIGHT_DELAY (1u << 5)
+#define BMAP_LEFT_VALID (1u << 6)
+#define BMAP_RIGHT_VALID (1u << 7)
+#define BMAP_ATTRFORK (1u << 8)
+#define BMAP_COWFORK (1u << 9)
#define XFS_BMAP_EXT_FLAGS \
{ BMAP_LEFT_CONTIG, "LC" }, \
@@ -183,15 +183,15 @@ int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
int whichfork);
int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
xfs_filblks_t len, struct xfs_bmbt_irec *mval,
- int *nmap, int flags);
+ int *nmap, uint32_t flags);
int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap);
int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags,
xfs_extnum_t nexts);
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t len, int flags,
+ xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extnum_t nexts, int *done);
int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
@@ -243,7 +243,7 @@ void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
struct xfs_bmbt_irec *imap);
-static inline int xfs_bmap_fork_to_state(int whichfork)
+static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
{
switch (whichfork) {
case XFS_ATTR_FORK:
@@ -260,7 +260,7 @@ xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
- int flags);
+ uint32_t flags);
extern struct kmem_cache *xfs_bmap_intent_cache;
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 453309fc85f2..2b77d45c215f 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -597,7 +597,11 @@ xfs_bmbt_maxrecs(
return xfs_bmbt_block_maxrecs(blocklen, leaf);
}
-/* Compute the max possible height for block mapping btrees. */
+/*
+ * Calculate the maximum possible height of the btree that the on-disk format
+ * supports. This is used for sizing structures large enough to support every
+ * possible configuration of a filesystem that might get mounted.
+ */
unsigned int
xfs_bmbt_maxlevels_ondisk(void)
{
@@ -611,7 +615,8 @@ xfs_bmbt_maxlevels_ondisk(void)
minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;
/* One extra level for the inode root. */
- return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1;
+ return xfs_btree_compute_maxlevels(minrecs,
+ XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1;
}
/*
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c1500b238520..2aa300f7461f 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -51,6 +51,52 @@ xfs_btree_magic(
return magic;
}
+static xfs_failaddr_t
+xfs_btree_check_lblock_siblings(
+ struct xfs_mount *mp,
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_fsblock_t fsb,
+ xfs_fsblock_t sibling)
+{
+ if (sibling == NULLFSBLOCK)
+ return NULL;
+ if (sibling == fsb)
+ return __this_address;
+ if (level >= 0) {
+ if (!xfs_btree_check_lptr(cur, sibling, level + 1))
+ return __this_address;
+ } else {
+ if (!xfs_verify_fsbno(mp, sibling))
+ return __this_address;
+ }
+
+ return NULL;
+}
+
+static xfs_failaddr_t
+xfs_btree_check_sblock_siblings(
+ struct xfs_mount *mp,
+ struct xfs_btree_cur *cur,
+ int level,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ xfs_agblock_t sibling)
+{
+ if (sibling == NULLAGBLOCK)
+ return NULL;
+ if (sibling == agbno)
+ return __this_address;
+ if (level >= 0) {
+ if (!xfs_btree_check_sptr(cur, sibling, level + 1))
+ return __this_address;
+ } else {
+ if (!xfs_verify_agbno(mp, agno, sibling))
+ return __this_address;
+ }
+ return NULL;
+}
+
/*
* Check a long btree block header. Return the address of the failing check,
* or NULL if everything is ok.
@@ -65,6 +111,8 @@ __xfs_btree_check_lblock(
struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum;
int crc = xfs_has_crc(mp);
+ xfs_failaddr_t fa;
+ xfs_fsblock_t fsb = NULLFSBLOCK;
if (crc) {
if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
@@ -83,16 +131,16 @@ __xfs_btree_check_lblock(
if (be16_to_cpu(block->bb_numrecs) >
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
- level + 1))
- return __this_address;
- if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
- level + 1))
- return __this_address;
- return NULL;
+ if (bp)
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+
+ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ be64_to_cpu(block->bb_u.l.bb_leftsib));
+ if (!fa)
+ fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+ be64_to_cpu(block->bb_u.l.bb_rightsib));
+ return fa;
}
/* Check a long btree block header. */
@@ -130,6 +178,9 @@ __xfs_btree_check_sblock(
struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum;
int crc = xfs_has_crc(mp);
+ xfs_failaddr_t fa;
+ xfs_agblock_t agbno = NULLAGBLOCK;
+ xfs_agnumber_t agno = NULLAGNUMBER;
if (crc) {
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
@@ -146,16 +197,18 @@ __xfs_btree_check_sblock(
if (be16_to_cpu(block->bb_numrecs) >
cur->bc_ops->get_maxrecs(cur, level))
return __this_address;
- if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
- level + 1))
- return __this_address;
- if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
- level + 1))
- return __this_address;
- return NULL;
+ if (bp) {
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+ agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
+ }
+
+ fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno,
+ be32_to_cpu(block->bb_u.s.bb_leftsib));
+ if (!fa)
+ fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno,
+ agbno, be32_to_cpu(block->bb_u.s.bb_rightsib));
+ return fa;
}
/* Check a short btree block header. */
@@ -751,20 +804,20 @@ xfs_btree_lastrec(
*/
void
xfs_btree_offsets(
- int64_t fields, /* bitmask of fields */
+ uint32_t fields, /* bitmask of fields */
const short *offsets, /* table of field offsets */
int nbits, /* number of bits to inspect */
int *first, /* output: first byte offset */
int *last) /* output: last byte offset */
{
int i; /* current bit number */
- int64_t imask; /* mask for current bit number */
+ uint32_t imask; /* mask for current bit number */
ASSERT(fields != 0);
/*
* Find the lowest bit, so the first byte offset.
*/
- for (i = 0, imask = 1LL; ; i++, imask <<= 1) {
+ for (i = 0, imask = 1u; ; i++, imask <<= 1) {
if (imask & fields) {
*first = offsets[i];
break;
@@ -773,7 +826,7 @@ xfs_btree_offsets(
/*
* Find the highest bit, so the last byte offset.
*/
- for (i = nbits - 1, imask = 1LL << i; ; i--, imask >>= 1) {
+ for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
if (imask & fields) {
*last = offsets[i + 1] - 1;
break;
@@ -1456,7 +1509,7 @@ void
xfs_btree_log_block(
struct xfs_btree_cur *cur, /* btree cursor */
struct xfs_buf *bp, /* buffer containing btree block */
- int fields) /* mask of fields: XFS_BB_... */
+ uint32_t fields) /* mask of fields: XFS_BB_... */
{
int first; /* first byte offset logged */
int last; /* last byte offset logged */
@@ -4271,6 +4324,21 @@ xfs_btree_visit_block(
if (xfs_btree_ptr_is_null(cur, &rptr))
return -ENOENT;
+ /*
+ * We only visit blocks once in this walk, so we have to avoid the
+ * internal xfs_btree_lookup_get_block() optimisation where it will
+ * return the same block without checking if the right sibling points
+ * back to us and creates a cyclic reference in the btree.
+ */
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
+ xfs_buf_daddr(bp)))
+ return -EFSCORRUPTED;
+ } else {
+ if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
+ xfs_buf_daddr(bp)))
+ return -EFSCORRUPTED;
+ }
return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
}
@@ -4445,20 +4513,21 @@ xfs_btree_lblock_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ xfs_fsblock_t fsb;
+ xfs_failaddr_t fa;
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
- if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))
- return __this_address;
- if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
- !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))
- return __this_address;
-
- return NULL;
+ fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ be64_to_cpu(block->bb_u.l.bb_leftsib));
+ if (!fa)
+ fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+ be64_to_cpu(block->bb_u.l.bb_rightsib));
+ return fa;
}
/**
@@ -4499,7 +4568,9 @@ xfs_btree_sblock_verify(
{
struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
- xfs_agblock_t agno;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ xfs_failaddr_t fa;
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
@@ -4507,14 +4578,13 @@ xfs_btree_sblock_verify(
/* sibling pointer verification */
agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
- if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib)))
- return __this_address;
- if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
- !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib)))
- return __this_address;
-
- return NULL;
+ agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+ fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
+ be32_to_cpu(block->bb_u.s.bb_leftsib));
+ if (!fa)
+ fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
+ be32_to_cpu(block->bb_u.s.bb_rightsib));
+ return fa;
}
/*
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 22d9f411fde6..eef27858a013 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -68,19 +68,19 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
/*
* For logging record fields.
*/
-#define XFS_BB_MAGIC (1 << 0)
-#define XFS_BB_LEVEL (1 << 1)
-#define XFS_BB_NUMRECS (1 << 2)
-#define XFS_BB_LEFTSIB (1 << 3)
-#define XFS_BB_RIGHTSIB (1 << 4)
-#define XFS_BB_BLKNO (1 << 5)
-#define XFS_BB_LSN (1 << 6)
-#define XFS_BB_UUID (1 << 7)
-#define XFS_BB_OWNER (1 << 8)
+#define XFS_BB_MAGIC (1u << 0)
+#define XFS_BB_LEVEL (1u << 1)
+#define XFS_BB_NUMRECS (1u << 2)
+#define XFS_BB_LEFTSIB (1u << 3)
+#define XFS_BB_RIGHTSIB (1u << 4)
+#define XFS_BB_BLKNO (1u << 5)
+#define XFS_BB_LSN (1u << 6)
+#define XFS_BB_UUID (1u << 7)
+#define XFS_BB_OWNER (1u << 8)
#define XFS_BB_NUM_BITS 5
-#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
+#define XFS_BB_ALL_BITS ((1u << XFS_BB_NUM_BITS) - 1)
#define XFS_BB_NUM_BITS_CRC 9
-#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1)
+#define XFS_BB_ALL_BITS_CRC ((1u << XFS_BB_NUM_BITS_CRC) - 1)
/*
* Generic stats interface
@@ -345,7 +345,7 @@ xfs_btree_dup_cursor(
*/
void
xfs_btree_offsets(
- int64_t fields, /* bitmask of fields */
+ uint32_t fields, /* bitmask of fields */
const short *offsets,/* table of field offsets */
int nbits, /* number of bits to inspect */
int *first, /* output: first byte offset */
@@ -435,7 +435,7 @@ bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
/*
* Internal btree helpers also used by xfs_bmap.c.
*/
-void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, uint32_t);
void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
/*
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 9dc1ecb9713d..aa74f3fdb571 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -22,6 +22,7 @@
#include "xfs_trace.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_errortag.h"
/*
* xfs_da_btree.c
@@ -482,6 +483,9 @@ xfs_da3_split(
trace_xfs_da_split(state->args);
+ if (XFS_TEST_ERROR(false, state->mp, XFS_ERRTAG_DA_LEAF_SPLIT))
+ return -EIO;
+
/*
* Walk back up the tree splitting/inserting/adjusting as necessary.
* If we need to insert and there isn't room, split the node, then
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 0faf7d9ac241..ed2303e4d46a 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -30,6 +30,7 @@ struct xfs_da_geometry {
unsigned int free_hdr_size; /* dir2 free header size */
unsigned int free_max_bests; /* # of bests entries in dir2 free */
xfs_dablk_t freeblk; /* blockno of free data v2 */
+ xfs_extnum_t max_extents; /* Max. extents in corresponding fork */
xfs_dir2_data_aoff_t data_first_offset;
size_t data_entry_offset;
@@ -76,27 +77,31 @@ typedef struct xfs_da_args {
xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
int rmtblkcnt2; /* remote attr value block count */
int rmtvaluelen2; /* remote attr value length in bytes */
- int op_flags; /* operation flags */
+ uint32_t op_flags; /* operation flags */
enum xfs_dacmp cmpresult; /* name compare result for lookups */
} xfs_da_args_t;
/*
* Operation flags:
*/
-#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */
-#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */
-#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */
-#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
-#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
-#define XFS_DA_OP_NOTIME 0x0020 /* don't update inode timestamps */
+#define XFS_DA_OP_JUSTCHECK (1u << 0) /* check for ok with no space */
+#define XFS_DA_OP_REPLACE (1u << 1) /* this is an atomic replace op */
+#define XFS_DA_OP_ADDNAME (1u << 2) /* this is an add operation */
+#define XFS_DA_OP_OKNOENT (1u << 3) /* lookup op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP (1u << 4) /* lookup returns CI name if found */
+#define XFS_DA_OP_NOTIME (1u << 5) /* don't update inode timestamps */
+#define XFS_DA_OP_REMOVE (1u << 6) /* this is a remove operation */
+#define XFS_DA_OP_RECOVERY (1u << 7) /* Log recovery operation */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
- { XFS_DA_OP_RENAME, "RENAME" }, \
+ { XFS_DA_OP_REPLACE, "REPLACE" }, \
{ XFS_DA_OP_ADDNAME, "ADDNAME" }, \
{ XFS_DA_OP_OKNOENT, "OKNOENT" }, \
{ XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
- { XFS_DA_OP_NOTIME, "NOTIME" }
+ { XFS_DA_OP_NOTIME, "NOTIME" }, \
+ { XFS_DA_OP_REMOVE, "REMOVE" }, \
+ { XFS_DA_OP_RECOVERY, "RECOVERY" }
/*
* Storage for holding state during Btree searches and split/join ops.
@@ -197,7 +202,7 @@ int xfs_da3_node_read_mapped(struct xfs_trans *tp, struct xfs_inode *dp,
* Utility routines.
*/
-#define XFS_DABUF_MAP_HOLE_OK (1 << 0)
+#define XFS_DABUF_MAP_HOLE_OK (1u << 0)
int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno);
int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno,
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 5a49caa5c9df..25e2841084e1 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
* Directory address space divided into sections,
* spaces separated by 32GB.
*/
+#define XFS_DIR2_MAX_SPACES 3
#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
#define XFS_DIR2_DATA_SPACE 0
#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
@@ -688,10 +689,10 @@ struct xfs_attr3_leafblock {
#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
-#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
-#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
-#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
-#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
+#define XFS_ATTR_LOCAL (1u << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT)
#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
/*
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 0805ade2d300..ceb222b4f261 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -22,6 +22,10 @@
#include "xfs_refcount.h"
#include "xfs_bmap.h"
#include "xfs_alloc.h"
+#include "xfs_buf.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
static struct kmem_cache *xfs_defer_pending_cache;
@@ -184,9 +188,10 @@ static const struct xfs_defer_op_type *defer_op_types[] = {
[XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type,
[XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type,
[XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
+ [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type,
};
-static void
+static bool
xfs_defer_create_intent(
struct xfs_trans *tp,
struct xfs_defer_pending *dfp,
@@ -197,6 +202,7 @@ xfs_defer_create_intent(
if (!dfp->dfp_intent)
dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
dfp->dfp_count, sort);
+ return dfp->dfp_intent != NULL;
}
/*
@@ -204,16 +210,18 @@ xfs_defer_create_intent(
* associated extents, then add the entire intake list to the end of
* the pending list.
*/
-STATIC void
+static bool
xfs_defer_create_intents(
struct xfs_trans *tp)
{
struct xfs_defer_pending *dfp;
+ bool ret = false;
list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
trace_xfs_defer_create_intent(tp->t_mountp, dfp);
- xfs_defer_create_intent(tp, dfp, true);
+ ret |= xfs_defer_create_intent(tp, dfp, true);
}
+ return ret;
}
/* Abort all the intents that were committed. */
@@ -487,7 +495,7 @@ int
xfs_defer_finish_noroll(
struct xfs_trans **tp)
{
- struct xfs_defer_pending *dfp;
+ struct xfs_defer_pending *dfp = NULL;
int error = 0;
LIST_HEAD(dop_pending);
@@ -506,17 +514,20 @@ xfs_defer_finish_noroll(
* of time that any one intent item can stick around in memory,
* pinning the log tail.
*/
- xfs_defer_create_intents(*tp);
+ bool has_intents = xfs_defer_create_intents(*tp);
+
list_splice_init(&(*tp)->t_dfops, &dop_pending);
- error = xfs_defer_trans_roll(tp);
- if (error)
- goto out_shutdown;
+ if (has_intents || dfp) {
+ error = xfs_defer_trans_roll(tp);
+ if (error)
+ goto out_shutdown;
- /* Possibly relog intent items to keep the log moving. */
- error = xfs_defer_relog(tp, &dop_pending);
- if (error)
- goto out_shutdown;
+ /* Relog intent items to keep the log moving. */
+ error = xfs_defer_relog(tp, &dop_pending);
+ if (error)
+ goto out_shutdown;
+ }
dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
dfp_list);
@@ -774,17 +785,25 @@ xfs_defer_ops_continue(
struct xfs_trans *tp,
struct xfs_defer_resources *dres)
{
+ unsigned int i;
+
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
- /* Lock and join the captured inode to the new transaction. */
+ /* Lock the captured resources to the new transaction. */
if (dfc->dfc_held.dr_inos == 2)
xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
else if (dfc->dfc_held.dr_inos == 1)
xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);
+
+ for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
+ xfs_buf_lock(dfc->dfc_held.dr_bp[i]);
+
+ /* Join the captured resources to the new transaction. */
xfs_defer_restore_resources(tp, &dfc->dfc_held);
memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
+ dres->dr_bufs = 0;
/* Move captured dfops chain and state to the transaction. */
list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
@@ -854,7 +873,12 @@ xfs_defer_init_item_caches(void)
error = xfs_extfree_intent_init_cache();
if (error)
goto err;
-
+ error = xfs_attri_init_cache();
+ if (error)
+ goto err;
+ error = xfs_attrd_init_cache();
+ if (error)
+ goto err;
return 0;
err:
xfs_defer_destroy_item_caches();
@@ -865,6 +889,8 @@ err:
void
xfs_defer_destroy_item_caches(void)
{
+ xfs_attri_destroy_cache();
+ xfs_attrd_destroy_cache();
xfs_extfree_intent_destroy_cache();
xfs_bmap_intent_destroy_cache();
xfs_refcount_intent_destroy_cache();
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 7bb8a31ad65b..114a3a4930a3 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -19,6 +19,7 @@ enum xfs_defer_ops_type {
XFS_DEFER_OPS_TYPE_RMAP,
XFS_DEFER_OPS_TYPE_FREE,
XFS_DEFER_OPS_TYPE_AGFL_FREE,
+ XFS_DEFER_OPS_TYPE_ATTR,
XFS_DEFER_OPS_TYPE_MAX,
};
@@ -63,6 +64,8 @@ extern const struct xfs_defer_op_type xfs_refcount_update_defer_type;
extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+extern const struct xfs_defer_op_type xfs_attr_defer_type;
+
/*
* Deferred operation item relogging limits.
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 5f1e4799e8fa..3cd51fa3837b 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -150,6 +150,8 @@ xfs_da_mount(
dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
+ dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >>
+ mp->m_sb.sb_blocklog;
dageo->magicpct = (dageo->blksize * 37) / 100;
/* set up attribute geometry - single fsb only */
@@ -161,6 +163,12 @@ xfs_da_mount(
dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size;
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
+
+ if (xfs_has_large_extent_counts(mp))
+ dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+ else
+ dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
dageo->magicpct = (dageo->blksize * 37) / 100;
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index a23a52e643ad..5362908164b0 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -59,7 +59,10 @@
#define XFS_ERRTAG_REDUCE_MAX_IEXTENTS 36
#define XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT 37
#define XFS_ERRTAG_AG_RESV_FAIL 38
-#define XFS_ERRTAG_MAX 39
+#define XFS_ERRTAG_LARP 39
+#define XFS_ERRTAG_DA_LEAF_SPLIT 40
+#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41
+#define XFS_ERRTAG_MAX 42
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -103,5 +106,8 @@
#define XFS_RANDOM_REDUCE_MAX_IEXTENTS 1
#define XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT 1
#define XFS_RANDOM_AG_RESV_FAIL 1
+#define XFS_RANDOM_LARP 1
+#define XFS_RANDOM_DA_LEAF_SPLIT 1
+#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index d665c04e69dd..afdfc8108c5f 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -372,12 +372,14 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_META_UUID (1 << 2) /* metadata UUID */
#define XFS_SB_FEAT_INCOMPAT_BIGTIME (1 << 3) /* large timestamps */
#define XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR (1 << 4) /* needs xfs_repair */
+#define XFS_SB_FEAT_INCOMPAT_NREXT64 (1 << 5) /* large extent counters */
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE| \
XFS_SB_FEAT_INCOMPAT_SPINODES| \
XFS_SB_FEAT_INCOMPAT_META_UUID| \
XFS_SB_FEAT_INCOMPAT_BIGTIME| \
- XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
+ XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR| \
+ XFS_SB_FEAT_INCOMPAT_NREXT64)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -388,7 +390,9 @@ xfs_sb_has_incompat_feature(
return (sbp->sb_features_incompat & feature) != 0;
}
-#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0
+#define XFS_SB_FEAT_INCOMPAT_LOG_XATTRS (1 << 0) /* Delayed Attributes */
+#define XFS_SB_FEAT_INCOMPAT_LOG_ALL \
+ (XFS_SB_FEAT_INCOMPAT_LOG_XATTRS)
#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL
static inline bool
xfs_sb_has_incompat_log_feature(
@@ -413,6 +417,11 @@ xfs_sb_add_incompat_log_features(
sbp->sb_features_log_incompat |= features;
}
+static inline bool xfs_sb_version_haslogxattrs(struct xfs_sb *sbp)
+{
+ return xfs_sb_is_v5(sbp) && (sbp->sb_features_log_incompat &
+ XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
+}
static inline bool
xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
@@ -525,26 +534,26 @@ typedef struct xfs_agf {
#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
-#define XFS_AGF_MAGICNUM 0x00000001
-#define XFS_AGF_VERSIONNUM 0x00000002
-#define XFS_AGF_SEQNO 0x00000004
-#define XFS_AGF_LENGTH 0x00000008
-#define XFS_AGF_ROOTS 0x00000010
-#define XFS_AGF_LEVELS 0x00000020
-#define XFS_AGF_FLFIRST 0x00000040
-#define XFS_AGF_FLLAST 0x00000080
-#define XFS_AGF_FLCOUNT 0x00000100
-#define XFS_AGF_FREEBLKS 0x00000200
-#define XFS_AGF_LONGEST 0x00000400
-#define XFS_AGF_BTREEBLKS 0x00000800
-#define XFS_AGF_UUID 0x00001000
-#define XFS_AGF_RMAP_BLOCKS 0x00002000
-#define XFS_AGF_REFCOUNT_BLOCKS 0x00004000
-#define XFS_AGF_REFCOUNT_ROOT 0x00008000
-#define XFS_AGF_REFCOUNT_LEVEL 0x00010000
-#define XFS_AGF_SPARE64 0x00020000
+#define XFS_AGF_MAGICNUM (1u << 0)
+#define XFS_AGF_VERSIONNUM (1u << 1)
+#define XFS_AGF_SEQNO (1u << 2)
+#define XFS_AGF_LENGTH (1u << 3)
+#define XFS_AGF_ROOTS (1u << 4)
+#define XFS_AGF_LEVELS (1u << 5)
+#define XFS_AGF_FLFIRST (1u << 6)
+#define XFS_AGF_FLLAST (1u << 7)
+#define XFS_AGF_FLCOUNT (1u << 8)
+#define XFS_AGF_FREEBLKS (1u << 9)
+#define XFS_AGF_LONGEST (1u << 10)
+#define XFS_AGF_BTREEBLKS (1u << 11)
+#define XFS_AGF_UUID (1u << 12)
+#define XFS_AGF_RMAP_BLOCKS (1u << 13)
+#define XFS_AGF_REFCOUNT_BLOCKS (1u << 14)
+#define XFS_AGF_REFCOUNT_ROOT (1u << 15)
+#define XFS_AGF_REFCOUNT_LEVEL (1u << 16)
+#define XFS_AGF_SPARE64 (1u << 17)
#define XFS_AGF_NUM_BITS 18
-#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
+#define XFS_AGF_ALL_BITS ((1u << XFS_AGF_NUM_BITS) - 1)
#define XFS_AGF_FLAGS \
{ XFS_AGF_MAGICNUM, "MAGICNUM" }, \
@@ -619,22 +628,22 @@ typedef struct xfs_agi {
#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
-#define XFS_AGI_MAGICNUM (1 << 0)
-#define XFS_AGI_VERSIONNUM (1 << 1)
-#define XFS_AGI_SEQNO (1 << 2)
-#define XFS_AGI_LENGTH (1 << 3)
-#define XFS_AGI_COUNT (1 << 4)
-#define XFS_AGI_ROOT (1 << 5)
-#define XFS_AGI_LEVEL (1 << 6)
-#define XFS_AGI_FREECOUNT (1 << 7)
-#define XFS_AGI_NEWINO (1 << 8)
-#define XFS_AGI_DIRINO (1 << 9)
-#define XFS_AGI_UNLINKED (1 << 10)
+#define XFS_AGI_MAGICNUM (1u << 0)
+#define XFS_AGI_VERSIONNUM (1u << 1)
+#define XFS_AGI_SEQNO (1u << 2)
+#define XFS_AGI_LENGTH (1u << 3)
+#define XFS_AGI_COUNT (1u << 4)
+#define XFS_AGI_ROOT (1u << 5)
+#define XFS_AGI_LEVEL (1u << 6)
+#define XFS_AGI_FREECOUNT (1u << 7)
+#define XFS_AGI_NEWINO (1u << 8)
+#define XFS_AGI_DIRINO (1u << 9)
+#define XFS_AGI_UNLINKED (1u << 10)
#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
-#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
-#define XFS_AGI_FREE_ROOT (1 << 11)
-#define XFS_AGI_FREE_LEVEL (1 << 12)
-#define XFS_AGI_IBLOCKS (1 << 13) /* both inobt/finobt block counters */
+#define XFS_AGI_ALL_BITS_R1 ((1u << XFS_AGI_NUM_BITS_R1) - 1)
+#define XFS_AGI_FREE_ROOT (1u << 11)
+#define XFS_AGI_FREE_LEVEL (1u << 12)
+#define XFS_AGI_IBLOCKS (1u << 13) /* both inobt/finobt block counters */
#define XFS_AGI_NUM_BITS_R2 14
/* disk block (xfs_daddr_t) in the AG */
@@ -791,16 +800,41 @@ struct xfs_dinode {
__be32 di_nlink; /* number of links to file */
__be16 di_projid_lo; /* lower part of owner's project id */
__be16 di_projid_hi; /* higher part owner's project id */
- __u8 di_pad[6]; /* unused, zeroed space */
- __be16 di_flushiter; /* incremented on flush */
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ __be64 di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ __be64 di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ __u8 di_v2_pad[6];
+ __be16 di_flushiter;
+ };
+ };
xfs_timestamp_t di_atime; /* time last accessed */
xfs_timestamp_t di_mtime; /* time last modified */
xfs_timestamp_t di_ctime; /* time created/inode modified */
__be64 di_size; /* number of bytes in file */
__be64 di_nblocks; /* # of direct & btree blocks used */
__be32 di_extsize; /* basic/minimum extent size for file */
- __be32 di_nextents; /* number of extents in data fork */
- __be16 di_anextents; /* number of extents in attribute fork*/
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ __be32 di_nextents;
+ __be16 di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ __be32 di_big_anextents;
+ __be16 di_nrext64_pad;
+ } __packed;
+ } __packed;
__u8 di_forkoff; /* attr fork offs, <<3 for 64b align */
__s8 di_aformat; /* format of attr fork's data */
__be32 di_dmevmask; /* DMIG event mask */
@@ -870,6 +904,56 @@ enum xfs_dinode_fmt {
{ XFS_DINODE_FMT_UUID, "uuid" }
/*
+ * Max values for extnum and aextnum.
+ *
+ * The original on-disk extent counts were held in signed fields, resulting in
+ * maximum extent counts of 2^31 and 2^15 for the data and attr forks
+ * respectively. Similarly the maximum extent length is limited to 2^21 blocks
+ * by the 21-bit wide blockcount field of a BMBT extent record.
+ *
+ * The newly introduced data fork extent counter can hold a 64-bit value,
+ * however the maximum number of extents in a file is also limited to 2^54
+ * extents by the 54-bit wide startoff field of a BMBT extent record.
+ *
+ * It is further limited by the maximum supported file size of 2^63
+ * *bytes*. This leads to a maximum extent count for maximally sized filesystem
+ * blocks (64kB) of:
+ *
+ * 2^63 bytes / 2^16 bytes per block = 2^47 blocks
+ *
+ * Rounding up 47 to the nearest multiple of bits-per-byte results in 48. Hence
+ * 2^48 was chosen as the maximum data fork extent count.
+ *
+ * The maximum file size that can be represented by the data fork extent counter
+ * in the worst case occurs when all extents are 1 block in length and each
+ * block is 1KB in size.
+ *
+ * With XFS_MAX_EXTCNT_DATA_FORK_SMALL representing maximum extent count and
+ * with 1KB sized blocks, a file can reach up to,
+ * 1KB * (2^31) = 2TB
+ *
+ * This is much larger than the theoretical maximum size of a directory
+ * i.e. XFS_DIR2_SPACE_SIZE * XFS_DIR2_MAX_SPACES = ~96GB.
+ *
+ * Hence, a directory inode can never overflow its data fork extent counter.
+ */
+#define XFS_MAX_EXTCNT_DATA_FORK_LARGE ((xfs_extnum_t)((1ULL << 48) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_LARGE ((xfs_extnum_t)((1ULL << 32) - 1))
+#define XFS_MAX_EXTCNT_DATA_FORK_SMALL ((xfs_extnum_t)((1ULL << 31) - 1))
+#define XFS_MAX_EXTCNT_ATTR_FORK_SMALL ((xfs_extnum_t)((1ULL << 15) - 1))
+
+/*
+ * When we upgrade an inode to the large extent counts, the maximum value by
+ * which the extent count can increase is bound by the change in size of the
+ * on-disk field. No upgrade operation should ever be adding more than a few
+ * tens of extents, so if we get a really large value it is a sign of a code bug
+ * or corruption.
+ */
+#define XFS_MAX_EXTCNT_UPGRADE_NR \
+ min(XFS_MAX_EXTCNT_ATTR_FORK_LARGE - XFS_MAX_EXTCNT_ATTR_FORK_SMALL, \
+ XFS_MAX_EXTCNT_DATA_FORK_LARGE - XFS_MAX_EXTCNT_DATA_FORK_SMALL)
+
+/*
* Inode minimum and maximum sizes.
*/
#define XFS_DINODE_MIN_LOG 8
@@ -918,10 +1002,6 @@ enum xfs_dinode_fmt {
((w) == XFS_DATA_FORK ? \
(dip)->di_format : \
(dip)->di_aformat)
-#define XFS_DFORK_NEXTENTS(dip,w) \
- ((w) == XFS_DATA_FORK ? \
- be32_to_cpu((dip)->di_nextents) : \
- be16_to_cpu((dip)->di_anextents))
/*
* For block and character special files the 32bit dev_t is stored at the
@@ -988,15 +1068,17 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DIFLAG2_REFLINK_BIT 1 /* file's blocks may be shared */
#define XFS_DIFLAG2_COWEXTSIZE_BIT 2 /* copy on write extent size hint */
#define XFS_DIFLAG2_BIGTIME_BIT 3 /* big timestamps */
+#define XFS_DIFLAG2_NREXT64_BIT 4 /* large extent counters */
#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT)
#define XFS_DIFLAG2_REFLINK (1 << XFS_DIFLAG2_REFLINK_BIT)
#define XFS_DIFLAG2_COWEXTSIZE (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
#define XFS_DIFLAG2_BIGTIME (1 << XFS_DIFLAG2_BIGTIME_BIT)
+#define XFS_DIFLAG2_NREXT64 (1 << XFS_DIFLAG2_NREXT64_BIT)
#define XFS_DIFLAG2_ANY \
(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
- XFS_DIFLAG2_BIGTIME)
+ XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64)
static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
{
@@ -1004,6 +1086,13 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
(dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_BIGTIME));
}
+static inline bool xfs_dinode_has_large_extent_counts(
+ const struct xfs_dinode *dip)
+{
+ return dip->di_version >= 3 &&
+ (dip->di_flags2 & cpu_to_be64(XFS_DIFLAG2_NREXT64));
+}
+
/*
* Inode number format:
* low inopblog bits - offset in block
@@ -1085,10 +1174,10 @@ static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */
#define XFS_DQUOT_VERSION (uint8_t)0x01 /* latest version number */
-#define XFS_DQTYPE_USER 0x01 /* user dquot record */
-#define XFS_DQTYPE_PROJ 0x02 /* project dquot record */
-#define XFS_DQTYPE_GROUP 0x04 /* group dquot record */
-#define XFS_DQTYPE_BIGTIME 0x80 /* large expiry timestamps */
+#define XFS_DQTYPE_USER (1u << 0) /* user dquot record */
+#define XFS_DQTYPE_PROJ (1u << 1) /* project dquot record */
+#define XFS_DQTYPE_GROUP (1u << 2) /* group dquot record */
+#define XFS_DQTYPE_BIGTIME (1u << 7) /* large expiry timestamps */
/* bitmask to determine if this is a user/group/project dquot */
#define XFS_DQTYPE_REC_MASK (XFS_DQTYPE_USER | \
@@ -1596,6 +1685,8 @@ typedef struct xfs_bmdr_block {
#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
+#define XFS_MAX_BMBT_EXTLEN ((xfs_extlen_t)(BMBT_BLOCKCOUNT_MASK))
+
/*
* bmbt records have a file offset (block) field that is 54 bits wide, so this
* is the largest xfs_fileoff_t that we ever expect to see.
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 505533c43a92..1cfd5bc6520a 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_REFLINK (1 << 20) /* files can share blocks */
#define XFS_FSOP_GEOM_FLAGS_BIGTIME (1 << 21) /* 64-bit nsec timestamps */
#define XFS_FSOP_GEOM_FLAGS_INOBTCNT (1 << 22) /* inobt btree counter */
+#define XFS_FSOP_GEOM_FLAGS_NREXT64 (1 << 23) /* large extent counters */
/*
* Minimum and maximum sizes need for growth checks.
@@ -377,7 +378,7 @@ struct xfs_bulkstat {
uint32_t bs_extsize_blks; /* extent size hint, blocks */
uint32_t bs_nlink; /* number of links */
- uint32_t bs_extents; /* number of extents */
+ uint32_t bs_extents; /* 32-bit data fork extent counter */
uint32_t bs_aextents; /* attribute number of extents */
uint16_t bs_version; /* structure version */
uint16_t bs_forkoff; /* inode fork offset in bytes */
@@ -386,8 +387,9 @@ struct xfs_bulkstat {
uint16_t bs_checked; /* checked inode metadata */
uint16_t bs_mode; /* type and mode */
uint16_t bs_pad2; /* zeroed */
+ uint64_t bs_extents64; /* 64-bit data fork extent counter */
- uint64_t bs_pad[7]; /* zeroed */
+ uint64_t bs_pad[6]; /* zeroed */
};
#define XFS_BULKSTAT_VERSION_V1 (1)
@@ -459,17 +461,28 @@ struct xfs_bulk_ireq {
* Only return results from the specified @agno. If @ino is zero, start
* with the first inode of @agno.
*/
-#define XFS_BULK_IREQ_AGNO (1 << 0)
+#define XFS_BULK_IREQ_AGNO (1U << 0)
/*
* Return bulkstat information for a single inode, where @ino value is a
* special value, not a literal inode number. See the XFS_BULK_IREQ_SPECIAL_*
* values below. Not compatible with XFS_BULK_IREQ_AGNO.
*/
-#define XFS_BULK_IREQ_SPECIAL (1 << 1)
+#define XFS_BULK_IREQ_SPECIAL (1U << 1)
-#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
- XFS_BULK_IREQ_SPECIAL)
+/*
+ * Return data fork extent count via xfs_bulkstat->bs_extents64 field and assign
+ * 0 to xfs_bulkstat->bs_extents when the flag is set. Otherwise, use
+ * xfs_bulkstat->bs_extents for returning data fork extent count and set
+ * xfs_bulkstat->bs_extents64 to 0. In the second case, return -EOVERFLOW and
+ * assign 0 to xfs_bulkstat->bs_extents if data fork extent count is larger than
+ * XFS_MAX_EXTCNT_DATA_FORK_SMALL.
+ */
+#define XFS_BULK_IREQ_NREXT64 (1U << 2)
+
+#define XFS_BULK_IREQ_FLAGS_ALL (XFS_BULK_IREQ_AGNO | \
+ XFS_BULK_IREQ_SPECIAL | \
+ XFS_BULK_IREQ_NREXT64)
/* Operate on the root directory inode. */
#define XFS_BULK_IREQ_SPECIAL_ROOT (1)
@@ -699,34 +712,34 @@ struct xfs_scrub_metadata {
#define XFS_SCRUB_TYPE_NR 25
/* i: Repair this metadata. */
-#define XFS_SCRUB_IFLAG_REPAIR (1 << 0)
+#define XFS_SCRUB_IFLAG_REPAIR (1u << 0)
/* o: Metadata object needs repair. */
-#define XFS_SCRUB_OFLAG_CORRUPT (1 << 1)
+#define XFS_SCRUB_OFLAG_CORRUPT (1u << 1)
/*
* o: Metadata object could be optimized. It's not corrupt, but
* we could improve on it somehow.
*/
-#define XFS_SCRUB_OFLAG_PREEN (1 << 2)
+#define XFS_SCRUB_OFLAG_PREEN (1u << 2)
/* o: Cross-referencing failed. */
-#define XFS_SCRUB_OFLAG_XFAIL (1 << 3)
+#define XFS_SCRUB_OFLAG_XFAIL (1u << 3)
/* o: Metadata object disagrees with cross-referenced metadata. */
-#define XFS_SCRUB_OFLAG_XCORRUPT (1 << 4)
+#define XFS_SCRUB_OFLAG_XCORRUPT (1u << 4)
/* o: Scan was not complete. */
-#define XFS_SCRUB_OFLAG_INCOMPLETE (1 << 5)
+#define XFS_SCRUB_OFLAG_INCOMPLETE (1u << 5)
/* o: Metadata object looked funny but isn't corrupt. */
-#define XFS_SCRUB_OFLAG_WARNING (1 << 6)
+#define XFS_SCRUB_OFLAG_WARNING (1u << 6)
/*
* o: IFLAG_REPAIR was set but metadata object did not need fixing or
* optimization and has therefore not been altered.
*/
-#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7)
+#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1u << 7)
#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR)
#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index b418fe0c0679..bf2f4bc89193 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2414,9 +2414,9 @@ out_drop:
*/
void
xfs_ialloc_log_agi(
- xfs_trans_t *tp, /* transaction pointer */
- struct xfs_buf *bp, /* allocation group header buffer */
- int fields) /* bitmask of fields to log */
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ uint32_t fields)
{
int first; /* first byte number */
int last; /* last byte number */
@@ -2772,6 +2772,8 @@ xfs_ialloc_setup_geometry(
igeo->new_diflags2 = 0;
if (xfs_has_bigtime(mp))
igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
+ if (xfs_has_large_extent_counts(mp))
+ igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;
/* Compute inode btree geometry. */
igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 8b5c2b709022..a7705b6a1fd3 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -60,7 +60,7 @@ void
xfs_ialloc_log_agi(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *bp, /* allocation group header buffer */
- int fields); /* bitmask of fields to log */
+ uint32_t fields); /* bitmask of fields to log */
/*
* Read in the allocation group header (inode allocation section)
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index cae9708c8587..3b1b63f9d886 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -279,6 +279,25 @@ xfs_inode_to_disk_ts(
return ts;
}
+static inline void
+xfs_inode_to_disk_iext_counters(
+ struct xfs_inode *ip,
+ struct xfs_dinode *to)
+{
+ if (xfs_inode_has_large_extent_counts(ip)) {
+ to->di_big_nextents = cpu_to_be64(xfs_ifork_nextents(&ip->i_df));
+ to->di_big_anextents = cpu_to_be32(xfs_ifork_nextents(ip->i_afp));
+ /*
+ * We might be upgrading the inode to use larger extent counters
+ * than was previously used. Hence zero the unused field.
+ */
+ to->di_nrext64_pad = cpu_to_be16(0);
+ } else {
+ to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
+ to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp));
+ }
+}
+
void
xfs_inode_to_disk(
struct xfs_inode *ip,
@@ -296,7 +315,6 @@ xfs_inode_to_disk(
to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff);
to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16);
- memset(to->di_pad, 0, sizeof(to->di_pad));
to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime);
to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime);
@@ -307,8 +325,6 @@ xfs_inode_to_disk(
to->di_size = cpu_to_be64(ip->i_disk_size);
to->di_nblocks = cpu_to_be64(ip->i_nblocks);
to->di_extsize = cpu_to_be32(ip->i_extsize);
- to->di_nextents = cpu_to_be32(xfs_ifork_nextents(&ip->i_df));
- to->di_anextents = cpu_to_be16(xfs_ifork_nextents(ip->i_afp));
to->di_forkoff = ip->i_forkoff;
to->di_aformat = xfs_ifork_format(ip->i_afp);
to->di_flags = cpu_to_be16(ip->i_diflags);
@@ -323,11 +339,14 @@ xfs_inode_to_disk(
to->di_lsn = cpu_to_be64(lsn);
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
- to->di_flushiter = 0;
+ to->di_v3_pad = 0;
} else {
to->di_version = 2;
to->di_flushiter = cpu_to_be16(ip->i_flushiter);
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
+
+ xfs_inode_to_disk_iext_counters(ip, to);
}
static xfs_failaddr_t
@@ -336,20 +355,40 @@ xfs_dinode_verify_fork(
struct xfs_mount *mp,
int whichfork)
{
- uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork);
+ xfs_extnum_t di_nextents;
+ xfs_extnum_t max_extents;
+ mode_t mode = be16_to_cpu(dip->di_mode);
+ uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork);
+ uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork);
+
+ di_nextents = xfs_dfork_nextents(dip, whichfork);
+
+ /*
+ * For fork types that can contain local data, check that the fork
+ * format matches the size of local data contained within the fork.
+ *
+ * For all types, check that when the size says the fork should be in extent
+ * or btree format, the inode isn't claiming it is in local format.
+ */
+ if (whichfork == XFS_DATA_FORK) {
+ if (S_ISDIR(mode) || S_ISLNK(mode)) {
+ if (be64_to_cpu(dip->di_size) <= fork_size &&
+ fork_format != XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
- switch (XFS_DFORK_FORMAT(dip, whichfork)) {
+ if (be64_to_cpu(dip->di_size) > fork_size &&
+ fork_format == XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
+
+ switch (fork_format) {
case XFS_DINODE_FMT_LOCAL:
/*
- * no local regular files yet
+ * No local regular files yet.
*/
- if (whichfork == XFS_DATA_FORK) {
- if (S_ISREG(be16_to_cpu(dip->di_mode)))
- return __this_address;
- if (be64_to_cpu(dip->di_size) >
- XFS_DFORK_SIZE(dip, mp, whichfork))
- return __this_address;
- }
+ if (S_ISREG(mode) && whichfork == XFS_DATA_FORK)
+ return __this_address;
if (di_nextents)
return __this_address;
break;
@@ -358,12 +397,11 @@ xfs_dinode_verify_fork(
return __this_address;
break;
case XFS_DINODE_FMT_BTREE:
- if (whichfork == XFS_ATTR_FORK) {
- if (di_nextents > MAXAEXTNUM)
- return __this_address;
- } else if (di_nextents > MAXEXTNUM) {
+ max_extents = xfs_iext_max_nextents(
+ xfs_dinode_has_large_extent_counts(dip),
+ whichfork);
+ if (di_nextents > max_extents)
return __this_address;
- }
break;
default:
return __this_address;
@@ -396,6 +434,24 @@ xfs_dinode_verify_forkoff(
return NULL;
}
+static xfs_failaddr_t
+xfs_dinode_verify_nrext64(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip)) {
+ if (!xfs_has_large_extent_counts(mp))
+ return __this_address;
+ if (dip->di_nrext64_pad != 0)
+ return __this_address;
+ } else if (dip->di_version >= 3) {
+ if (dip->di_v3_pad != 0)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
xfs_failaddr_t
xfs_dinode_verify(
struct xfs_mount *mp,
@@ -407,6 +463,9 @@ xfs_dinode_verify(
uint16_t flags;
uint64_t flags2;
uint64_t di_size;
+ xfs_extnum_t nextents;
+ xfs_extnum_t naextents;
+ xfs_filblks_t nblocks;
if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
return __this_address;
@@ -437,10 +496,19 @@ xfs_dinode_verify(
if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
return __this_address;
+ fa = xfs_dinode_verify_nrext64(mp, dip);
+ if (fa)
+ return fa;
+
+ nextents = xfs_dfork_data_extents(dip);
+ naextents = xfs_dfork_attr_extents(dip);
+ nblocks = be64_to_cpu(dip->di_nblocks);
+
/* Fork checks carried over from xfs_iformat_fork */
- if (mode &&
- be32_to_cpu(dip->di_nextents) + be16_to_cpu(dip->di_anextents) >
- be64_to_cpu(dip->di_nblocks))
+ if (mode && nextents + naextents > nblocks)
+ return __this_address;
+
+ if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents)
return __this_address;
if (mode && XFS_DFORK_BOFF(dip) > mp->m_sb.sb_inodesize)
@@ -497,7 +565,7 @@ xfs_dinode_verify(
default:
return __this_address;
}
- if (dip->di_anextents)
+ if (naextents)
return __this_address;
}
@@ -639,7 +707,7 @@ xfs_inode_validate_extsize(
if (extsize_bytes % blocksize_bytes)
return __this_address;
- if (extsize > MAXEXTLEN)
+ if (extsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
if (!rt_flag && extsize > mp->m_sb.sb_agblocks / 2)
@@ -696,7 +764,7 @@ xfs_inode_validate_cowextsize(
if (cowextsize_bytes % mp->m_sb.sb_blocksize)
return __this_address;
- if (cowextsize > MAXEXTLEN)
+ if (cowextsize > XFS_MAX_BMBT_EXTLEN)
return __this_address;
if (cowextsize > mp->m_sb.sb_agblocks / 2)
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 9149f4f796fc..1a4cdf550f6d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -36,7 +36,7 @@ xfs_init_local_fork(
int64_t size)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
- int mem_size = size, real_size = 0;
+ int mem_size = size;
bool zero_terminate;
/*
@@ -50,8 +50,7 @@ xfs_init_local_fork(
mem_size++;
if (size) {
- real_size = roundup(mem_size, 4);
- ifp->if_u1.if_data = kmem_alloc(real_size, KM_NOFS);
+ ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS);
memcpy(ifp->if_u1.if_data, data, size);
if (zero_terminate)
ifp->if_u1.if_data[size] = '\0';
@@ -105,7 +104,7 @@ xfs_iformat_extents(
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
int state = xfs_bmap_fork_to_state(whichfork);
- int nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+ xfs_extnum_t nex = xfs_dfork_nextents(dip, whichfork);
int size = nex * sizeof(xfs_bmbt_rec_t);
struct xfs_iext_cursor icur;
struct xfs_bmbt_rec *dp;
@@ -117,8 +116,8 @@ xfs_iformat_extents(
* we just bail out rather than crash in kmem_alloc() or memcpy() below.
*/
if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
- xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
- (unsigned long long) ip->i_ino, nex);
+ xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).",
+ ip->i_ino, nex);
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
"xfs_iformat_extents(1)", dip, sizeof(*dip),
__this_address);
@@ -230,7 +229,7 @@ xfs_iformat_data_fork(
* depend on it.
*/
ip->i_df.if_format = dip->di_format;
- ip->i_df.if_nextents = be32_to_cpu(dip->di_nextents);
+ ip->i_df.if_nextents = xfs_dfork_data_extents(dip);
switch (inode->i_mode & S_IFMT) {
case S_IFIFO:
@@ -295,14 +294,14 @@ xfs_iformat_attr_fork(
struct xfs_inode *ip,
struct xfs_dinode *dip)
{
+ xfs_extnum_t naextents = xfs_dfork_attr_extents(dip);
int error = 0;
/*
* Initialize the extent count early, as the per-format routines may
* depend on it.
*/
- ip->i_afp = xfs_ifork_alloc(dip->di_aformat,
- be16_to_cpu(dip->di_anextents));
+ ip->i_afp = xfs_ifork_alloc(dip->di_aformat, naextents);
switch (ip->i_afp->if_format) {
case XFS_DINODE_FMT_LOCAL:
@@ -497,12 +496,7 @@ xfs_idata_realloc(
return;
}
- /*
- * For inline data, the underlying buffer must be a multiple of 4 bytes
- * in size so that it can be logged and stay on word boundaries.
- * We enforce that here.
- */
- ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, roundup(new_size, 4),
+ ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size,
GFP_NOFS | __GFP_NOFAIL);
ifp->if_bytes = new_size;
}
@@ -744,7 +738,8 @@ xfs_iext_count_may_overflow(
if (whichfork == XFS_COW_FORK)
return 0;
- max_exts = (whichfork == XFS_ATTR_FORK) ? MAXAEXTNUM : MAXEXTNUM;
+ max_exts = xfs_iext_max_nextents(xfs_inode_has_large_extent_counts(ip),
+ whichfork);
if (XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
max_exts = 10;
@@ -755,3 +750,27 @@ xfs_iext_count_may_overflow(
return 0;
}
+
+/*
+ * Upgrade this inode's extent counter fields to be able to handle a potential
+ * increase in the extent count by nr_to_add. Normally this is the same
+ * quantity that caused xfs_iext_count_may_overflow() to return -EFBIG.
+ */
+int
+xfs_iext_count_upgrade(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ uint nr_to_add)
+{
+ ASSERT(nr_to_add <= XFS_MAX_EXTCNT_UPGRADE_NR);
+
+ if (!xfs_has_large_extent_counts(ip->i_mount) ||
+ xfs_inode_has_large_extent_counts(ip) ||
+ XFS_TEST_ERROR(false, ip->i_mount, XFS_ERRTAG_REDUCE_MAX_IEXTENTS))
+ return -EFBIG;
+
+ ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 3d64a3acb0ed..4f68c1f20beb 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -21,9 +21,9 @@ struct xfs_ifork {
void *if_root; /* extent tree root */
char *if_data; /* inline file data */
} if_u1;
+ xfs_extnum_t if_nextents; /* # of extents in this fork */
short if_broot_bytes; /* bytes allocated for root */
int8_t if_format; /* format of this fork */
- xfs_extnum_t if_nextents; /* # of extents in this fork */
};
/*
@@ -40,19 +40,6 @@ struct xfs_ifork {
#define XFS_IEXT_PUNCH_HOLE_CNT (1)
/*
- * Directory entry addition can cause the following,
- * 1. Data block can be added/removed.
- * A new extent can cause extent count to increase by 1.
- * 2. Free disk block can be added/removed.
- * Same behaviour as described above for Data block.
- * 3. Dabtree blocks.
- * XFS_DA_NODE_MAXDEPTH blocks can be added. Each of these can be new
- * extents. Hence extent count can increase by XFS_DA_NODE_MAXDEPTH.
- */
-#define XFS_IEXT_DIR_MANIP_CNT(mp) \
- ((XFS_DA_NODE_MAXDEPTH + 1 + 1) * (mp)->m_dir_geo->fsbcount)
-
-/*
* Adding/removing an xattr can cause XFS_DA_NODE_MAXDEPTH extents to
* be added. One extra extent for dabtree in case a local attr is
* large enough to cause a double split. It can also cause extent
@@ -133,6 +120,65 @@ static inline int8_t xfs_ifork_format(struct xfs_ifork *ifp)
return ifp->if_format;
}
+static inline xfs_extnum_t xfs_iext_max_nextents(bool has_large_extent_counts,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ case XFS_COW_FORK:
+ if (has_large_extent_counts)
+ return XFS_MAX_EXTCNT_DATA_FORK_LARGE;
+ return XFS_MAX_EXTCNT_DATA_FORK_SMALL;
+
+ case XFS_ATTR_FORK:
+ if (has_large_extent_counts)
+ return XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
+ return XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
+
+ default:
+ ASSERT(0);
+ return 0;
+ }
+}
+
+static inline xfs_extnum_t
+xfs_dfork_data_extents(
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ return be64_to_cpu(dip->di_big_nextents);
+
+ return be32_to_cpu(dip->di_nextents);
+}
+
+static inline xfs_extnum_t
+xfs_dfork_attr_extents(
+ struct xfs_dinode *dip)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ return be32_to_cpu(dip->di_big_anextents);
+
+ return be16_to_cpu(dip->di_anextents);
+}
+
+static inline xfs_extnum_t
+xfs_dfork_nextents(
+ struct xfs_dinode *dip,
+ int whichfork)
+{
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ return xfs_dfork_data_extents(dip);
+ case XFS_ATTR_FORK:
+ return xfs_dfork_attr_extents(dip);
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ return 0;
+}
+
struct xfs_ifork *xfs_ifork_alloc(enum xfs_dinode_fmt format,
xfs_extnum_t nextents);
struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
@@ -229,6 +275,8 @@ int xfs_ifork_verify_local_data(struct xfs_inode *ip);
int xfs_ifork_verify_local_attr(struct xfs_inode *ip);
int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
int nr_to_add);
+int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
+ uint nr_to_add);
/* returns true if the fork has extents but they are not read in yet. */
static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index b322db523d65..f7edd1ecf6d9 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -69,7 +69,6 @@ static inline uint xlog_get_cycle(char *ptr)
/* Log Clients */
#define XFS_TRANSACTION 0x69
-#define XFS_VOLUME 0x2
#define XFS_LOG 0xaa
#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */
@@ -114,7 +113,12 @@ struct xfs_unmount_log_format {
#define XLOG_REG_TYPE_CUD_FORMAT 24
#define XLOG_REG_TYPE_BUI_FORMAT 25
#define XLOG_REG_TYPE_BUD_FORMAT 26
-#define XLOG_REG_TYPE_MAX 26
+#define XLOG_REG_TYPE_ATTRI_FORMAT 27
+#define XLOG_REG_TYPE_ATTRD_FORMAT 28
+#define XLOG_REG_TYPE_ATTR_NAME 29
+#define XLOG_REG_TYPE_ATTR_VALUE 30
+#define XLOG_REG_TYPE_MAX 30
+
/*
* Flags to log operation header
@@ -237,6 +241,8 @@ typedef struct xfs_trans_header {
#define XFS_LI_CUD 0x1243
#define XFS_LI_BUI 0x1244 /* bmbt update intent */
#define XFS_LI_BUD 0x1245
+#define XFS_LI_ATTRI 0x1246 /* attr set/remove intent*/
+#define XFS_LI_ATTRD 0x1247 /* attr set/remove done */
#define XFS_LI_TYPE_DESC \
{ XFS_LI_EFI, "XFS_LI_EFI" }, \
@@ -252,7 +258,9 @@ typedef struct xfs_trans_header {
{ XFS_LI_CUI, "XFS_LI_CUI" }, \
{ XFS_LI_CUD, "XFS_LI_CUD" }, \
{ XFS_LI_BUI, "XFS_LI_BUI" }, \
- { XFS_LI_BUD, "XFS_LI_BUD" }
+ { XFS_LI_BUD, "XFS_LI_BUD" }, \
+ { XFS_LI_ATTRI, "XFS_LI_ATTRI" }, \
+ { XFS_LI_ATTRD, "XFS_LI_ATTRD" }
/*
* Inode Log Item Format definitions.
@@ -388,16 +396,41 @@ struct xfs_log_dinode {
uint32_t di_nlink; /* number of links to file */
uint16_t di_projid_lo; /* lower part of owner's project id */
uint16_t di_projid_hi; /* higher part of owner's project id */
- uint8_t di_pad[6]; /* unused, zeroed space */
- uint16_t di_flushiter; /* incremented on flush */
+ union {
+ /* Number of data fork extents if NREXT64 is set */
+ uint64_t di_big_nextents;
+
+ /* Padding for V3 inodes without NREXT64 set. */
+ uint64_t di_v3_pad;
+
+ /* Padding and inode flush counter for V2 inodes. */
+ struct {
+ uint8_t di_v2_pad[6]; /* V2 inode zeroed space */
+ uint16_t di_flushiter; /* V2 inode incremented on flush */
+ };
+ };
xfs_log_timestamp_t di_atime; /* time last accessed */
xfs_log_timestamp_t di_mtime; /* time last modified */
xfs_log_timestamp_t di_ctime; /* time created/inode modified */
xfs_fsize_t di_size; /* number of bytes in file */
xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
- xfs_extnum_t di_nextents; /* number of extents in data fork */
- xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/
+ union {
+ /*
+ * For V2 inodes and V3 inodes without NREXT64 set, this
+ * is the number of data and attr fork extents.
+ */
+ struct {
+ uint32_t di_nextents;
+ uint16_t di_anextents;
+ } __packed;
+
+ /* Number of attr fork extents if NREXT64 is set. */
+ struct {
+ uint32_t di_big_anextents;
+ uint16_t di_nrext64_pad;
+ } __packed;
+ } __packed;
uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */
int8_t di_aformat; /* format of attr fork's data */
uint32_t di_dmevmask; /* DMIG event mask */
@@ -869,4 +902,36 @@ struct xfs_icreate_log {
__be32 icl_gen; /* inode generation number to use */
};
+/*
+ * Flags for deferred attribute operations.
+ * Upper bits are flags, lower byte is type code
+ */
+#define XFS_ATTR_OP_FLAGS_SET 1 /* Set the attribute */
+#define XFS_ATTR_OP_FLAGS_REMOVE 2 /* Remove the attribute */
+#define XFS_ATTR_OP_FLAGS_REPLACE 3 /* Replace the attribute */
+#define XFS_ATTR_OP_FLAGS_TYPE_MASK 0xFF /* Flags type mask */
+
+/*
+ * This is the structure used to lay out an attr log item in the
+ * log.
+ */
+struct xfs_attri_log_format {
+ uint16_t alfi_type; /* attri log item type */
+ uint16_t alfi_size; /* size of this item */
+ uint32_t __pad; /* pad to 64 bit aligned */
+ uint64_t alfi_id; /* attri identifier */
+ uint64_t alfi_ino; /* the inode for this attr operation */
+ uint32_t alfi_op_flags; /* marks the op as a set or remove */
+ uint32_t alfi_name_len; /* attr name length */
+ uint32_t alfi_value_len; /* attr value length */
+ uint32_t alfi_attr_flags;/* attr flags */
+};
+
+struct xfs_attrd_log_format {
+ uint16_t alfd_type; /* attrd log item type */
+ uint16_t alfd_size; /* size of this item */
+ uint32_t __pad; /* pad to 64 bit aligned */
+ uint64_t alfd_alf_id; /* id of corresponding attri */
+};
+
#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index ff69a0000817..32e216255cb0 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -72,6 +72,8 @@ extern const struct xlog_recover_item_ops xlog_rui_item_ops;
extern const struct xlog_recover_item_ops xlog_rud_item_ops;
extern const struct xlog_recover_item_ops xlog_cui_item_ops;
extern const struct xlog_recover_item_ops xlog_cud_item_ops;
+extern const struct xlog_recover_item_ops xlog_attri_item_ops;
+extern const struct xlog_recover_item_ops xlog_attrd_item_ops;
/*
* Macros, structures, prototypes for internal log manager use.
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index 67798ff5e14e..9975b93a7412 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -14,6 +14,7 @@
#include "xfs_trans_space.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
+#include "xfs_trace.h"
/*
* Calculate the maximum length in bytes that would be required for a local
@@ -37,6 +38,65 @@ xfs_log_calc_max_attrsetm_res(
}
/*
+ * Compute an alternate set of log reservation sizes for use exclusively with
+ * minimum log size calculations.
+ */
+static void
+xfs_log_calc_trans_resv_for_minlogblocks(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resv)
+{
+ unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
+
+ /*
+ * In the early days of rmap+reflink, we always set the rmap maxlevels
+ * to 9 even if the AG was small enough that it would never grow to
+ * that height. Transaction reservation sizes influence the minimum
+ * log size calculation, which influences the size of the log that mkfs
+ * creates. Use the old value here to ensure that newly formatted
+ * small filesystems will mount on older kernels.
+ */
+ if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
+ mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+
+ xfs_trans_resv_calc(mp, resv);
+
+ if (xfs_has_reflink(mp)) {
+ /*
+ * In the early days of reflink, typical log operation counts
+ * were greatly overestimated.
+ */
+ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ resv->tr_itruncate.tr_logcount =
+ XFS_ITRUNCATE_LOG_COUNT_REFLINK;
+ resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+ } else if (xfs_has_rmapbt(mp)) {
+ /*
+ * In the early days of non-reflink rmap, the impact of rmapbt
+ * updates on log counts was not taken into account at all.
+ */
+ resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ }
+
+ /*
+ * In the early days of reflink, we did not use deferred refcount
+ * update log items, so log reservations must be recomputed using the
+ * old calculations.
+ */
+ resv->tr_write.tr_logres =
+ xfs_calc_write_reservation_minlogsize(mp);
+ resv->tr_itruncate.tr_logres =
+ xfs_calc_itruncate_reservation_minlogsize(mp);
+ resv->tr_qm_dqalloc.tr_logres =
+ xfs_calc_qm_dqalloc_reservation_minlogsize(mp);
+
+ /* Put everything back the way it was. This goes at the end. */
+ mp->m_rmap_maxlevels = rmap_maxlevels;
+}
+
+/*
* Iterate over the log space reservation table to figure out and return
* the maximum one in terms of the pre-calculated values which were done
* at mount time.
@@ -46,19 +106,25 @@ xfs_log_get_max_trans_res(
struct xfs_mount *mp,
struct xfs_trans_res *max_resp)
{
+ struct xfs_trans_resv resv = {};
struct xfs_trans_res *resp;
struct xfs_trans_res *end_resp;
+ unsigned int i;
int log_space = 0;
int attr_space;
attr_space = xfs_log_calc_max_attrsetm_res(mp);
- resp = (struct xfs_trans_res *)M_RES(mp);
- end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
- for (; resp < end_resp; resp++) {
+ xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv);
+
+ resp = (struct xfs_trans_res *)&resv;
+ end_resp = (struct xfs_trans_res *)(&resv + 1);
+ for (i = 0; resp < end_resp; i++, resp++) {
int tmp = resp->tr_logcount > 1 ?
resp->tr_logres * resp->tr_logcount :
resp->tr_logres;
+
+ trace_xfs_trans_resv_calc_minlogsize(mp, i, resp);
if (log_space < tmp) {
log_space = tmp;
*max_resp = *resp; /* struct copy */
@@ -66,9 +132,10 @@ xfs_log_get_max_trans_res(
}
if (attr_space > log_space) {
- *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */
+ *max_resp = resv.tr_attrsetm; /* struct copy */
max_resp->tr_logres = attr_space;
}
+ trace_xfs_log_get_max_trans_res(mp, max_resp);
}
/*
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index a02c5062f9b2..cb035da3f990 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -16,7 +16,6 @@
* and quota-limits. This is a waste in the common case, but hey ...
*/
typedef uint64_t xfs_qcnt_t;
-typedef uint16_t xfs_qwarncnt_t;
typedef uint8_t xfs_dqtype_t;
@@ -29,8 +28,8 @@ typedef uint8_t xfs_dqtype_t;
/*
* flags for q_flags field in the dquot.
*/
-#define XFS_DQFLAG_DIRTY (1 << 0) /* dquot is dirty */
-#define XFS_DQFLAG_FREEING (1 << 1) /* dquot is being torn down */
+#define XFS_DQFLAG_DIRTY (1u << 0) /* dquot is dirty */
+#define XFS_DQFLAG_FREEING (1u << 1) /* dquot is being torn down */
#define XFS_DQFLAG_STRINGS \
{ XFS_DQFLAG_DIRTY, "DIRTY" }, \
@@ -73,29 +72,45 @@ typedef uint8_t xfs_dqtype_t;
* to a single function. None of these XFS_QMOPT_* flags are meant to have
* persistent values (ie. their values can and will change between versions)
*/
-#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
-#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
-#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
-#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
-#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
+#define XFS_QMOPT_UQUOTA (1u << 0) /* user dquot requested */
+#define XFS_QMOPT_GQUOTA (1u << 1) /* group dquot requested */
+#define XFS_QMOPT_PQUOTA (1u << 2) /* project dquot requested */
+#define XFS_QMOPT_FORCE_RES (1u << 3) /* ignore quota limits */
+#define XFS_QMOPT_SBVERSION (1u << 4) /* change superblock version num */
/*
* flags to xfs_trans_mod_dquot to indicate which field needs to be
* modified.
*/
-#define XFS_QMOPT_RES_REGBLKS 0x0010000
-#define XFS_QMOPT_RES_RTBLKS 0x0020000
-#define XFS_QMOPT_BCOUNT 0x0040000
-#define XFS_QMOPT_ICOUNT 0x0080000
-#define XFS_QMOPT_RTBCOUNT 0x0100000
-#define XFS_QMOPT_DELBCOUNT 0x0200000
-#define XFS_QMOPT_DELRTBCOUNT 0x0400000
-#define XFS_QMOPT_RES_INOS 0x0800000
+#define XFS_QMOPT_RES_REGBLKS (1u << 7)
+#define XFS_QMOPT_RES_RTBLKS (1u << 8)
+#define XFS_QMOPT_BCOUNT (1u << 9)
+#define XFS_QMOPT_ICOUNT (1u << 10)
+#define XFS_QMOPT_RTBCOUNT (1u << 11)
+#define XFS_QMOPT_DELBCOUNT (1u << 12)
+#define XFS_QMOPT_DELRTBCOUNT (1u << 13)
+#define XFS_QMOPT_RES_INOS (1u << 14)
/*
* flags for dqalloc.
*/
-#define XFS_QMOPT_INHERIT 0x1000000
+#define XFS_QMOPT_INHERIT (1u << 31)
+
+#define XFS_QMOPT_FLAGS \
+ { XFS_QMOPT_UQUOTA, "UQUOTA" }, \
+ { XFS_QMOPT_PQUOTA, "PQUOTA" }, \
+ { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \
+ { XFS_QMOPT_SBVERSION, "SBVERSION" }, \
+ { XFS_QMOPT_GQUOTA, "GQUOTA" }, \
+ { XFS_QMOPT_INHERIT, "INHERIT" }, \
+ { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \
+ { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \
+ { XFS_QMOPT_BCOUNT, "BCOUNT" }, \
+ { XFS_QMOPT_ICOUNT, "ICOUNT" }, \
+ { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \
+ { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \
+ { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \
+ { XFS_QMOPT_RES_INOS, "RES_INOS" }
/*
* flags to xfs_trans_mod_dquot.
@@ -114,6 +129,7 @@ typedef uint8_t xfs_dqtype_t;
(XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+
extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
struct xfs_disk_dquot *ddq, xfs_dqid_t id);
extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 327ba25e9e17..97e9e6020596 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -886,8 +886,13 @@ xfs_refcount_still_have_space(
{
unsigned long overhead;
- overhead = cur->bc_ag.refc.shape_changes *
- xfs_allocfree_log_count(cur->bc_mp, 1);
+ /*
+ * Worst case estimate: full splits of the free space and rmap btrees
+ * to handle each of the shape changes to the refcount btree.
+ */
+ overhead = xfs_allocfree_block_count(cur->bc_mp,
+ cur->bc_ag.refc.shape_changes);
+ overhead += cur->bc_mp->m_refc_maxlevels;
overhead *= cur->bc_mp->m_sb.sb_blocksize;
/*
@@ -960,6 +965,7 @@ xfs_refcount_adjust_extents(
* Either cover the hole (increment) or
* delete the range (decrement).
*/
+ cur->bc_ag.refc.nr_ops++;
if (tmp.rc_refcount) {
error = xfs_refcount_insert(cur, &tmp,
&found_tmp);
@@ -970,7 +976,6 @@ xfs_refcount_adjust_extents(
error = -EFSCORRUPTED;
goto out_error;
}
- cur->bc_ag.refc.nr_ops++;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
@@ -1001,11 +1006,11 @@ xfs_refcount_adjust_extents(
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, &ext);
+ cur->bc_ag.refc.nr_ops++;
if (ext.rc_refcount > 1) {
error = xfs_refcount_update(cur, &ext);
if (error)
goto out_error;
- cur->bc_ag.refc.nr_ops++;
} else if (ext.rc_refcount == 1) {
error = xfs_refcount_delete(cur, &found_rec);
if (error)
@@ -1014,7 +1019,6 @@ xfs_refcount_adjust_extents(
error = -EFSCORRUPTED;
goto out_error;
}
- cur->bc_ag.refc.nr_ops++;
goto advloop;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 9eb01edbd89d..e8b322de7f3d 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -67,14 +67,17 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
* log (plus any key updates) so we'll conservatively assume 32 bytes
* per record. We must also leave space for btree splits on both ends
* of the range and space for the CUD and a new CUI.
+ *
+ * Each EFI that we attach to the transaction is assumed to consume ~32 bytes.
+ * This is a low estimate for an EFI tracking a single extent (16 bytes for the
+ * EFI header, 16 for the extent, and 12 for the xlog op header), but the
+ * estimate is acceptable if there's more than one extent being freed.
+ * In the worst case of freeing every other block during a refcount decrease
+ * operation, we amortize the space used for one EFI log item across 16
+ * extents.
*/
#define XFS_REFCOUNT_ITEM_OVERHEAD 32
-static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
-{
- return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
-}
-
extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
union xfs_btree_rec;
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index cd322174dbff..2845019d31da 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -34,18 +34,32 @@ int
xfs_rmap_lookup_le(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
- xfs_extlen_t len,
uint64_t owner,
uint64_t offset,
unsigned int flags,
+ struct xfs_rmap_irec *irec,
int *stat)
{
+ int get_stat = 0;
+ int error;
+
cur->bc_rec.r.rm_startblock = bno;
- cur->bc_rec.r.rm_blockcount = len;
+ cur->bc_rec.r.rm_blockcount = 0;
cur->bc_rec.r.rm_owner = owner;
cur->bc_rec.r.rm_offset = offset;
cur->bc_rec.r.rm_flags = flags;
- return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+ if (error || !(*stat) || !irec)
+ return error;
+
+ error = xfs_rmap_get_rec(cur, irec, &get_stat);
+ if (error)
+ return error;
+ if (!get_stat)
+ return -EFSCORRUPTED;
+
+ return 0;
}
/*
@@ -251,7 +265,6 @@ out_bad_rec:
struct xfs_find_left_neighbor_info {
struct xfs_rmap_irec high;
struct xfs_rmap_irec *irec;
- int *stat;
};
/* For each rmap given, figure out if it matches the key we want. */
@@ -276,7 +289,6 @@ xfs_rmap_find_left_neighbor_helper(
return 0;
*info->irec = *rec;
- *info->stat = 1;
return -ECANCELED;
}
@@ -285,7 +297,7 @@ xfs_rmap_find_left_neighbor_helper(
* return a match with the same owner and adjacent physical and logical
* block ranges.
*/
-int
+STATIC int
xfs_rmap_find_left_neighbor(
struct xfs_btree_cur *cur,
xfs_agblock_t bno,
@@ -296,6 +308,7 @@ xfs_rmap_find_left_neighbor(
int *stat)
{
struct xfs_find_left_neighbor_info info;
+ int found = 0;
int error;
*stat = 0;
@@ -313,21 +326,44 @@ xfs_rmap_find_left_neighbor(
info.high.rm_flags = flags;
info.high.rm_blockcount = 0;
info.irec = irec;
- info.stat = stat;
trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
- error = xfs_rmap_query_range(cur, &info.high, &info.high,
- xfs_rmap_find_left_neighbor_helper, &info);
- if (error == -ECANCELED)
- error = 0;
- if (*stat)
- trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, irec->rm_startblock,
- irec->rm_blockcount, irec->rm_owner,
- irec->rm_offset, irec->rm_flags);
- return error;
+ /*
+ * Historically, we always used the range query to walk every reverse
+ * mapping that could possibly overlap the key that the caller asked
+ * for, and filter out the ones that don't. That is very slow when
+ * there are a lot of records.
+ *
+ * However, there are two scenarios where the classic btree search can
+ * produce correct results -- if the index contains a record that is an
+ * exact match for the lookup key; and if there are no other records
+ * between the record we want and the key we supplied.
+ *
+ * As an optimization, try a non-overlapped lookup first. This makes
+ * extent conversion and remap operations run a bit faster if the
+ * physical extents aren't being shared. If we don't find what we
+ * want, we fall back to the overlapped query.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+ &found);
+ if (error)
+ return error;
+ if (found)
+ error = xfs_rmap_find_left_neighbor_helper(cur, irec, &info);
+ if (!error)
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_find_left_neighbor_helper, &info);
+ if (error != -ECANCELED)
+ return error;
+
+ *stat = 1;
+ trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+ irec->rm_flags);
+ return 0;
}
/* For each rmap given, figure out if it matches the key we want. */
@@ -353,7 +389,6 @@ xfs_rmap_lookup_le_range_helper(
return 0;
*info->irec = *rec;
- *info->stat = 1;
return -ECANCELED;
}
@@ -374,6 +409,7 @@ xfs_rmap_lookup_le_range(
int *stat)
{
struct xfs_find_left_neighbor_info info;
+ int found = 0;
int error;
info.high.rm_startblock = bno;
@@ -386,20 +422,44 @@ xfs_rmap_lookup_le_range(
info.high.rm_blockcount = 0;
*stat = 0;
info.irec = irec;
- info.stat = stat;
- trace_xfs_rmap_lookup_le_range(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, bno, 0, owner, offset, flags);
- error = xfs_rmap_query_range(cur, &info.high, &info.high,
- xfs_rmap_lookup_le_range_helper, &info);
- if (error == -ECANCELED)
- error = 0;
- if (*stat)
- trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
- cur->bc_ag.pag->pag_agno, irec->rm_startblock,
- irec->rm_blockcount, irec->rm_owner,
- irec->rm_offset, irec->rm_flags);
- return error;
+ trace_xfs_rmap_lookup_le_range(cur->bc_mp, cur->bc_ag.pag->pag_agno,
+ bno, 0, owner, offset, flags);
+
+ /*
+ * Historically, we always used the range query to walk every reverse
+ * mapping that could possibly overlap the key that the caller asked
+ * for, and filter out the ones that don't. That is very slow when
+ * there are a lot of records.
+ *
+ * However, there are two scenarios where the classic btree search can
+ * produce correct results -- if the index contains a record that is an
+ * exact match for the lookup key; and if there are no other records
+ * between the record we want and the key we supplied.
+ *
+ * As an optimization, try a non-overlapped lookup first. This makes
+ * scrub run much faster on most filesystems because bmbt records are
+ * usually an exact match for rmap records. If we don't find what we
+ * want, we fall back to the overlapped query.
+ */
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, irec,
+ &found);
+ if (error)
+ return error;
+ if (found)
+ error = xfs_rmap_lookup_le_range_helper(cur, irec, &info);
+ if (!error)
+ error = xfs_rmap_query_range(cur, &info.high, &info.high,
+ xfs_rmap_lookup_le_range_helper, &info);
+ if (error != -ECANCELED)
+ return error;
+
+ *stat = 1;
+ trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+ cur->bc_ag.pag->pag_agno, irec->rm_startblock,
+ irec->rm_blockcount, irec->rm_owner, irec->rm_offset,
+ irec->rm_flags);
+ return 0;
}
/*
@@ -510,7 +570,7 @@ xfs_rmap_unmap(
* for the AG headers at rm_startblock == 0 created by mkfs/growfs that
* will not ever be removed from the tree.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec, &i);
if (error)
goto out_error;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -518,13 +578,6 @@ xfs_rmap_unmap(
goto out_error;
}
- error = xfs_rmap_get_rec(cur, &ltrec, &i);
- if (error)
- goto out_error;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto out_error;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
@@ -786,18 +839,11 @@ xfs_rmap_map(
* record for our insertion point. This will also give us the record for
* start block contiguity tests.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &ltrec,
&have_lt);
if (error)
goto out_error;
if (have_lt) {
- error = xfs_rmap_get_rec(cur, &ltrec, &have_lt);
- if (error)
- goto out_error;
- if (XFS_IS_CORRUPT(mp, have_lt != 1)) {
- error = -EFSCORRUPTED;
- goto out_error;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_ag.pag->pag_agno, ltrec.rm_startblock,
ltrec.rm_blockcount, ltrec.rm_owner,
@@ -1022,7 +1068,7 @@ xfs_rmap_convert(
* record for our insertion point. This will also give us the record for
* start block contiguity tests.
*/
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, &PREV, &i);
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -1030,13 +1076,6 @@ xfs_rmap_convert(
goto done;
}
- error = xfs_rmap_get_rec(cur, &PREV, &i);
- if (error)
- goto done;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
- }
trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
cur->bc_ag.pag->pag_agno, PREV.rm_startblock,
PREV.rm_blockcount, PREV.rm_owner,
@@ -1140,7 +1179,7 @@ xfs_rmap_convert(
_RET_IP_);
/* reset the cursor back to PREV */
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, oldext, &i);
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, oldext, NULL, &i);
if (error)
goto done;
if (XFS_IS_CORRUPT(mp, i != 1)) {
@@ -2677,7 +2716,7 @@ xfs_rmap_record_exists(
ASSERT(XFS_RMAP_NON_INODE_OWNER(owner) ||
(flags & XFS_RMAP_BMBT_BLOCK));
- error = xfs_rmap_lookup_le(cur, bno, len, owner, offset, flags,
+ error = xfs_rmap_lookup_le(cur, bno, owner, offset, flags, &irec,
&has_record);
if (error)
return error;
@@ -2686,14 +2725,6 @@ xfs_rmap_record_exists(
return 0;
}
- error = xfs_rmap_get_rec(cur, &irec, &has_record);
- if (error)
- return error;
- if (!has_record) {
- *has_rmap = false;
- return 0;
- }
-
*has_rmap = (irec.rm_owner == owner && irec.rm_startblock <= bno &&
irec.rm_startblock + irec.rm_blockcount >= bno + len);
return 0;
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index b718ebeda372..54741a591a17 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -122,8 +122,8 @@ int xfs_rmap_free(struct xfs_trans *tp, struct xfs_buf *agbp,
const struct xfs_owner_info *oinfo);
int xfs_rmap_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, uint64_t owner, uint64_t offset,
- unsigned int flags, int *stat);
+ uint64_t owner, uint64_t offset, unsigned int flags,
+ struct xfs_rmap_irec *irec, int *stat);
int xfs_rmap_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, uint64_t owner, uint64_t offset,
unsigned int flags, int *stat);
@@ -184,9 +184,6 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
xfs_fsblock_t startblock, xfs_filblks_t blockcount,
xfs_exntst_t state, struct xfs_btree_cur **pcur);
-int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- uint64_t owner, uint64_t offset, unsigned int flags,
- struct xfs_rmap_irec *irec, int *stat);
int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
uint64_t owner, uint64_t offset, unsigned int flags,
struct xfs_rmap_irec *irec, int *stat);
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 5740ba664867..fa180ab66b73 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1008,6 +1008,7 @@ xfs_rtfree_extent(
/* Find all the free records within a given range. */
int
xfs_rtalloc_query_range(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *low_rec,
const struct xfs_rtalloc_rec *high_rec,
@@ -1015,7 +1016,6 @@ xfs_rtalloc_query_range(
void *priv)
{
struct xfs_rtalloc_rec rec;
- struct xfs_mount *mp = tp->t_mountp;
xfs_rtblock_t rtstart;
xfs_rtblock_t rtend;
xfs_rtblock_t high_key;
@@ -1048,7 +1048,7 @@ xfs_rtalloc_query_range(
rec.ar_startext = rtstart;
rec.ar_extcount = rtend - rtstart + 1;
- error = fn(tp, &rec, priv);
+ error = fn(mp, tp, &rec, priv);
if (error)
break;
}
@@ -1062,6 +1062,7 @@ xfs_rtalloc_query_range(
/* Find all the free records. */
int
xfs_rtalloc_query_all(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv)
@@ -1069,10 +1070,10 @@ xfs_rtalloc_query_all(
struct xfs_rtalloc_rec keys[2];
keys[0].ar_startext = 0;
- keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1;
+ keys[1].ar_startext = mp->m_sb.sb_rextents - 1;
keys[0].ar_extcount = keys[1].ar_extcount = 0;
- return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);
+ return xfs_rtalloc_query_range(mp, tp, &keys[0], &keys[1], fn, priv);
}
/* Is the given extent all free? */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index f4e84aa1d50a..a20cade590e9 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -31,15 +31,66 @@
*/
/*
+ * Check that all the V4 feature bits that the V5 filesystem format requires are
+ * correctly set.
+ */
+static bool
+xfs_sb_validate_v5_features(
+ struct xfs_sb *sbp)
+{
+ /* We must not have any unknown V4 feature bits set */
+ if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS)
+ return false;
+
+ /*
+ * The CRC bit is considered an invalid V4 flag, so we have to add it
+ * manually to the OKBITS mask.
+ */
+ if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS |
+ XFS_SB_VERSION2_CRCBIT))
+ return false;
+
+ /* Now check all the required V4 feature flags are set. */
+
+#define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \
+ XFS_SB_VERSION_ALIGNBIT | \
+ XFS_SB_VERSION_LOGV2BIT | \
+ XFS_SB_VERSION_EXTFLGBIT | \
+ XFS_SB_VERSION_DIRV2BIT | \
+ XFS_SB_VERSION_MOREBITSBIT)
+
+#define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
+ XFS_SB_VERSION2_ATTR2BIT | \
+ XFS_SB_VERSION2_PROJID32BIT | \
+ XFS_SB_VERSION2_CRCBIT)
+
+ if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS)
+ return false;
+ if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS)
+ return false;
+ return true;
+}
+
+/*
* We support all XFS versions newer than a v4 superblock with V2 directories.
*/
bool
xfs_sb_good_version(
struct xfs_sb *sbp)
{
- /* all v5 filesystems are supported */
+ /*
+ * All v5 filesystems are supported, but we must check that all the
+ * required v4 feature flags are enabled correctly as the code checks
+ * those flags and not for v5 support.
+ */
if (xfs_sb_is_v5(sbp))
- return true;
+ return xfs_sb_validate_v5_features(sbp);
+
+ /* We must not have any unknown v4 feature bits set */
+ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+ ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+ (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+ return false;
/* versions prior to v4 are not supported */
if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4)
@@ -51,12 +102,6 @@ xfs_sb_good_version(
if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
return false;
- /* And must not have any unknown v4 feature bits set */
- if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
- ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
- (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
- return false;
-
/* It's a supported v4 filesystem */
return true;
}
@@ -70,6 +115,8 @@ xfs_sb_version_to_features(
/* optional V4 features */
if (sbp->sb_rblocks > 0)
features |= XFS_FEAT_REALTIME;
+ if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT)
+ features |= XFS_FEAT_NLINK;
if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT)
features |= XFS_FEAT_ATTR;
if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT)
@@ -124,6 +171,9 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_BIGTIME;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR)
features |= XFS_FEAT_NEEDSREPAIR;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64)
+ features |= XFS_FEAT_NREXT64;
+
return features;
}
@@ -262,12 +312,15 @@ xfs_validate_sb_common(
bool has_dalign;
if (!xfs_verify_magic(bp, dsb->sb_magicnum)) {
- xfs_warn(mp, "bad magic number");
+ xfs_warn(mp,
+"Superblock has bad magic number 0x%x. Not an XFS filesystem?",
+ be32_to_cpu(dsb->sb_magicnum));
return -EWRONGFS;
}
if (!xfs_sb_good_version(sbp)) {
- xfs_warn(mp, "bad version");
+ xfs_warn(mp,
+"Superblock has unknown features enabled or corrupted feature masks.");
return -EWRONGFS;
}
@@ -911,6 +964,11 @@ xfs_log_sb(
* reservations that have been taken out percpu counters. If we have an
* unclean shutdown, this will be corrected by log recovery rebuilding
* the counters from the AGF block counts.
+ *
+ * Do not update sb_frextents here because it is not part of the lazy
+ * sb counters, despite having a percpu counter. It is always kept
+ * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas()
+ * and hence we don't need to update it here.
*/
if (xfs_has_lazysbcount(mp)) {
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
@@ -1135,6 +1193,8 @@ xfs_fs_geometry(
} else {
geo->logsectsize = BBSIZE;
}
+ if (xfs_has_large_extent_counts(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 25c4cab58851..c4381388c0c1 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -54,13 +54,23 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
/*
* Values for t_flags.
*/
-#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
-#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
-#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
-#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
-#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
-#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */
-#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */
+/* Transaction needs to be logged */
+#define XFS_TRANS_DIRTY (1u << 0)
+/* Superblock is dirty and needs to be logged */
+#define XFS_TRANS_SB_DIRTY (1u << 1)
+/* Transaction took a permanent log reservation */
+#define XFS_TRANS_PERM_LOG_RES (1u << 2)
+/* Synchronous transaction commit needed */
+#define XFS_TRANS_SYNC (1u << 3)
+/* Transaction can use reserve block pool */
+#define XFS_TRANS_RESERVE (1u << 4)
+/* Transaction should avoid VFS level superblock write accounting */
+#define XFS_TRANS_NO_WRITECOUNT (1u << 5)
+/* Transaction has freed blocks returned to its reservation */
+#define XFS_TRANS_RES_FDBLKS (1u << 6)
+/* Transaction contains an intent done log item */
+#define XFS_TRANS_HAS_INTENT_DONE (1u << 7)
+
/*
* LOWMODE is used by the allocator to activate the lowspace algorithm - when
* free space is running low the extent allocator may choose to allocate an
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 6f83d9b306ee..e9913c2c5a24 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -56,15 +56,14 @@ xfs_calc_buf_res(
* Per-extent log reservation for the btree changes involved in freeing or
* allocating an extent. In classic XFS there were two trees that will be
* modified (bnobt + cntbt). With rmap enabled, there are three trees
- * (rmapbt). With reflink, there are four trees (refcountbt). The number of
- * blocks reserved is based on the formula:
+ * (rmapbt). The number of blocks reserved is based on the formula:
*
* num trees * ((2 blocks/level * max depth) - 1)
*
* Keep in mind that max depth is calculated separately for each type of tree.
*/
uint
-xfs_allocfree_log_count(
+xfs_allocfree_block_count(
struct xfs_mount *mp,
uint num_ops)
{
@@ -73,13 +72,24 @@ xfs_allocfree_log_count(
blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1);
if (xfs_has_rmapbt(mp))
blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
- if (xfs_has_reflink(mp))
- blocks += num_ops * (2 * mp->m_refc_maxlevels - 1);
return blocks;
}
/*
+ * Per-extent log reservation for refcount btree changes. These are never done
+ * in the same transaction as an allocation or a free, so we compute them
+ * separately.
+ */
+static unsigned int
+xfs_refcountbt_block_count(
+ struct xfs_mount *mp,
+ unsigned int num_ops)
+{
+ return num_ops * (2 * mp->m_refc_maxlevels - 1);
+}
+
+/*
* Logging inodes is really tricksy. They are logged in memory format,
* which means that what we write into the log doesn't directly translate into
* the amount of space they use on disk.
@@ -136,7 +146,7 @@ xfs_calc_inobt_res(
{
return xfs_calc_buf_res(M_IGEO(mp)->inobt_maxlevels,
XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -183,7 +193,7 @@ xfs_calc_inode_chunk_res(
{
uint res, size = 0;
- res = xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ res = xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
if (alloc) {
/* icreate tx uses ordered buffers */
@@ -199,18 +209,18 @@ xfs_calc_inode_chunk_res(
/*
* Per-extent log reservation for the btree changes involved in freeing or
* allocating a realtime extent. We have to be able to log as many rtbitmap
- * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents,
- * as well as the realtime summary block.
+ * blocks as needed to mark inuse XFS_MAX_BMBT_EXTLEN blocks' worth of realtime
+ * extents, as well as the realtime summary block.
*/
static unsigned int
-xfs_rtalloc_log_count(
+xfs_rtalloc_block_count(
struct xfs_mount *mp,
unsigned int num_ops)
{
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
unsigned int rtbmp_bytes;
- rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY;
+ rtbmp_bytes = (XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize) / NBBY;
return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
}
@@ -233,6 +243,28 @@ xfs_rtalloc_log_count(
* register overflow from temporaries in the calculations.
*/
+/*
+ * Compute the log reservation required to handle the refcount update
+ * transaction. Refcount updates are always done via deferred log items.
+ *
+ * This is calculated as:
+ * Data device refcount updates (t1):
+ * the agfs of the ags containing the blocks: nr_ops * sector size
+ * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+static unsigned int
+xfs_calc_refcountbt_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ unsigned int blksz = XFS_FSB_TO_B(mp, 1);
+
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
+}
/*
* In a write transaction we can allocate a maximum of 2
@@ -247,7 +279,7 @@ xfs_rtalloc_log_count(
* the inode's bmap btree: max depth * block size
* the agfs of the ags from which the extents are allocated: 2 * sector
* the superblock free block counter: sector size
- * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ * the realtime bitmap: ((XFS_MAX_BMBT_EXTLEN / rtextsize) / NBBY) bytes
* the realtime summary: 1 block
* the allocation btrees: 2 trees * (2 * max depth - 1) * block size
* And the bmap_finish transaction can free bmap blocks in a join (t3):
@@ -255,34 +287,65 @@ xfs_rtalloc_log_count(
* the agfls of the ags containing the blocks: 2 * sector size
* the super block free block counter: sector size
* the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
*/
STATIC uint
xfs_calc_write_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- unsigned int t1, t2, t3;
+ unsigned int t1, t2, t3, t4;
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) +
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
if (xfs_has_realtime(mp)) {
t2 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
blksz) +
xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz);
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 1), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), blksz);
} else {
t2 = 0;
}
t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
- return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ /*
+ * In the early days of reflink, we included enough reservation to log
+ * two refcountbt splits for each transaction. The codebase runs
+ * refcountbt updates in separate transactions now, so to compute the
+ * minimum log size, add the refcountbtree splits back to t1 and t3 and
+ * do not account them separately as t4. Reflink did not support
+ * realtime when the reservations were established, so no adjustment to
+ * t2 is needed.
+ */
+ if (for_minlogsize) {
+ unsigned int adj = 0;
+
+ if (xfs_has_reflink(mp))
+ adj = xfs_calc_buf_res(
+ xfs_refcountbt_block_count(mp, 2),
+ blksz);
+ t1 += adj;
+ t3 += adj;
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ }
+
+ t4 = xfs_calc_refcountbt_reservation(mp, 1);
+ return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_write_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_write_reservation(mp, true);
}
/*
@@ -299,33 +362,62 @@ xfs_calc_write_reservation(
* the agf for each of the ags: 2 * sector size
* the agfl for each of the ags: 2 * sector size
* the super block to reflect the freed blocks: sector size
- * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ * the realtime bitmap:
+ * 2 exts * ((XFS_MAX_BMBT_EXTLEN / rtextsize) / NBBY) bytes
* the realtime summary: 2 exts * 1 block
* worst case split in allocation btrees per extent assuming 2 extents:
* 2 exts * 2 trees * (2 * max depth - 1) * block size
+ * And any refcount updates that happen in a separate transaction (t4).
*/
STATIC uint
xfs_calc_itruncate_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- unsigned int t1, t2, t3;
+ unsigned int t1, t2, t3, t4;
unsigned int blksz = XFS_FSB_TO_B(mp, 1);
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz);
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz);
if (xfs_has_realtime(mp)) {
t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
} else {
t3 = 0;
}
- return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ /*
+ * In the early days of reflink, we included enough reservation to log
+ * four refcountbt splits in the same transaction as bnobt/cntbt
+ * updates. The codebase runs refcountbt updates in separate
+ * transactions now, so to compute the minimum log size, add the
+ * refcount btree splits back here and do not compute them separately
+ * as t4. Reflink did not support realtime when the reservations were
+ * established, so do not adjust t3.
+ */
+ if (for_minlogsize) {
+ if (xfs_has_reflink(mp))
+ t2 += xfs_calc_buf_res(
+ xfs_refcountbt_block_count(mp, 4),
+ blksz);
+
+ return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
+ }
+
+ t4 = xfs_calc_refcountbt_reservation(mp, 2);
+ return XFS_DQUOT_LOGRES(mp) + max(t4, max3(t1, t2, t3));
+}
+
+unsigned int
+xfs_calc_itruncate_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_itruncate_reservation(mp, true);
}
/*
@@ -349,7 +441,7 @@ xfs_calc_rename_reservation(
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 3),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
XFS_FSB_TO_B(mp, 1))));
}
@@ -389,7 +481,7 @@ xfs_calc_link_reservation(
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1))));
}
@@ -427,7 +519,7 @@ xfs_calc_remove_reservation(
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -572,7 +664,7 @@ xfs_calc_growdata_reservation(
struct xfs_mount *mp)
{
return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -594,7 +686,7 @@ xfs_calc_growrtalloc_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_inode_res(mp, 1) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -670,7 +762,7 @@ xfs_calc_addafork_reservation(
xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
XFS_FSB_TO_B(mp, 1));
}
@@ -693,7 +785,7 @@ xfs_calc_attrinval_reservation(
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK),
XFS_FSB_TO_B(mp, 1))),
(xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4),
XFS_FSB_TO_B(mp, 1))));
}
@@ -760,7 +852,7 @@ xfs_calc_attrrm_reservation(
XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)),
(xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -791,13 +883,21 @@ xfs_calc_qm_setqlim_reservation(void)
*/
STATIC uint
xfs_calc_qm_dqalloc_reservation(
- struct xfs_mount *mp)
+ struct xfs_mount *mp,
+ bool for_minlogsize)
{
- return xfs_calc_write_reservation(mp) +
+ return xfs_calc_write_reservation(mp, for_minlogsize) +
xfs_calc_buf_res(1,
XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
}
+unsigned int
+xfs_calc_qm_dqalloc_reservation_minlogsize(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_qm_dqalloc_reservation(mp, true);
+}
+
/*
* Syncing the incore super block changes to disk.
* the super block to reflect the changes: sector size
@@ -814,36 +914,18 @@ xfs_trans_resv_calc(
struct xfs_mount *mp,
struct xfs_trans_resv *resp)
{
- unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
-
- /*
- * In the early days of rmap+reflink, we always set the rmap maxlevels
- * to 9 even if the AG was small enough that it would never grow to
- * that height. Transaction reservation sizes influence the minimum
- * log size calculation, which influences the size of the log that mkfs
- * creates. Use the old value here to ensure that newly formatted
- * small filesystems will mount on older kernels.
- */
- if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
- mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+ int logcount_adj = 0;
/*
* The following transactions are logged in physical format and
* require a permanent reservation on space.
*/
- resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
- if (xfs_has_reflink(mp))
- resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
- else
- resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp, false);
+ resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
- resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
- if (xfs_has_reflink(mp))
- resp->tr_itruncate.tr_logcount =
- XFS_ITRUNCATE_LOG_COUNT_REFLINK;
- else
- resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+ resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp, false);
+ resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
@@ -899,11 +981,9 @@ xfs_trans_resv_calc(
resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT;
resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
- resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
- if (xfs_has_reflink(mp))
- resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
- else
- resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+ resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp,
+ false);
+ resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
/*
@@ -930,6 +1010,19 @@ xfs_trans_resv_calc(
resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
- /* Put everything back the way it was. This goes at the end. */
- mp->m_rmap_maxlevels = rmap_maxlevels;
+ /*
+ * Add one logcount for BUI items that appear with rmap or reflink,
+ * one logcount for refcount intent items, and one logcount for rmap
+ * intent items.
+ */
+ if (xfs_has_reflink(mp) || xfs_has_rmapbt(mp))
+ logcount_adj++;
+ if (xfs_has_reflink(mp))
+ logcount_adj++;
+ if (xfs_has_rmapbt(mp))
+ logcount_adj++;
+
+ resp->tr_itruncate.tr_logcount += logcount_adj;
+ resp->tr_write.tr_logcount += logcount_adj;
+ resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index fc4e9b369a3a..0554b9d775d2 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -73,7 +73,6 @@ struct xfs_trans_resv {
#define XFS_DEFAULT_LOG_COUNT 1
#define XFS_DEFAULT_PERM_LOG_COUNT 2
#define XFS_ITRUNCATE_LOG_COUNT 2
-#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
#define XFS_INACTIVE_LOG_COUNT 2
#define XFS_CREATE_LOG_COUNT 2
#define XFS_CREATE_TMPFILE_LOG_COUNT 2
@@ -83,13 +82,24 @@ struct xfs_trans_resv {
#define XFS_LINK_LOG_COUNT 2
#define XFS_RENAME_LOG_COUNT 2
#define XFS_WRITE_LOG_COUNT 2
-#define XFS_WRITE_LOG_COUNT_REFLINK 8
#define XFS_ADDAFORK_LOG_COUNT 2
#define XFS_ATTRINVAL_LOG_COUNT 1
#define XFS_ATTRSET_LOG_COUNT 3
#define XFS_ATTRRM_LOG_COUNT 3
+/*
+ * Original log operation counts were overestimated in the early days of
+ * reflink. These are retained here purely for minimum log size calculations
+ * and must not be used for runtime reservations.
+ */
+#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
+#define XFS_WRITE_LOG_COUNT_REFLINK 8
+
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
-uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops);
+uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
+
+unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
+unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b6da06b40989..373f64a492a4 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -12,8 +12,8 @@ typedef uint32_t xfs_agblock_t; /* blockno in alloc. group */
typedef uint32_t xfs_agino_t; /* inode # within allocation grp */
typedef uint32_t xfs_extlen_t; /* extent length in blocks */
typedef uint32_t xfs_agnumber_t; /* allocation group number */
-typedef int32_t xfs_extnum_t; /* # of extents in a file */
-typedef int16_t xfs_aextnum_t; /* # extents in an attribute fork */
+typedef uint64_t xfs_extnum_t; /* # of extents in a file */
+typedef uint32_t xfs_aextnum_t; /* # extents in an attribute fork */
typedef int64_t xfs_fsize_t; /* bytes in a file */
typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
@@ -57,13 +57,6 @@ typedef void * xfs_failaddr_t;
#define NULLAGINO ((xfs_agino_t)-1)
/*
- * Max values for extlen, extnum, aextnum.
- */
-#define MAXEXTLEN ((xfs_extlen_t)0x001fffff) /* 21 bits */
-#define MAXEXTNUM ((xfs_extnum_t)0x7fffffff) /* signed int */
-#define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */
-
-/*
* Minimum and maximum blocksize and sectorsize.
* The blocksize upper limit is pretty much arbitrary.
* The sectorsize upper limit is due to sizeof(sb_sectsize).
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index a4cbbc346f60..285995ba3947 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -133,29 +133,13 @@ xchk_bmap_get_rmap(
if (info->is_shared) {
error = xfs_rmap_lookup_le_range(info->sc->sa.rmap_cur, agbno,
owner, offset, rflags, rmap, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
- return false;
- goto out;
+ } else {
+ error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno,
+ owner, offset, rflags, rmap, &has_rmap);
}
-
- /*
- * Otherwise, use the (faster) regular lookup.
- */
- error = xfs_rmap_lookup_le(info->sc->sa.rmap_cur, agbno, 0, owner,
- offset, rflags, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
+ if (!xchk_should_check_xref(info->sc, &error, &info->sc->sa.rmap_cur))
return false;
- if (!has_rmap)
- goto out;
- error = xfs_rmap_get_rec(info->sc->sa.rmap_cur, rmap, &has_rmap);
- if (!xchk_should_check_xref(info->sc, &error,
- &info->sc->sa.rmap_cur))
- return false;
-
-out:
if (!has_rmap)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
@@ -350,7 +334,7 @@ xchk_bmap_iextent(
irec->br_startoff);
/* Make sure the extent points to a valid place. */
- if (irec->br_blockcount > MAXEXTLEN)
+ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
if (info->is_rt &&
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index bf1f3607d0b6..97b54ac3075f 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -23,6 +23,8 @@
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index eac15af7b08c..51820b40ab1c 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -232,7 +232,8 @@ xchk_dinode(
size_t fork_recs;
unsigned long long isize;
uint64_t flags2;
- uint32_t nextents;
+ xfs_extnum_t nextents;
+ xfs_extnum_t naextents;
prid_t prid;
uint16_t flags;
uint16_t mode;
@@ -390,8 +391,10 @@ xchk_dinode(
xchk_inode_extsize(sc, dip, ino, mode, flags);
+ nextents = xfs_dfork_data_extents(dip);
+ naextents = xfs_dfork_attr_extents(dip);
+
/* di_nextents */
- nextents = be32_to_cpu(dip->di_nextents);
fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
switch (dip->di_format) {
case XFS_DINODE_FMT_EXTENTS:
@@ -411,7 +414,7 @@ xchk_dinode(
/* di_forkoff */
if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
xchk_ino_set_corrupt(sc, ino);
- if (dip->di_anextents != 0 && dip->di_forkoff == 0)
+ if (naextents != 0 && dip->di_forkoff == 0)
xchk_ino_set_corrupt(sc, ino);
if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS)
xchk_ino_set_corrupt(sc, ino);
@@ -423,19 +426,18 @@ xchk_dinode(
xchk_ino_set_corrupt(sc, ino);
/* di_anextents */
- nextents = be16_to_cpu(dip->di_anextents);
fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec);
switch (dip->di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
- if (nextents > fork_recs)
+ if (naextents > fork_recs)
xchk_ino_set_corrupt(sc, ino);
break;
case XFS_DINODE_FMT_BTREE:
- if (nextents <= fork_recs)
+ if (naextents <= fork_recs)
xchk_ino_set_corrupt(sc, ino);
break;
default:
- if (nextents != 0)
+ if (naextents != 0)
xchk_ino_set_corrupt(sc, ino);
}
@@ -513,14 +515,14 @@ xchk_inode_xref_bmap(
&nextents, &count);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
- if (nextents < be32_to_cpu(dip->di_nextents))
+ if (nextents < xfs_dfork_data_extents(dip))
xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
&nextents, &acount);
if (!xchk_should_check_xref(sc, &error, NULL))
return;
- if (nextents != be16_to_cpu(dip->di_anextents))
+ if (nextents != xfs_dfork_attr_extents(dip))
xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino);
/* Check nblocks against the inode. */
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 8fa012057405..0a3bde64c675 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -40,6 +40,7 @@ xchk_setup_rt(
/* Scrub a free extent record from the realtime bitmap. */
STATIC int
xchk_rtbitmap_rec(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
@@ -48,10 +49,10 @@ xchk_rtbitmap_rec(
xfs_rtblock_t startblock;
xfs_rtblock_t blockcount;
- startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize;
- blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize;
+ startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
+ blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
- if (!xfs_verify_rtext(sc->mp, startblock, blockcount))
+ if (!xfs_verify_rtext(mp, startblock, blockcount))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
return 0;
}
@@ -114,7 +115,7 @@ xchk_rtbitmap(
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
- error = xfs_rtalloc_query_all(sc->tp, xchk_rtbitmap_rec, sc);
+ error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
goto out;
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 5c52ee869272..3df9c1782ead 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -10,12 +10,12 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trace.h"
#include "xfs_error.h"
#include "xfs_acl.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include "xfs_trans.h"
#include <linux/posix_acl_xattr.h>
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index bb6abdcb265d..263404d0bfda 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -16,11 +16,13 @@ extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
void xfs_forget_acl(struct inode *inode, const char *name);
#else
-static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu)
+#define xfs_get_acl NULL
+#define xfs_set_acl NULL
+static inline int __xfs_set_acl(struct inode *inode, struct posix_acl *acl,
+ int type)
{
- return NULL;
+ return 0;
}
-# define xfs_set_acl NULL
static inline void xfs_forget_acl(struct inode *inode, const char *name)
{
}
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
new file mode 100644
index 000000000000..e8ac88d9fd14
--- /dev/null
+++ b/fs/xfs/xfs_attr_item.c
@@ -0,0 +1,824 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022 Oracle. All Rights Reserved.
+ * Author: Allison Henderson <allison.henderson@oracle.com>
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_shared.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_attr.h"
+#include "xfs_attr_item.h"
+#include "xfs_trace.h"
+#include "xfs_inode.h"
+#include "xfs_trans_space.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_log_priv.h"
+#include "xfs_log_recover.h"
+
+static const struct xfs_item_ops xfs_attri_item_ops;
+static const struct xfs_item_ops xfs_attrd_item_ops;
+static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip);
+
+static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_attri_log_item, attri_item);
+}
+
+STATIC void
+xfs_attri_item_free(
+ struct xfs_attri_log_item *attrip)
+{
+ kmem_free(attrip->attri_item.li_lv_shadow);
+ kvfree(attrip);
+}
+
+/*
+ * Freeing the attrip requires that we remove it from the AIL if it has already
+ * been placed there. However, the ATTRI may not yet have been placed in the
+ * AIL when called by xfs_attri_release() from ATTRD processing due to the
+ * ordering of committed vs unpin operations in bulk insert operations. Hence
+ * the reference count to ensure only the last caller frees the ATTRI.
+ */
+STATIC void
+xfs_attri_release(
+ struct xfs_attri_log_item *attrip)
+{
+ ASSERT(atomic_read(&attrip->attri_refcount) > 0);
+ if (!atomic_dec_and_test(&attrip->attri_refcount))
+ return;
+
+ xfs_trans_ail_delete(&attrip->attri_item, 0);
+ xfs_attri_item_free(attrip);
+}
+
+STATIC void
+xfs_attri_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+
+ *nvecs += 2;
+ *nbytes += sizeof(struct xfs_attri_log_format) +
+ xlog_calc_iovec_len(attrip->attri_name_len);
+
+ if (!attrip->attri_value_len)
+ return;
+
+ *nvecs += 1;
+ *nbytes += xlog_calc_iovec_len(attrip->attri_value_len);
+}
+
+/*
+ * This is called to fill in the log iovecs for the given attri log
+ * item. We use 1 iovec for the attri_format_item, 1 for the name, and
+ * another for the value if it is present
+ */
+STATIC void
+xfs_attri_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ attrip->attri_format.alfi_type = XFS_LI_ATTRI;
+ attrip->attri_format.alfi_size = 1;
+
+ /*
+ * This size accounting must be done before copying the attrip into the
+ * iovec. If we do it after, the wrong size will be recorded to the log
+ * and we trip across assertion checks for bad region sizes later during
+ * the log recovery.
+ */
+
+ ASSERT(attrip->attri_name_len > 0);
+ attrip->attri_format.alfi_size++;
+
+ if (attrip->attri_value_len > 0)
+ attrip->attri_format.alfi_size++;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT,
+ &attrip->attri_format,
+ sizeof(struct xfs_attri_log_format));
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME,
+ attrip->attri_name,
+ attrip->attri_name_len);
+ if (attrip->attri_value_len > 0)
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE,
+ attrip->attri_value,
+ attrip->attri_value_len);
+}
+
+/*
+ * The unpin operation is the last place an ATTRI is manipulated in the log. It
+ * is either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the ATTRI transaction has been successfully committed to make
+ * it this far. Therefore, we expect whoever committed the ATTRI to either
+ * construct and commit the ATTRD or drop the ATTRD's reference in the event of
+ * error. Simply drop the log's ATTRI reference now that the log is done with
+ * it.
+ */
+STATIC void
+xfs_attri_item_unpin(
+ struct xfs_log_item *lip,
+ int remove)
+{
+ xfs_attri_release(ATTRI_ITEM(lip));
+}
+
+
+STATIC void
+xfs_attri_item_release(
+ struct xfs_log_item *lip)
+{
+ xfs_attri_release(ATTRI_ITEM(lip));
+}
+
+/*
+ * Allocate and initialize an attri item. A trailing buffer for the name and
+ * value is allocated with the item when their combined length is nonzero.
+ */
+STATIC struct xfs_attri_log_item *
+xfs_attri_init(
+ struct xfs_mount *mp,
+ uint32_t name_len,
+ uint32_t value_len)
+
+{
+ struct xfs_attri_log_item *attrip;
+ uint32_t buffer_size = name_len + value_len;
+
+ if (buffer_size) {
+ /*
+ * This could be over 64kB in length, so we have to use
+ * kvmalloc() for this. But kvmalloc() utterly sucks, so we
+ * use our own version.
+ */
+ attrip = xlog_kvmalloc(sizeof(struct xfs_attri_log_item) +
+ buffer_size);
+ } else {
+ attrip = kmem_cache_alloc(xfs_attri_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ }
+ memset(attrip, 0, sizeof(struct xfs_attri_log_item));
+
+ attrip->attri_name_len = name_len;
+ if (name_len)
+ attrip->attri_name = ((char *)attrip) +
+ sizeof(struct xfs_attri_log_item);
+ else
+ attrip->attri_name = NULL;
+
+ attrip->attri_value_len = value_len;
+ if (value_len)
+ attrip->attri_value = ((char *)attrip) +
+ sizeof(struct xfs_attri_log_item) +
+ name_len;
+ else
+ attrip->attri_value = NULL;
+
+ xfs_log_item_init(mp, &attrip->attri_item, XFS_LI_ATTRI,
+ &xfs_attri_item_ops);
+ attrip->attri_format.alfi_id = (uintptr_t)(void *)attrip;
+ atomic_set(&attrip->attri_refcount, 2);
+
+ return attrip;
+}
+
+/*
+ * Copy an attr format buffer from the given buf, and into the destination attr
+ * format structure.
+ */
+STATIC int
+xfs_attri_copy_format(
+ struct xfs_log_iovec *buf,
+ struct xfs_attri_log_format *dst_attr_fmt)
+{
+ struct xfs_attri_log_format *src_attr_fmt = buf->i_addr;
+ size_t len;
+
+ len = sizeof(struct xfs_attri_log_format);
+ if (buf->i_len != len) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ return -EFSCORRUPTED;
+ }
+
+ memcpy((char *)dst_attr_fmt, (char *)src_attr_fmt, len);
+ return 0;
+}
+
+static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip)
+{
+ return container_of(lip, struct xfs_attrd_log_item, attrd_item);
+}
+
+STATIC void
+xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp)
+{
+ kmem_free(attrdp->attrd_item.li_lv_shadow);
+ kmem_free(attrdp);
+}
+
+STATIC void
+xfs_attrd_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
+{
+ *nvecs += 1;
+ *nbytes += sizeof(struct xfs_attrd_log_format);
+}
+
+/*
+ * This is called to fill in the log iovecs for the given attrd log item. We use
+ * only 1 iovec for the attrd_format, and we point that at the attr_log_format
+ * structure embedded in the attrd item.
+ */
+STATIC void
+xfs_attrd_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
+
+ attrdp->attrd_format.alfd_type = XFS_LI_ATTRD;
+ attrdp->attrd_format.alfd_size = 1;
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRD_FORMAT,
+ &attrdp->attrd_format,
+ sizeof(struct xfs_attrd_log_format));
+}
+
+/*
+ * The ATTRD is either committed or aborted if the transaction is canceled. If
+ * the transaction is canceled, drop our reference to the ATTRI and free the
+ * ATTRD.
+ */
+STATIC void
+xfs_attrd_item_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_attrd_log_item *attrdp = ATTRD_ITEM(lip);
+
+ xfs_attri_release(attrdp->attrd_attrip);
+ xfs_attrd_item_free(attrdp);
+}
+
+static struct xfs_log_item *
+xfs_attrd_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &ATTRD_ITEM(lip)->attrd_attrip->attri_item;
+}
+
+/*
+ * Performs one step of an attribute update intent and marks the attrd item
+ * dirty. An attr operation may be a set or a remove. Note that the
+ * transaction is marked dirty regardless of whether the operation succeeds or
+ * fails to support the ATTRI/ATTRD lifecycle rules.
+ */
+STATIC int
+xfs_xattri_finish_update(
+ struct xfs_attr_item *attr,
+ struct xfs_attrd_log_item *attrdp)
+{
+ struct xfs_da_args *args = attr->xattri_da_args;
+ int error;
+
+ if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
+ error = -EIO;
+ goto out;
+ }
+
+ error = xfs_attr_set_iter(attr);
+ if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
+ error = -EAGAIN;
+out:
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the ATTRI and frees the ATTRD
+ * 2.) shuts down the filesystem
+ */
+ args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
+
+ /*
+ * attr intent/done items are null when logged attributes are disabled
+ */
+ if (attrdp)
+ set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+ return error;
+}
+
+/* Log an attr to the intent item. */
+STATIC void
+xfs_attr_log_item(
+ struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip,
+ struct xfs_attr_item *attr)
+{
+ struct xfs_attri_log_format *attrp;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags);
+
+ /*
+ * At this point the xfs_attr_item has been constructed, and we've
+ * created the log intent. Fill in the attri log item and log format
+ * structure with fields from this xfs_attr_item
+ */
+ attrp = &attrip->attri_format;
+ attrp->alfi_ino = attr->xattri_da_args->dp->i_ino;
+ attrp->alfi_op_flags = attr->xattri_op_flags;
+ attrp->alfi_value_len = attr->xattri_da_args->valuelen;
+ attrp->alfi_name_len = attr->xattri_da_args->namelen;
+ attrp->alfi_attr_flags = attr->xattri_da_args->attr_filter;
+
+ memcpy(attrip->attri_name, attr->xattri_da_args->name,
+ attr->xattri_da_args->namelen);
+ memcpy(attrip->attri_value, attr->xattri_da_args->value,
+ attr->xattri_da_args->valuelen);
+ attrip->attri_name_len = attr->xattri_da_args->namelen;
+ attrip->attri_value_len = attr->xattri_da_args->valuelen;
+}
+
+/* Get an ATTRI. */
+static struct xfs_log_item *
+xfs_attr_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attr_item *attr;
+
+ ASSERT(count == 1);
+
+ if (!xfs_sb_version_haslogxattrs(&mp->m_sb))
+ return NULL;
+
+ /*
+ * Each attr item only performs one attribute operation at a time, so
+ * this is a list of one
+ */
+ list_for_each_entry(attr, items, xattri_list) {
+ attrip = xfs_attri_init(mp, attr->xattri_da_args->namelen,
+ attr->xattri_da_args->valuelen);
+ if (attrip == NULL)
+ return NULL;
+
+ xfs_trans_add_item(tp, &attrip->attri_item);
+ xfs_attr_log_item(tp, attrip, attr);
+ }
+
+ return &attrip->attri_item;
+}
+
+/* Process an attr. */
+STATIC int
+xfs_attr_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ struct xfs_attr_item *attr;
+ struct xfs_attrd_log_item *done_item = NULL;
+ int error;
+
+ attr = container_of(item, struct xfs_attr_item, xattri_list);
+ if (done)
+ done_item = ATTRD_ITEM(done);
+
+ /*
+ * Always reset trans after EAGAIN cycle
+ * since the transaction is new
+ */
+ attr->xattri_da_args->trans = tp;
+
+ error = xfs_xattri_finish_update(attr, done_item);
+ if (error != -EAGAIN)
+ kmem_free(attr);
+
+ return error;
+}
+
+/* Abort all pending ATTRs. */
+STATIC void
+xfs_attr_abort_intent(
+ struct xfs_log_item *intent)
+{
+ xfs_attri_release(ATTRI_ITEM(intent));
+}
+
+/* Cancel an attr */
+STATIC void
+xfs_attr_cancel_item(
+ struct list_head *item)
+{
+ struct xfs_attr_item *attr;
+
+ attr = container_of(item, struct xfs_attr_item, xattri_list);
+ kmem_free(attr);
+}
+
+STATIC xfs_lsn_t
+xfs_attri_item_committed(
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+
+ /*
+ * The attrip refers to xfs_attr_item memory to log the name and value
+ * with the intent item. This already occurred when the intent was
+ * committed so these fields are no longer accessed. Clear them out of
+ * caution since we're about to free the xfs_attr_item.
+ */
+ attrip->attri_name = NULL;
+ attrip->attri_value = NULL;
+
+ /*
+ * The ATTRI is logged only once and cannot be moved in the log, so
+ * simply return the lsn at which it's been logged.
+ */
+ return lsn;
+}
+
+STATIC bool
+xfs_attri_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return ATTRI_ITEM(lip)->attri_format.alfi_id == intent_id;
+}
+
+/* Is this recovered ATTRI format ok? */
+static inline bool
+xfs_attri_validate(
+ struct xfs_mount *mp,
+ struct xfs_attri_log_format *attrp)
+{
+ unsigned int op = attrp->alfi_op_flags &
+ XFS_ATTR_OP_FLAGS_TYPE_MASK;
+
+ if (attrp->__pad != 0)
+ return false;
+
+ /* alfi_op_flags should be a set, replace or remove */
+ switch (op) {
+ case XFS_ATTR_OP_FLAGS_SET:
+ case XFS_ATTR_OP_FLAGS_REPLACE:
+ case XFS_ATTR_OP_FLAGS_REMOVE:
+ break;
+ default:
+ return false;
+ }
+
+ if (attrp->alfi_value_len > XATTR_SIZE_MAX)
+ return false;
+
+ if ((attrp->alfi_name_len > XATTR_NAME_MAX) ||
+ (attrp->alfi_name_len == 0))
+ return false;
+
+ return xfs_verify_ino(mp, attrp->alfi_ino);
+}
+
+/*
+ * Process an attr intent item that was recovered from the log. We need to
+ * replay the set, replace or remove operation that it describes.
+ */
+STATIC int
+xfs_attri_item_recover(
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
+{
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_attr_item *attr;
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_inode *ip;
+ struct xfs_da_args *args;
+ struct xfs_trans *tp;
+ struct xfs_trans_res tres;
+ struct xfs_attri_log_format *attrp;
+ int error, ret = 0;
+ int total;
+ int local;
+ struct xfs_attrd_log_item *done_item = NULL;
+
+ /*
+ * First check the validity of the attr described by the ATTRI. If any
+ * are bad, then assume that all are bad and just toss the ATTRI.
+ */
+ attrp = &attrip->attri_format;
+ if (!xfs_attri_validate(mp, attrp) ||
+ !xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len))
+ return -EFSCORRUPTED;
+
+ error = xlog_recover_iget(mp, attrp->alfi_ino, &ip);
+ if (error)
+ return error;
+
+ attr = kmem_zalloc(sizeof(struct xfs_attr_item) +
+ sizeof(struct xfs_da_args), KM_NOFS);
+ args = (struct xfs_da_args *)(attr + 1);
+
+ attr->xattri_da_args = args;
+ attr->xattri_op_flags = attrp->alfi_op_flags;
+
+ args->dp = ip;
+ args->geo = mp->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->name = attrip->attri_name;
+ args->namelen = attrp->alfi_name_len;
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
+ args->attr_filter = attrp->alfi_attr_flags;
+ args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT;
+
+ switch (attrp->alfi_op_flags & XFS_ATTR_OP_FLAGS_TYPE_MASK) {
+ case XFS_ATTR_OP_FLAGS_SET:
+ case XFS_ATTR_OP_FLAGS_REPLACE:
+ args->value = attrip->attri_value;
+ args->valuelen = attrp->alfi_value_len;
+ args->total = xfs_attr_calc_size(args, &local);
+ if (xfs_inode_hasattr(args->dp))
+ attr->xattri_dela_state = xfs_attr_init_replace_state(args);
+ else
+ attr->xattri_dela_state = xfs_attr_init_add_state(args);
+ break;
+ case XFS_ATTR_OP_FLAGS_REMOVE:
+ if (!xfs_inode_hasattr(args->dp))
+ goto out;
+ attr->xattri_dela_state = xfs_attr_init_remove_state(args);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ xfs_init_attr_trans(args, &tres, &total);
+ error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ goto out;
+
+ args->trans = tp;
+ done_item = xfs_trans_get_attrd(tp, attrip);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ ret = xfs_xattri_finish_update(attr, done_item);
+ if (ret == -EAGAIN) {
+ /* There's more work to do, so add it to this transaction */
+ xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list);
+ } else
+ error = ret;
+
+ if (error) {
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+ }
+
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+
+out_unlock:
+ if (attr->xattri_leaf_bp)
+ xfs_buf_relse(attr->xattri_leaf_bp);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+out:
+ if (ret != -EAGAIN)
+ kmem_free(attr);
+ return error;
+}
+
+/* Re-log an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_attri_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_attrd_log_item *attrdp;
+ struct xfs_attri_log_item *old_attrip;
+ struct xfs_attri_log_item *new_attrip;
+ struct xfs_attri_log_format *new_attrp;
+ struct xfs_attri_log_format *old_attrp;
+
+ old_attrip = ATTRI_ITEM(intent);
+ old_attrp = &old_attrip->attri_format;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ attrdp = xfs_trans_get_attrd(tp, old_attrip);
+ set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
+
+ new_attrip = xfs_attri_init(tp->t_mountp, old_attrp->alfi_name_len,
+ old_attrp->alfi_value_len);
+ new_attrp = &new_attrip->attri_format;
+
+ new_attrp->alfi_ino = old_attrp->alfi_ino;
+ new_attrp->alfi_op_flags = old_attrp->alfi_op_flags;
+ new_attrp->alfi_value_len = old_attrp->alfi_value_len;
+ new_attrp->alfi_name_len = old_attrp->alfi_name_len;
+ new_attrp->alfi_attr_flags = old_attrp->alfi_attr_flags;
+
+ memcpy(new_attrip->attri_name, old_attrip->attri_name,
+ new_attrip->attri_name_len);
+
+ if (new_attrip->attri_value_len > 0)
+ memcpy(new_attrip->attri_value, old_attrip->attri_value,
+ new_attrip->attri_value_len);
+
+ xfs_trans_add_item(tp, &new_attrip->attri_item);
+ set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags);
+
+ return &new_attrip->attri_item;
+}
+
+STATIC int
+xlog_recover_attri_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ int error;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attri_log_format *attri_formatp;
+ int region = 0;
+
+ attri_formatp = item->ri_buf[region].i_addr;
+
+ /* Validate xfs_attri_log_format */
+ if (!xfs_attri_validate(mp, attri_formatp)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ return -EFSCORRUPTED;
+ }
+
+ /* memory alloc failure will cause replay to abort */
+ attrip = xfs_attri_init(mp, attri_formatp->alfi_name_len,
+ attri_formatp->alfi_value_len);
+ if (attrip == NULL)
+ return -ENOMEM;
+
+ error = xfs_attri_copy_format(&item->ri_buf[region],
+ &attrip->attri_format);
+ if (error)
+ goto out;
+
+ region++;
+ memcpy(attrip->attri_name, item->ri_buf[region].i_addr,
+ attrip->attri_name_len);
+
+ if (!xfs_attr_namecheck(attrip->attri_name, attrip->attri_name_len)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ if (attrip->attri_value_len > 0) {
+ region++;
+ memcpy(attrip->attri_value, item->ri_buf[region].i_addr,
+ attrip->attri_value_len);
+ }
+
+ /*
+ * The ATTRI has two references. One for the ATTRD and one for ATTRI to
+ * ensure it makes it into the AIL. Insert the ATTRI into the AIL
+ * directly and drop the ATTRI reference. Note that
+ * xfs_trans_ail_update() drops the AIL lock.
+ */
+ xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn);
+ xfs_attri_release(attrip);
+ return 0;
+out:
+ xfs_attri_item_free(attrip);
+ return error;
+}
+
+/*
+ * This routine is called to allocate an "attr done" log item.
+ */
+static struct xfs_attrd_log_item *
+xfs_trans_get_attrd(struct xfs_trans *tp,
+ struct xfs_attri_log_item *attrip)
+{
+ struct xfs_attrd_log_item *attrdp;
+
+ ASSERT(tp != NULL);
+
+ attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
+ &xfs_attrd_item_ops);
+ attrdp->attrd_attrip = attrip;
+ attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id;
+
+ xfs_trans_add_item(tp, &attrdp->attrd_item);
+ return attrdp;
+}
+
+/* Get an ATTRD so we can process all the attrs. */
+static struct xfs_log_item *
+xfs_attr_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ if (!intent)
+ return NULL;
+
+ return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item;
+}
+
+const struct xfs_defer_op_type xfs_attr_defer_type = {
+ .max_items = 1,
+ .create_intent = xfs_attr_create_intent,
+ .abort_intent = xfs_attr_abort_intent,
+ .create_done = xfs_attr_create_done,
+ .finish_item = xfs_attr_finish_item,
+ .cancel_item = xfs_attr_cancel_item,
+};
+
+/*
+ * This routine is called when an ATTRD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding ATTRI if
+ * it was still in the log. To do this it searches the AIL for the ATTRI with
+ * an id equal to that in the ATTRD format structure. If we find it we drop
+ * the ATTRD reference, which removes the ATTRI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_attrd_commit_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
+{
+ struct xfs_attrd_log_format *attrd_formatp;
+
+ attrd_formatp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
+ return -EFSCORRUPTED;
+ }
+
+ xlog_recover_release_intent(log, XFS_LI_ATTRI,
+ attrd_formatp->alfd_alf_id);
+ return 0;
+}
+
+static const struct xfs_item_ops xfs_attri_item_ops = {
+ .flags = XFS_ITEM_INTENT,
+ .iop_size = xfs_attri_item_size,
+ .iop_format = xfs_attri_item_format,
+ .iop_unpin = xfs_attri_item_unpin,
+ .iop_committed = xfs_attri_item_committed,
+ .iop_release = xfs_attri_item_release,
+ .iop_recover = xfs_attri_item_recover,
+ .iop_match = xfs_attri_item_match,
+ .iop_relog = xfs_attri_item_relog,
+};
+
+const struct xlog_recover_item_ops xlog_attri_item_ops = {
+ .item_type = XFS_LI_ATTRI,
+ .commit_pass2 = xlog_recover_attri_commit_pass2,
+};
+
+static const struct xfs_item_ops xfs_attrd_item_ops = {
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
+ .iop_size = xfs_attrd_item_size,
+ .iop_format = xfs_attrd_item_format,
+ .iop_release = xfs_attrd_item_release,
+ .iop_intent = xfs_attrd_item_intent,
+};
+
+const struct xlog_recover_item_ops xlog_attrd_item_ops = {
+ .item_type = XFS_LI_ATTRD,
+ .commit_pass2 = xlog_recover_attrd_commit_pass2,
+};
diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h
new file mode 100644
index 000000000000..c3b779f82adb
--- /dev/null
+++ b/fs/xfs/xfs_attr_item.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * Copyright (C) 2022 Oracle. All Rights Reserved.
+ * Author: Allison Henderson <allison.henderson@oracle.com>
+ */
+#ifndef __XFS_ATTR_ITEM_H__
+#define __XFS_ATTR_ITEM_H__
+
+/* kernel only ATTRI/ATTRD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+/*
+ * This is the "attr intention" log item. It is used to log the fact that some
+ * extended attribute operations need to be processed. An operation is
+ * currently either a set or remove. Set or remove operations are described by
+ * the xfs_attr_item which may be logged to this intent.
+ *
+ * During a normal attr operation, name and value point to the name and value
+ * fields of the caller's xfs_da_args structure. During a recovery, the name
+ * and value buffers are copied from the log, and stored in a trailing buffer
+ * attached to the xfs_attr_item until they are committed. They are freed when
+ * the xfs_attr_item itself is freed when the work is done.
+ */
+struct xfs_attri_log_item {
+ /* Embedded generic log item. */
+ struct xfs_log_item attri_item;
+ /*
+ * Reference count on this intent.
+ * NOTE(review): presumably the last put removes the item from the AIL
+ * and frees it, as xfs_bui_release() does for BUIs -- confirm in
+ * xfs_attr_item.c.
+ */
+ atomic_t attri_refcount;
+ /* NOTE(review): presumably the byte lengths of attri_name/attri_value. */
+ int attri_name_len;
+ int attri_value_len;
+ /* Name and value buffers; see the ownership rules described above. */
+ void *attri_name;
+ void *attri_value;
+ /* ATTRI log format structure for this intent. */
+ struct xfs_attri_log_format attri_format;
+};
+
+/*
+ * This is the "attr done" log item. It is used to log the fact that the attr
+ * operations mentioned in an earlier attri item have been carried out.
+ */
+struct xfs_attrd_log_item {
+ /* Embedded generic log item. */
+ struct xfs_log_item attrd_item;
+ /* The attri (intent) item that this done item belongs to. */
+ struct xfs_attri_log_item *attrd_attrip;
+ /* ATTRD log format structure for this done item. */
+ struct xfs_attrd_log_format attrd_format;
+};
+
+#endif /* __XFS_ATTR_ITEM_H__ */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 2d1e5134cebe..90a14e85e76d 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -15,6 +15,7 @@
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_sf.h"
#include "xfs_attr_leaf.h"
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 761dde155099..51f66e982484 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -39,6 +39,7 @@ STATIC void
xfs_bui_item_free(
struct xfs_bui_log_item *buip)
{
+ kmem_free(buip->bui_item.li_lv_shadow);
kmem_cache_free(xfs_bui_cache, buip);
}
@@ -54,10 +55,11 @@ xfs_bui_release(
struct xfs_bui_log_item *buip)
{
ASSERT(atomic_read(&buip->bui_refcount) > 0);
- if (atomic_dec_and_test(&buip->bui_refcount)) {
- xfs_trans_ail_delete(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_bui_item_free(buip);
- }
+ if (!atomic_dec_and_test(&buip->bui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&buip->bui_item, 0);
+ xfs_bui_item_free(buip);
}
@@ -198,14 +200,24 @@ xfs_bud_item_release(
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
xfs_bui_release(budp->bud_buip);
+ kmem_free(budp->bud_item.li_lv_shadow);
kmem_cache_free(xfs_bud_cache, budp);
}
+/* Return the log item of the BUI that this BUD was created for. */
+static struct xfs_log_item *
+xfs_bud_item_intent(
+ struct xfs_log_item *lip)
+{
+ struct xfs_bud_log_item *budp = BUD_ITEM(lip);
+
+ return &budp->bud_buip->bui_item;
+}
+
+/* Operations vector for the BUD log item, the done item that releases a BUI. */
static const struct xfs_item_ops xfs_bud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_bud_item_size,
.iop_format = xfs_bud_item_format,
.iop_release = xfs_bud_item_release,
+ .iop_intent = xfs_bud_item_intent,
};
static struct xfs_bud_log_item *
@@ -254,7 +266,7 @@ xfs_trans_log_finish_bmap_update(
* 1.) releases the BUI and frees the BUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
return error;
@@ -506,6 +518,8 @@ xfs_bui_item_recover(
iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
error = xfs_iext_count_may_overflow(ip, whichfork, iext_delta);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto err_cancel;
@@ -584,6 +598,7 @@ xfs_bui_item_relog(
}
static const struct xfs_item_ops xfs_bui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_bui_item_size,
.iop_format = xfs_bui_item_format,
.iop_unpin = xfs_bui_item_unpin,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index eb2e387ba528..52be58372c63 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -119,14 +119,14 @@ retry:
*/
ralen = ap->length / mp->m_sb.sb_rextsize;
/*
- * If the old value was close enough to MAXEXTLEN that
+ * If the old value was close enough to XFS_MAX_BMBT_EXTLEN that
* we rounded up to it, cut it back so it's valid again.
* Note that if it's a really large request (bigger than
- * MAXEXTLEN), we don't hear about that number, and can't
+ * XFS_MAX_BMBT_EXTLEN), we don't hear about that number, and can't
* adjust the starting point to match it.
*/
- if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
- ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
+ if (ralen * mp->m_sb.sb_rextsize >= XFS_MAX_BMBT_EXTLEN)
+ ralen = XFS_MAX_BMBT_EXTLEN / mp->m_sb.sb_rextsize;
/*
* Lock out modifications to both the RT bitmap and summary inodes
@@ -839,9 +839,11 @@ xfs_alloc_file_space(
* count, hence we need to limit the number of blocks we are
* trying to reserve to avoid an overflow. We can't allocate
* more than @nimaps extents, and an extent is limited on disk
- * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+ * to XFS_MAX_BMBT_EXTLEN (21 bits), so use that to enforce the
+ * limit.
*/
- resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
+ resblks = min_t(xfs_fileoff_t, (e - s),
+ (XFS_MAX_BMBT_EXTLEN * nimaps));
if (unlikely(rt)) {
dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
rblocks = resblks;
@@ -857,6 +859,9 @@ xfs_alloc_file_space(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto error;
@@ -912,6 +917,8 @@ xfs_unmap_extent(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
@@ -1193,6 +1200,8 @@ xfs_insert_file_space(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_PUNCH_HOLE_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_PUNCH_HOLE_CNT);
if (error)
goto out_trans_cancel;
@@ -1421,6 +1430,9 @@ xfs_swap_extent_rmap(
error = xfs_iext_count_may_overflow(ip,
XFS_DATA_FORK,
XFS_IEXT_SWAP_RMAP_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_SWAP_RMAP_CNT);
if (error)
goto out;
}
@@ -1429,6 +1441,9 @@ xfs_swap_extent_rmap(
error = xfs_iext_count_may_overflow(tip,
XFS_DATA_FORK,
XFS_IEXT_SWAP_RMAP_CNT);
+ /* The overflow check above was on tip, so upgrade tip's counter. */
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, tip,
+ XFS_IEXT_SWAP_RMAP_CNT);
if (error)
goto out;
}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index e11e9ef2338f..4d8a6aece995 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -8,15 +8,18 @@
/* kernel only definitions */
+struct xfs_buf;
+struct xfs_mount;
+
/* buf log item flags */
-#define XFS_BLI_HOLD 0x01
-#define XFS_BLI_DIRTY 0x02
-#define XFS_BLI_STALE 0x04
-#define XFS_BLI_LOGGED 0x08
-#define XFS_BLI_INODE_ALLOC_BUF 0x10
-#define XFS_BLI_STALE_INODE 0x20
-#define XFS_BLI_INODE_BUF 0x40
-#define XFS_BLI_ORDERED 0x80
+#define XFS_BLI_HOLD (1u << 0)
+#define XFS_BLI_DIRTY (1u << 1)
+#define XFS_BLI_STALE (1u << 2)
+#define XFS_BLI_LOGGED (1u << 3)
+#define XFS_BLI_INODE_ALLOC_BUF (1u << 4)
+#define XFS_BLI_STALE_INODE (1u << 5)
+#define XFS_BLI_INODE_BUF (1u << 6)
+#define XFS_BLI_ORDERED (1u << 7)
#define XFS_BLI_FLAGS \
{ XFS_BLI_HOLD, "HOLD" }, \
@@ -28,11 +31,6 @@
{ XFS_BLI_INODE_BUF, "INODE_BUF" }, \
{ XFS_BLI_ORDERED, "ORDERED" }
-
-struct xfs_buf;
-struct xfs_mount;
-struct xfs_buf_log_item;
-
/*
* This is the in core log item structure used to track information
* needed to log buffers. It tracks how many times the lock has been
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 5afedcbc78c7..5a6c3c3c4de2 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -136,10 +136,7 @@ xfs_qm_adjust_res_timer(
res->timer = xfs_dquot_set_timeout(mp,
ktime_get_real_seconds() + qlim->time);
} else {
- if (res->timer == 0)
- res->warnings = 0;
- else
- res->timer = 0;
+ res->timer = 0;
}
}
@@ -322,6 +319,9 @@ xfs_dquot_disk_alloc(
error = xfs_iext_count_may_overflow(quotip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, quotip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto err_cancel;
@@ -589,10 +589,6 @@ xfs_dquot_from_disk(
dqp->q_ino.count = be64_to_cpu(ddqp->d_icount);
dqp->q_rtb.count = be64_to_cpu(ddqp->d_rtbcount);
- dqp->q_blk.warnings = be16_to_cpu(ddqp->d_bwarns);
- dqp->q_ino.warnings = be16_to_cpu(ddqp->d_iwarns);
- dqp->q_rtb.warnings = be16_to_cpu(ddqp->d_rtbwarns);
-
dqp->q_blk.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_btimer);
dqp->q_ino.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_itimer);
dqp->q_rtb.timer = xfs_dquot_from_disk_ts(ddqp, ddqp->d_rtbtimer);
@@ -634,9 +630,9 @@ xfs_dquot_to_disk(
ddqp->d_icount = cpu_to_be64(dqp->q_ino.count);
ddqp->d_rtbcount = cpu_to_be64(dqp->q_rtb.count);
- ddqp->d_bwarns = cpu_to_be16(dqp->q_blk.warnings);
- ddqp->d_iwarns = cpu_to_be16(dqp->q_ino.warnings);
- ddqp->d_rtbwarns = cpu_to_be16(dqp->q_rtb.warnings);
+ ddqp->d_bwarns = 0;
+ ddqp->d_iwarns = 0;
+ ddqp->d_rtbwarns = 0;
ddqp->d_btimer = xfs_dquot_to_disk_ts(dqp, dqp->q_blk.timer);
ddqp->d_itimer = xfs_dquot_to_disk_ts(dqp, dqp->q_ino.timer);
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 6b5e3cf40c8b..80c8f851a2f3 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -44,14 +44,6 @@ struct xfs_dquot_res {
* in seconds since the Unix epoch.
*/
time64_t timer;
-
- /*
- * For root dquots, this is the maximum number of warnings that will
- * be issued for this quota type. Otherwise, this is the number of
- * warnings issued against this quota. Note that none of this is
- * implemented.
- */
- xfs_qwarncnt_t warnings;
};
static inline bool
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 749fd18c4f32..296faa41d81d 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -57,6 +57,9 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_REDUCE_MAX_IEXTENTS,
XFS_RANDOM_BMAP_ALLOC_MINLEN_EXTENT,
XFS_RANDOM_AG_RESV_FAIL,
+ XFS_RANDOM_LARP,
+ XFS_RANDOM_DA_LEAF_SPLIT,
+ XFS_RANDOM_ATTR_LEAF_TO_NODE,
};
struct xfs_errortag_attr {
@@ -170,6 +173,9 @@ XFS_ERRORTAG_ATTR_RW(buf_ioerror, XFS_ERRTAG_BUF_IOERROR);
XFS_ERRORTAG_ATTR_RW(reduce_max_iextents, XFS_ERRTAG_REDUCE_MAX_IEXTENTS);
XFS_ERRORTAG_ATTR_RW(bmap_alloc_minlen_extent, XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT);
XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
+XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP);
+XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
+XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -211,6 +217,9 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(reduce_max_iextents),
XFS_ERRORTAG_ATTR_LIST(bmap_alloc_minlen_extent),
XFS_ERRORTAG_ATTR_LIST(ag_resv_fail),
+ XFS_ERRORTAG_ATTR_LIST(larp),
+ XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
+ XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
NULL,
};
ATTRIBUTE_GROUPS(xfs_errortag);
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 5735d5ea87ee..5191e9145e55 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -64,16 +64,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
* XFS panic tags -- allow a call to xfs_alert_tag() be turned into
* a panic by setting xfs_panic_mask in a sysctl.
*/
-#define XFS_NO_PTAG 0
-#define XFS_PTAG_IFLUSH 0x00000001
-#define XFS_PTAG_LOGRES 0x00000002
-#define XFS_PTAG_AILDELETE 0x00000004
-#define XFS_PTAG_ERROR_REPORT 0x00000008
-#define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010
-#define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020
-#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
-#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
-#define XFS_PTAG_VERIFIER_ERROR 0x00000100
+#define XFS_NO_PTAG 0u
+#define XFS_PTAG_IFLUSH (1u << 0)
+#define XFS_PTAG_LOGRES (1u << 1)
+#define XFS_PTAG_AILDELETE (1u << 2)
+#define XFS_PTAG_ERROR_REPORT (1u << 3)
+#define XFS_PTAG_SHUTDOWN_CORRUPT (1u << 4)
+#define XFS_PTAG_SHUTDOWN_IOERROR (1u << 5)
+#define XFS_PTAG_SHUTDOWN_LOGERROR (1u << 6)
+#define XFS_PTAG_FSBLOCK_ZERO (1u << 7)
+#define XFS_PTAG_VERIFIER_ERROR (1u << 8)
#define XFS_PTAG_STRINGS \
{ XFS_NO_PTAG, "none" }, \
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 0e50f2c9348e..765be054dffe 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -58,10 +58,11 @@ xfs_efi_release(
struct xfs_efi_log_item *efip)
{
ASSERT(atomic_read(&efip->efi_refcount) > 0);
- if (atomic_dec_and_test(&efip->efi_refcount)) {
- xfs_trans_ail_delete(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_efi_item_free(efip);
- }
+ if (!atomic_dec_and_test(&efip->efi_refcount))
+ return;
+
+ xfs_trans_ail_delete(&efip->efi_item, 0);
+ xfs_efi_item_free(efip);
}
/*
@@ -306,11 +307,20 @@ xfs_efd_item_release(
xfs_efd_item_free(efdp);
}
+/* Return the log item of the EFI that this EFD was created for. */
+static struct xfs_log_item *
+xfs_efd_item_intent(
+ struct xfs_log_item *lip)
+{
+ struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+
+ return &efdp->efd_efip->efi_item;
+}
+
+/* Operations vector for the EFD log item, the done counterpart of an EFI. */
static const struct xfs_item_ops xfs_efd_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_efd_item_size,
.iop_format = xfs_efd_item_format,
.iop_release = xfs_efd_item_release,
+ .iop_intent = xfs_efd_item_intent,
};
/*
@@ -380,7 +390,7 @@ xfs_trans_free_extent(
* 1.) releases the EFI and frees the EFD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
next_extent = efdp->efd_next_extent;
@@ -688,6 +698,7 @@ xfs_efi_item_relog(
}
static const struct xfs_item_ops xfs_efi_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_efi_item_size,
.iop_format = xfs_efi_item_format,
.iop_unpin = xfs_efi_item_unpin,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 85c412107a10..a60632ecc3f0 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -310,7 +310,7 @@ STATIC ssize_t
xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
- int *iolock)
+ unsigned int *iolock)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -513,7 +513,7 @@ xfs_file_dio_write_aligned(
struct kiocb *iocb,
struct iov_iter *from)
{
- int iolock = XFS_IOLOCK_SHARED;
+ unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
ret = xfs_ilock_iocb(iocb, iolock);
@@ -566,7 +566,7 @@ xfs_file_dio_write_unaligned(
{
size_t isize = i_size_read(VFS_I(ip));
size_t count = iov_iter_count(from);
- int iolock = XFS_IOLOCK_SHARED;
+ unsigned int iolock = XFS_IOLOCK_SHARED;
unsigned int flags = IOMAP_DIO_OVERWRITE_ONLY;
ssize_t ret;
@@ -655,7 +655,7 @@ xfs_file_dax_write(
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
- int iolock = XFS_IOLOCK_EXCL;
+ unsigned int iolock = XFS_IOLOCK_EXCL;
ssize_t ret, error = 0;
loff_t pos;
@@ -694,13 +694,11 @@ xfs_file_buffered_write(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
bool cleared_space = false;
- int iolock;
+ unsigned int iolock;
if (iocb->ki_flags & IOCB_NOWAIT)
return -EOPNOTSUPP;
@@ -767,9 +765,7 @@ xfs_file_write_iter(
struct kiocb *iocb,
struct iov_iter *from)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
size_t ocount = iov_iter_count(from);
@@ -1167,12 +1163,10 @@ xfs_file_open(
struct inode *inode,
struct file *file)
{
- if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
- return -EFBIG;
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
- return 0;
+ return generic_file_open(inode, file);
}
STATIC int
@@ -1181,7 +1175,7 @@ xfs_dir_open(
struct file *file)
{
struct xfs_inode *ip = XFS_I(inode);
- int mode;
+ unsigned int mode;
int error;
error = xfs_file_open(inode, file);
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 6a3ce0f6dc9e..be9bcf8a1f99 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -128,11 +128,12 @@ xfs_filestream_pick_ag(
if (!pag->pagf_init) {
err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
if (err) {
- xfs_perag_put(pag);
- if (err != -EAGAIN)
+ if (err != -EAGAIN) {
+ xfs_perag_put(pag);
return err;
+ }
/* Couldn't lock the AGF, skip this AG. */
- continue;
+ goto next_ag;
}
}
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 10e1cb71439e..bb23199f65c3 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -450,11 +450,11 @@ xfs_getfsmap_logdev(
/* Transform a rtbitmap "record" into a fsmap */
STATIC int
xfs_getfsmap_rtdev_rtbitmap_helper(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv)
{
- struct xfs_mount *mp = tp->t_mountp;
struct xfs_getfsmap_info *info = priv;
struct xfs_rmap_irec irec;
xfs_daddr_t rec_daddr;
@@ -535,7 +535,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
do_div(alow.ar_startext, mp->m_sb.sb_rextsize);
if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize))
ahigh.ar_startext++;
- error = xfs_rtalloc_query_range(tp, &alow, &ahigh,
+ error = xfs_rtalloc_query_range(mp, tp, &alow, &ahigh,
xfs_getfsmap_rtdev_rtbitmap_helper, info);
if (error)
goto err;
@@ -547,7 +547,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
info->last = true;
ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext);
- error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info);
+ error = xfs_getfsmap_rtdev_rtbitmap_helper(mp, tp, &ahigh, info);
if (error)
goto err;
err:
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 68f74549fa22..888839e75d11 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -349,10 +349,7 @@ xfs_fs_counts(
cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
xfs_fdblocks_unavailable(mp);
-
- spin_lock(&mp->m_sb_lock);
- cnt->freertx = mp->m_sb.sb_frextents;
- spin_unlock(&mp->m_sb_lock);
+ cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
}
/*
@@ -512,7 +509,7 @@ xfs_fs_goingdown(
void
xfs_do_force_shutdown(
struct xfs_mount *mp,
- int flags,
+ uint32_t flags,
char *fname,
int lnnum)
{
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index f62fa652c2fd..4d0a98f920ca 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -41,5 +41,6 @@ struct xfs_globals xfs_globals = {
#endif
#ifdef DEBUG
.pwork_threads = -1, /* automatic thread detection */
+ .larp = false, /* log attribute replay */
#endif
};
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index bffd6eb0b298..5269354b1b69 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1916,13 +1916,16 @@ xfs_inodegc_want_queue_rt_file(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
- uint64_t freertx;
if (!XFS_IS_REALTIME_INODE(ip))
return false;
- freertx = READ_ONCE(mp->m_sb.sb_frextents);
- return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT];
+ if (__percpu_counter_compare(&mp->m_frextents,
+ mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
+ XFS_FDBLOCKS_BATCH) < 0)
+ return true;
+
+ return false;
}
#else
# define xfs_inodegc_want_queue_rt_file(ip) (false)
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 508e184e3b8f..b05314d48176 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,6 +63,7 @@ STATIC void
xfs_icreate_item_release(
struct xfs_log_item *lip)
{
+ kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 39ae53efb3ab..b2879870a17e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -416,10 +416,12 @@ xfs_lockdep_subclass_ok(
* parent locking. Care must be taken to ensure we don't overrun the subclass
* storage fields in the class mask we build.
*/
-static inline int
-xfs_lock_inumorder(int lock_mode, int subclass)
+static inline uint
+xfs_lock_inumorder(
+ uint lock_mode,
+ uint subclass)
{
- int class = 0;
+ uint class = 0;
ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
XFS_ILOCK_RTSUM)));
@@ -464,7 +466,10 @@ xfs_lock_inodes(
int inodes,
uint lock_mode)
{
- int attempts = 0, i, j, try_lock;
+ int attempts = 0;
+ uint i;
+ int j;
+ bool try_lock;
struct xfs_log_item *lp;
/*
@@ -489,9 +494,9 @@ xfs_lock_inodes(
} else if (lock_mode & XFS_MMAPLOCK_EXCL)
ASSERT(!(lock_mode & XFS_ILOCK_EXCL));
- try_lock = 0;
- i = 0;
again:
+ try_lock = false;
+ i = 0;
for (; i < inodes; i++) {
ASSERT(ips[i]);
@@ -506,7 +511,7 @@ again:
for (j = (i - 1); j >= 0 && !try_lock; j--) {
lp = &ips[j]->i_itemp->ili_item;
if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
- try_lock++;
+ try_lock = true;
}
}
@@ -546,8 +551,6 @@ again:
if ((attempts % 5) == 0) {
delay(1); /* Don't just spin the CPU */
}
- i = 0;
- try_lock = 0;
goto again;
}
}
@@ -1024,11 +1027,6 @@ xfs_create(
xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
unlock_dp_on_error = true;
- error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto out_trans_cancel;
-
/*
* A newly created regular or special file just has one directory
* entry pointing to them, but a directory also the "." entry
@@ -1242,11 +1240,6 @@ xfs_link(
if (error)
goto std_return;
- error = xfs_iext_count_may_overflow(tdp, XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto error_return;
-
/*
* If we are using project inheritance, we only allow hard link
* creation in our tree when the project IDs are the same; else
@@ -3212,35 +3205,6 @@ retry:
/*
* Check for expected errors before we dirty the transaction
* so we can return an error without a transaction abort.
- *
- * Extent count overflow check:
- *
- * From the perspective of src_dp, a rename operation is essentially a
- * directory entry remove operation. Hence the only place where we check
- * for extent count overflow for src_dp is in
- * xfs_bmap_del_extent_real(). xfs_bmap_del_extent_real() returns
- * -ENOSPC when it detects a possible extent count overflow and in
- * response, the higher layers of directory handling code do the
- * following:
- * 1. Data/Free blocks: XFS lets these blocks linger until a
- * future remove operation removes them.
- * 2. Dabtree blocks: XFS swaps the blocks with the last block in the
- * Leaf space and unmaps the last block.
- *
- * For target_dp, there are two cases depending on whether the
- * destination directory entry exists or not.
- *
- * When destination directory entry does not exist (i.e. target_ip ==
- * NULL), extent count overflow check is performed only when transaction
- * has a non-zero sized space reservation associated with it. With a
- * zero-sized space reservation, XFS allows a rename operation to
- * continue only when the directory has sufficient free space in its
- * data/leaf/free space blocks to hold the new entry.
- *
- * When destination directory entry exists (i.e. target_ip != NULL), all
- * we need to do is change the inode number associated with the already
- * existing entry. Hence there is no need to perform an extent count
- * overflow check.
*/
if (target_ip == NULL) {
/*
@@ -3251,12 +3215,6 @@ retry:
error = xfs_dir_canenter(tp, target_dp, target_name);
if (error)
goto out_trans_cancel;
- } else {
- error = xfs_iext_count_may_overflow(target_dp,
- XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto out_trans_cancel;
}
} else {
/*
@@ -3424,18 +3382,12 @@ retry:
* inode number of the whiteout inode rather than removing it
* altogether.
*/
- if (wip) {
+ if (wip)
error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
spaceres);
- } else {
- /*
- * NOTE: We don't need to check for extent count overflow here
- * because the dir remove name code will leave the dir block in
- * place if the extent count would overflow.
- */
+ else
error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
spaceres);
- }
if (error)
goto out_trans_cancel;
@@ -3517,8 +3469,8 @@ xfs_iflush(
if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp) >
ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) {
xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
- "%s: detected corrupt incore inode %Lu, "
- "total extents = %d, nblocks = %Ld, ptr "PTR_FMT,
+ "%s: detected corrupt incore inode %llu, "
+ "total extents = %llu nblocks = %lld, ptr "PTR_FMT,
__func__, ip->i_ino,
ip->i_df.if_nextents + xfs_ifork_nextents(ip->i_afp),
ip->i_nblocks, ip);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 740ab13d1aa2..7be6f8e705ab 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -218,6 +218,11 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
return ip->i_diflags2 & XFS_DIFLAG2_BIGTIME;
}
+/* Is the NREXT64 flag set in this inode's diflags2? */
+static inline bool
+xfs_inode_has_large_extent_counts(
+ struct xfs_inode *ip)
+{
+ return (ip->i_diflags2 & XFS_DIFLAG2_NREXT64) != 0;
+}
+
/*
* Return the buftarg used for data allocations on a given inode.
*/
@@ -278,12 +283,12 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
* 1<<16 - 1<<32-1 -- lockdep annotation (integers)
*/
-#define XFS_IOLOCK_EXCL (1<<0)
-#define XFS_IOLOCK_SHARED (1<<1)
-#define XFS_ILOCK_EXCL (1<<2)
-#define XFS_ILOCK_SHARED (1<<3)
-#define XFS_MMAPLOCK_EXCL (1<<4)
-#define XFS_MMAPLOCK_SHARED (1<<5)
+#define XFS_IOLOCK_EXCL (1u << 0)
+#define XFS_IOLOCK_SHARED (1u << 1)
+#define XFS_ILOCK_EXCL (1u << 2)
+#define XFS_ILOCK_SHARED (1u << 3)
+#define XFS_MMAPLOCK_EXCL (1u << 4)
+#define XFS_MMAPLOCK_SHARED (1u << 5)
#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
| XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
@@ -350,19 +355,19 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip)
*/
#define XFS_IOLOCK_SHIFT 16
#define XFS_IOLOCK_MAX_SUBCLASS 3
-#define XFS_IOLOCK_DEP_MASK 0x000f0000
+#define XFS_IOLOCK_DEP_MASK 0x000f0000u
#define XFS_MMAPLOCK_SHIFT 20
#define XFS_MMAPLOCK_NUMORDER 0
#define XFS_MMAPLOCK_MAX_SUBCLASS 3
-#define XFS_MMAPLOCK_DEP_MASK 0x00f00000
+#define XFS_MMAPLOCK_DEP_MASK 0x00f00000u
#define XFS_ILOCK_SHIFT 24
-#define XFS_ILOCK_PARENT_VAL 5
+#define XFS_ILOCK_PARENT_VAL 5u
#define XFS_ILOCK_MAX_SUBCLASS (XFS_ILOCK_PARENT_VAL - 1)
-#define XFS_ILOCK_RTBITMAP_VAL 6
-#define XFS_ILOCK_RTSUM_VAL 7
-#define XFS_ILOCK_DEP_MASK 0xff000000
+#define XFS_ILOCK_RTBITMAP_VAL 6u
+#define XFS_ILOCK_RTSUM_VAL 7u
+#define XFS_ILOCK_DEP_MASK 0xff000000u
#define XFS_ILOCK_PARENT (XFS_ILOCK_PARENT_VAL << XFS_ILOCK_SHIFT)
#define XFS_ILOCK_RTBITMAP (XFS_ILOCK_RTBITMAP_VAL << XFS_ILOCK_SHIFT)
#define XFS_ILOCK_RTSUM (XFS_ILOCK_RTSUM_VAL << XFS_ILOCK_SHIFT)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 9e6ef55cf29e..721def0639fd 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -71,7 +71,7 @@ xfs_inode_item_data_fork_size(
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- *nbytes += roundup(ip->i_df.if_bytes, 4);
+ *nbytes += xlog_calc_iovec_len(ip->i_df.if_bytes);
*nvecs += 1;
}
break;
@@ -112,7 +112,7 @@ xfs_inode_item_attr_fork_size(
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_afp->if_bytes > 0) {
- *nbytes += roundup(ip->i_afp->if_bytes, 4);
+ *nbytes += xlog_calc_iovec_len(ip->i_afp->if_bytes);
*nvecs += 1;
}
break;
@@ -204,17 +204,12 @@ xfs_inode_item_format_data_fork(
~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- /*
- * Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed
- * to be there by xfs_idata_realloc().
- */
- data_bytes = roundup(ip->i_df.if_bytes, 4);
ASSERT(ip->i_df.if_u1.if_data != NULL);
ASSERT(ip->i_disk_size > 0);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
- ip->i_df.if_u1.if_data, data_bytes);
- ilf->ilf_dsize = (unsigned)data_bytes;
+ ip->i_df.if_u1.if_data,
+ ip->i_df.if_bytes);
+ ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_DDATA;
@@ -288,17 +283,11 @@ xfs_inode_item_format_attr_fork(
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_afp->if_bytes > 0) {
- /*
- * Round i_bytes up to a word boundary.
- * The underlying memory is guaranteed
- * to be there by xfs_idata_realloc().
- */
- data_bytes = roundup(ip->i_afp->if_bytes, 4);
ASSERT(ip->i_afp->if_u1.if_data != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
ip->i_afp->if_u1.if_data,
- data_bytes);
- ilf->ilf_asize = (unsigned)data_bytes;
+ ip->i_afp->if_bytes);
+ ilf->ilf_asize = (unsigned)ip->i_afp->if_bytes;
ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ADATA;
@@ -359,6 +348,21 @@ xfs_copy_dm_fields_to_log_dinode(
}
}
+/*
+ * Store the data and attr fork extent counts in the log dinode.  Inodes with
+ * large extent counts use the di_big_* fields and zero the nrext64 pad; all
+ * other inodes use di_nextents/di_anextents.
+ */
+static inline void
+xfs_inode_to_log_dinode_iext_counters(
+ struct xfs_inode *ip,
+ struct xfs_log_dinode *to)
+{
+ if (!xfs_inode_has_large_extent_counts(ip)) {
+ to->di_nextents = xfs_ifork_nextents(&ip->i_df);
+ to->di_anextents = xfs_ifork_nextents(ip->i_afp);
+ return;
+ }
+
+ to->di_big_nextents = xfs_ifork_nextents(&ip->i_df);
+ to->di_big_anextents = xfs_ifork_nextents(ip->i_afp);
+ to->di_nrext64_pad = 0;
+}
+
static void
xfs_inode_to_log_dinode(
struct xfs_inode *ip,
@@ -374,7 +378,6 @@ xfs_inode_to_log_dinode(
to->di_projid_lo = ip->i_projid & 0xffff;
to->di_projid_hi = ip->i_projid >> 16;
- memset(to->di_pad, 0, sizeof(to->di_pad));
memset(to->di_pad3, 0, sizeof(to->di_pad3));
to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
@@ -386,8 +389,6 @@ xfs_inode_to_log_dinode(
to->di_size = ip->i_disk_size;
to->di_nblocks = ip->i_nblocks;
to->di_extsize = ip->i_extsize;
- to->di_nextents = xfs_ifork_nextents(&ip->i_df);
- to->di_anextents = xfs_ifork_nextents(ip->i_afp);
to->di_forkoff = ip->i_forkoff;
to->di_aformat = xfs_ifork_format(ip->i_afp);
to->di_flags = ip->i_diflags;
@@ -407,11 +408,14 @@ xfs_inode_to_log_dinode(
to->di_lsn = lsn;
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
- to->di_flushiter = 0;
+ to->di_v3_pad = 0;
} else {
to->di_version = 2;
to->di_flushiter = ip->i_flushiter;
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
+
+ xfs_inode_to_log_dinode_iext_counters(ip, to);
}
/*
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 239dd2e3384e..d28ffaebd067 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -142,6 +142,29 @@ xfs_log_dinode_to_disk_ts(
return ts;
}
+/* True for a version 3 or later log dinode with NREXT64 set in di_flags2. */
+static inline bool
+xfs_log_dinode_has_large_extent_counts(
+ const struct xfs_log_dinode *ld)
+{
+ if (ld->di_version < 3)
+ return false;
+
+ return (ld->di_flags2 & XFS_DIFLAG2_NREXT64) != 0;
+}
+
+/*
+ * Convert the log dinode's extent counters to big-endian and store them in
+ * the on-disk inode, using the field set that matches the log dinode's
+ * large extent count state.
+ */
+static inline void
+xfs_log_dinode_to_disk_iext_counters(
+ struct xfs_log_dinode *from,
+ struct xfs_dinode *to)
+{
+ if (!xfs_log_dinode_has_large_extent_counts(from)) {
+ to->di_nextents = cpu_to_be32(from->di_nextents);
+ to->di_anextents = cpu_to_be16(from->di_anextents);
+ return;
+ }
+
+ to->di_big_nextents = cpu_to_be64(from->di_big_nextents);
+ to->di_big_anextents = cpu_to_be32(from->di_big_anextents);
+ to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad);
+}
+
STATIC void
xfs_log_dinode_to_disk(
struct xfs_log_dinode *from,
@@ -158,7 +181,6 @@ xfs_log_dinode_to_disk(
to->di_nlink = cpu_to_be32(from->di_nlink);
to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
- memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime);
to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime);
@@ -167,8 +189,6 @@ xfs_log_dinode_to_disk(
to->di_size = cpu_to_be64(from->di_size);
to->di_nblocks = cpu_to_be64(from->di_nblocks);
to->di_extsize = cpu_to_be32(from->di_extsize);
- to->di_nextents = cpu_to_be32(from->di_nextents);
- to->di_anextents = cpu_to_be16(from->di_anextents);
to->di_forkoff = from->di_forkoff;
to->di_aformat = from->di_aformat;
to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
@@ -184,12 +204,66 @@ xfs_log_dinode_to_disk(
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
to->di_lsn = cpu_to_be64(lsn);
- memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
+ memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &from->di_uuid);
- to->di_flushiter = 0;
+ to->di_v3_pad = 0;
} else {
to->di_flushiter = cpu_to_be16(from->di_flushiter);
+ memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
}
+
+ xfs_log_dinode_to_disk_iext_counters(from, to);
+}
+
+STATIC int
+xlog_dinode_verify_extent_counts(
+ struct xfs_mount *mp,
+ struct xfs_log_dinode *ldip)
+{
+ xfs_extnum_t nextents;
+ xfs_aextnum_t anextents;
+
+ if (xfs_log_dinode_has_large_extent_counts(ldip)) {
+ if (!xfs_has_large_extent_counts(mp) ||
+ (ldip->di_nrext64_pad != 0)) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode large extent count format",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, large extent counts %d, padding 0x%x",
+ ldip->di_ino, xfs_has_large_extent_counts(mp),
+ ldip->di_nrext64_pad);
+ return -EFSCORRUPTED;
+ }
+
+ nextents = ldip->di_big_nextents;
+ anextents = ldip->di_big_anextents;
+ } else {
+ if (ldip->di_version == 3 && ldip->di_v3_pad != 0) {
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode di_v3_pad",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, di_v3_pad 0x%llx",
+ ldip->di_ino, ldip->di_v3_pad);
+ return -EFSCORRUPTED;
+ }
+
+ nextents = ldip->di_nextents;
+ anextents = ldip->di_anextents;
+ }
+
+ if (unlikely(nextents + anextents > ldip->di_nblocks)) {
+ XFS_CORRUPTION_ERROR("Bad log dinode extent counts",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
+ xfs_alert(mp,
+ "Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx",
+ ldip->di_ino, xfs_has_large_extent_counts(mp), nextents,
+ anextents, ldip->di_nblocks);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
}
STATIC int
@@ -317,13 +391,12 @@ xlog_recover_inode_commit_pass2(
if (unlikely(S_ISREG(ldip->di_mode))) {
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
(ldip->di_format != XFS_DINODE_FMT_BTREE)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode data fork format for regular file",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad regular inode log record, rec ptr "PTR_FMT", "
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
- __func__, item, dip, bp, in_f->ilf_ino);
+ "Bad inode 0x%llx, data fork format 0x%x",
+ in_f->ilf_ino, ldip->di_format);
error = -EFSCORRUPTED;
goto out_release;
}
@@ -331,49 +404,37 @@ xlog_recover_inode_commit_pass2(
if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
(ldip->di_format != XFS_DINODE_FMT_BTREE) &&
(ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR(
+ "Bad log dinode data fork format for directory",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad dir inode log record, rec ptr "PTR_FMT", "
- "ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
- __func__, item, dip, bp, in_f->ilf_ino);
+ "Bad inode 0x%llx, data fork format 0x%x",
+ in_f->ilf_ino, ldip->di_format);
error = -EFSCORRUPTED;
goto out_release;
}
}
- if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
- xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
- "dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
- __func__, item, dip, bp, in_f->ilf_ino,
- ldip->di_nextents + ldip->di_anextents,
- ldip->di_nblocks);
- error = -EFSCORRUPTED;
+
+ error = xlog_dinode_verify_extent_counts(mp, ldip);
+ if (error)
goto out_release;
- }
+
if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR("Bad log dinode fork offset",
+ XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
- "dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
- item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
+ "Bad inode 0x%llx, di_forkoff 0x%x",
+ in_f->ilf_ino, ldip->di_forkoff);
error = -EFSCORRUPTED;
goto out_release;
}
isize = xfs_log_dinode_size(mp);
if (unlikely(item->ri_buf[1].i_len > isize)) {
- XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
- XFS_ERRLEVEL_LOW, mp, ldip,
- sizeof(*ldip));
+ XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW,
+ mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "%s: Bad inode log record length %d, rec ptr "PTR_FMT,
- __func__, item->ri_buf[1].i_len, item);
+ "Bad inode 0x%llx log dinode size 0x%x",
+ in_f->ilf_ino, item->ri_buf[1].i_len);
error = -EFSCORRUPTED;
goto out_release;
}
@@ -401,7 +462,7 @@ xlog_recover_inode_commit_pass2(
ASSERT(in_f->ilf_size <= 4);
ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
ASSERT(!(fields & XFS_ILOG_DFORK) ||
- (len == in_f->ilf_dsize));
+ (len == xlog_calc_iovec_len(in_f->ilf_dsize)));
switch (fields & XFS_ILOG_DFORK) {
case XFS_ILOG_DDATA:
@@ -436,7 +497,7 @@ xlog_recover_inode_commit_pass2(
}
len = item->ri_buf[attr_index].i_len;
src = item->ri_buf[attr_index].i_addr;
- ASSERT(len == in_f->ilf_asize);
+ ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize));
switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
case XFS_ILOG_ADATA:
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 83481005317a..0e5cb7936206 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -15,6 +15,8 @@
#include "xfs_iwalk.h"
#include "xfs_itable.h"
#include "xfs_error.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -35,8 +37,6 @@
#include "xfs_health.h"
#include "xfs_reflink.h"
#include "xfs_ioctl.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
#include <linux/mount.h>
#include <linux/namei.h>
@@ -813,6 +813,9 @@ xfs_bulk_ireq_setup(
if (XFS_INO_TO_AGNO(mp, breq->startino) >= mp->m_sb.sb_agcount)
return -ECANCELED;
+ if (hdr->flags & XFS_BULK_IREQ_NREXT64)
+ breq->flags |= XFS_IBULK_NREXT64;
+
return 0;
}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index ca25ed89b706..2f54b701eead 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -17,6 +17,8 @@
#include "xfs_itable.h"
#include "xfs_fsops.h"
#include "xfs_rtalloc.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_ioctl.h"
#include "xfs_ioctl32.h"
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index e552ce541ec2..5a393259a3a3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -251,6 +251,8 @@ xfs_iomap_write_direct(
return error;
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, nr_exts);
if (error)
goto out_trans_cancel;
@@ -402,7 +404,7 @@ xfs_iomap_prealloc_size(
*/
plen = prev.br_blockcount;
while (xfs_iext_prev_extent(ifp, &ncur, &got)) {
- if (plen > MAXEXTLEN / 2 ||
+ if (plen > XFS_MAX_BMBT_EXTLEN / 2 ||
isnullstartblock(got.br_startblock) ||
got.br_startoff + got.br_blockcount != prev.br_startoff ||
got.br_startblock + got.br_blockcount != prev.br_startblock)
@@ -414,23 +416,23 @@ xfs_iomap_prealloc_size(
/*
* If the size of the extents is greater than half the maximum extent
* length, then use the current offset as the basis. This ensures that
- * for large files the preallocation size always extends to MAXEXTLEN
- * rather than falling short due to things like stripe unit/width
- * alignment of real extents.
+ * for large files the preallocation size always extends to
+ * XFS_MAX_BMBT_EXTLEN rather than falling short due to things like stripe
+ * unit/width alignment of real extents.
*/
alloc_blocks = plen * 2;
- if (alloc_blocks > MAXEXTLEN)
+ if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
alloc_blocks = XFS_B_TO_FSB(mp, offset);
qblocks = alloc_blocks;
/*
- * MAXEXTLEN is not a power of two value but we round the prealloc down
- * to the nearest power of two value after throttling. To prevent the
- * round down from unconditionally reducing the maximum supported
- * prealloc size, we round up first, apply appropriate throttling,
- * round down and cap the value to MAXEXTLEN.
+ * XFS_MAX_BMBT_EXTLEN is not a power of two value but we round the prealloc
+ * down to the nearest power of two value after throttling. To prevent
+ * the round down from unconditionally reducing the maximum supported
+ * prealloc size, we round up first, apply appropriate throttling, round
+ * down and cap the value to XFS_MAX_BMBT_EXTLEN.
*/
- alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
+ alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN),
alloc_blocks);
freesp = percpu_counter_read_positive(&mp->m_fdblocks);
@@ -478,14 +480,14 @@ xfs_iomap_prealloc_size(
*/
if (alloc_blocks)
alloc_blocks = rounddown_pow_of_two(alloc_blocks);
- if (alloc_blocks > MAXEXTLEN)
- alloc_blocks = MAXEXTLEN;
+ if (alloc_blocks > XFS_MAX_BMBT_EXTLEN)
+ alloc_blocks = XFS_MAX_BMBT_EXTLEN;
/*
* If we are still trying to allocate more space than is
* available, squash the prealloc hard. This can happen if we
* have a large file on a small filesystem and the above
- * lowspace thresholds are smaller than MAXEXTLEN.
+ * lowspace thresholds are smaller than XFS_MAX_BMBT_EXTLEN.
*/
while (alloc_blocks && alloc_blocks >= freesp)
alloc_blocks >>= 4;
@@ -555,6 +557,9 @@ xfs_iomap_write_unwritten(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_WRITE_UNWRITTEN_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_WRITE_UNWRITTEN_CNT);
if (error)
goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index b34e8e4344a8..e912b7fee714 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -13,6 +13,8 @@
#include "xfs_inode.h"
#include "xfs_acl.h"
#include "xfs_quota.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_trans.h"
#include "xfs_trace.h"
@@ -209,7 +211,6 @@ xfs_generic_create(
if (unlikely(error))
goto out_cleanup_inode;
-#ifdef CONFIG_XFS_POSIX_ACL
if (default_acl) {
error = __xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
@@ -220,7 +221,6 @@ xfs_generic_create(
if (error)
goto out_cleanup_inode;
}
-#endif
xfs_setup_iops(ip);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index c08c79d9e311..f74c9fff72bb 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -64,6 +64,7 @@ xfs_bulkstat_one_int(
struct xfs_inode *ip; /* incore inode pointer */
struct inode *inode;
struct xfs_bulkstat *buf = bc->buf;
+ xfs_extnum_t nextents;
int error = -EINVAL;
if (xfs_internal_inum(mp, ino))
@@ -102,7 +103,13 @@ xfs_bulkstat_one_int(
buf->bs_xflags = xfs_ip2xflags(ip);
buf->bs_extsize_blks = ip->i_extsize;
- buf->bs_extents = xfs_ifork_nextents(&ip->i_df);
+
+ nextents = xfs_ifork_nextents(&ip->i_df);
+ if (!(bc->breq->flags & XFS_IBULK_NREXT64))
+ buf->bs_extents = min(nextents, XFS_MAX_EXTCNT_DATA_FORK_SMALL);
+ else
+ buf->bs_extents64 = nextents;
+
xfs_bulkstat_health(ip, buf);
buf->bs_aextents = xfs_ifork_nextents(ip->i_afp);
buf->bs_forkoff = XFS_IFORK_BOFF(ip);
@@ -256,6 +263,7 @@ xfs_bulkstat(
.breq = breq,
};
struct xfs_trans *tp;
+ unsigned int iwalk_flags = 0;
int error;
if (breq->mnt_userns != &init_user_ns) {
@@ -279,7 +287,10 @@ xfs_bulkstat(
if (error)
goto out;
- error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags,
+ if (breq->flags & XFS_IBULK_SAME_AG)
+ iwalk_flags |= XFS_IWALK_SAME_AG;
+
+ error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags,
xfs_bulkstat_iwalk, breq->icount, &bc);
xfs_trans_cancel(tp);
out:
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 7078d10c9b12..e2d0eba43f35 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -17,7 +17,10 @@ struct xfs_ibulk {
};
/* Only iterate within the same AG as startino */
-#define XFS_IBULK_SAME_AG (XFS_IWALK_SAME_AG)
+#define XFS_IBULK_SAME_AG (1U << 0)
+
+/* Fill out the bs_extents64 field if set. */
+#define XFS_IBULK_NREXT64 (1U << 1)
/*
* Advance the user buffer pointer by one record of the given size. If the
diff --git a/fs/xfs/xfs_iwalk.h b/fs/xfs/xfs_iwalk.h
index 37a795f03267..83699089755e 100644
--- a/fs/xfs/xfs_iwalk.h
+++ b/fs/xfs/xfs_iwalk.h
@@ -26,7 +26,7 @@ int xfs_iwalk_threaded(struct xfs_mount *mp, xfs_ino_t startino,
unsigned int inode_records, bool poll, void *data);
/* Only iterate inodes within the same AG as @startino. */
-#define XFS_IWALK_SAME_AG (0x1)
+#define XFS_IWALK_SAME_AG (1U << 0)
#define XFS_IWALK_FLAGS_ALL (XFS_IWALK_SAME_AG)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 499e15b24215..9dc748abdf33 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -49,7 +49,6 @@ xlog_state_get_iclog_space(
int len,
struct xlog_in_core **iclog,
struct xlog_ticket *ticket,
- int *continued_write,
int *logoffsetp);
STATIC void
xlog_grant_push_ail(
@@ -61,10 +60,6 @@ xlog_sync(
struct xlog_in_core *iclog);
#if defined(DEBUG)
STATIC void
-xlog_verify_dest_ptr(
- struct xlog *log,
- void *ptr);
-STATIC void
xlog_verify_grant_tail(
struct xlog *log);
STATIC void
@@ -77,7 +72,6 @@ xlog_verify_tail_lsn(
struct xlog *log,
struct xlog_in_core *iclog);
#else
-#define xlog_verify_dest_ptr(a,b)
#define xlog_verify_grant_tail(a)
#define xlog_verify_iclog(a,b,c)
#define xlog_verify_tail_lsn(a,b)
@@ -90,6 +84,62 @@ xlog_iclogs_empty(
static int
xfs_log_cover(struct xfs_mount *);
+/*
+ * We need to make sure the buffer pointer returned is naturally aligned for the
+ * biggest basic data type we put into it. We have already accounted for this
+ * padding when sizing the buffer.
+ *
+ * However, this padding does not get written into the log, and hence we have to
+ * track the space used by the log vectors separately to prevent log space hangs
+ * due to inaccurate accounting (i.e. a leak) of the used log space through the
+ * CIL context ticket.
+ *
+ * We also add space for the xlog_op_header that describes this region in the
+ * log. This prepends the data region we return to the caller to copy their data
+ * into, so do all the static initialisation of the ophdr now. Because the ophdr
+ * is not 8 byte aligned, we have to be careful to ensure that we align the
+ * start of the buffer such that the region we return to the caller is 8 byte
+ * aligned and packed against the tail of the ophdr.
+ */
+void *
+xlog_prepare_iovec(
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
+ uint type)
+{
+ struct xfs_log_iovec *vec = *vecp;
+ struct xlog_op_header *oph;
+ uint32_t len;
+ void *buf;
+
+ if (vec) {
+ ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
+ vec++;
+ } else {
+ vec = &lv->lv_iovecp[0];
+ }
+
+ len = lv->lv_buf_len + sizeof(struct xlog_op_header);
+ if (!IS_ALIGNED(len, sizeof(uint64_t))) {
+ lv->lv_buf_len = round_up(len, sizeof(uint64_t)) -
+ sizeof(struct xlog_op_header);
+ }
+
+ vec->i_type = type;
+ vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+
+ oph = vec->i_addr;
+ oph->oh_clientid = XFS_TRANSACTION;
+ oph->oh_res2 = 0;
+ oph->oh_flags = 0;
+
+ buf = vec->i_addr + sizeof(struct xlog_op_header);
+ ASSERT(IS_ALIGNED((unsigned long)buf, sizeof(uint64_t)));
+
+ *vecp = vec;
+ return buf;
+}
+
static void
xlog_grant_sub_space(
struct xlog *log,
@@ -322,30 +372,6 @@ xlog_grant_head_check(
return error;
}
-static void
-xlog_tic_reset_res(xlog_ticket_t *tic)
-{
- tic->t_res_num = 0;
- tic->t_res_arr_sum = 0;
- tic->t_res_num_ophdrs = 0;
-}
-
-static void
-xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
-{
- if (tic->t_res_num == XLOG_TIC_LEN_MAX) {
- /* add to overflow and start again */
- tic->t_res_o_flow += tic->t_res_arr_sum;
- tic->t_res_num = 0;
- tic->t_res_arr_sum = 0;
- }
-
- tic->t_res_arr[tic->t_res_num].r_len = len;
- tic->t_res_arr[tic->t_res_num].r_type = type;
- tic->t_res_arr_sum += len;
- tic->t_res_num++;
-}
-
bool
xfs_log_writable(
struct xfs_mount *mp)
@@ -395,8 +421,6 @@ xfs_log_regrant(
xlog_grant_push_ail(log, tic->t_unit_res);
tic->t_curr_res = tic->t_unit_res;
- xlog_tic_reset_res(tic);
-
if (tic->t_cnt > 0)
return 0;
@@ -434,10 +458,9 @@ out_error:
int
xfs_log_reserve(
struct xfs_mount *mp,
- int unit_bytes,
- int cnt,
+ int unit_bytes,
+ int cnt,
struct xlog_ticket **ticp,
- uint8_t client,
bool permanent)
{
struct xlog *log = mp->m_log;
@@ -445,15 +468,13 @@ xfs_log_reserve(
int need_bytes;
int error = 0;
- ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
-
if (xlog_is_shutdown(log))
return -EIO;
XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
- tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent);
+ tic = xlog_ticket_alloc(log, unit_bytes, cnt, permanent);
*ticp = tic;
xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
@@ -901,12 +922,22 @@ xlog_write_unmount_record(
struct xlog *log,
struct xlog_ticket *ticket)
{
- struct xfs_unmount_log_format ulf = {
- .magic = XLOG_UNMOUNT_TYPE,
+ struct {
+ struct xlog_op_header ophdr;
+ struct xfs_unmount_log_format ulf;
+ } unmount_rec = {
+ .ophdr = {
+ .oh_clientid = XFS_LOG,
+ .oh_tid = cpu_to_be32(ticket->t_tid),
+ .oh_flags = XLOG_UNMOUNT_TRANS,
+ },
+ .ulf = {
+ .magic = XLOG_UNMOUNT_TYPE,
+ },
};
struct xfs_log_iovec reg = {
- .i_addr = &ulf,
- .i_len = sizeof(ulf),
+ .i_addr = &unmount_rec,
+ .i_len = sizeof(unmount_rec),
.i_type = XLOG_REG_TYPE_UNMOUNT,
};
struct xfs_log_vec vec = {
@@ -914,10 +945,14 @@ xlog_write_unmount_record(
.lv_iovecp = &reg,
};
+ BUILD_BUG_ON((sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_unmount_log_format)) !=
+ sizeof(unmount_rec));
+
/* account for space used by record data */
- ticket->t_curr_res -= sizeof(ulf);
+ ticket->t_curr_res -= sizeof(unmount_rec);
- return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS);
+ return xlog_write(log, NULL, &vec, ticket, reg.i_len);
}
/*
@@ -933,7 +968,7 @@ xlog_unmount_write(
struct xlog_ticket *tic = NULL;
int error;
- error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
+ error = xfs_log_reserve(mp, 600, 1, &tic, 0);
if (error)
goto out_err;
@@ -1584,9 +1619,6 @@ xlog_alloc_log(
GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!iclog->ic_data)
goto out_free_iclog;
-#ifdef DEBUG
- log->l_iclog_bak[i] = &iclog->ic_header;
-#endif
head = &iclog->ic_header;
memset(head, 0, sizeof(xlog_rec_header_t));
head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
@@ -1602,7 +1634,7 @@ xlog_alloc_log(
iclog->ic_log = log;
atomic_set(&iclog->ic_refcnt, 0);
INIT_LIST_HEAD(&iclog->ic_callbacks);
- iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
+ iclog->ic_datap = (void *)iclog->ic_data + log->l_iclog_hsize;
init_waitqueue_head(&iclog->ic_force_wait);
init_waitqueue_head(&iclog->ic_write_wait);
@@ -2111,63 +2143,11 @@ xlog_print_tic_res(
struct xfs_mount *mp,
struct xlog_ticket *ticket)
{
- uint i;
- uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
-
- /* match with XLOG_REG_TYPE_* in xfs_log.h */
-#define REG_TYPE_STR(type, str) [XLOG_REG_TYPE_##type] = str
- static char *res_type_str[] = {
- REG_TYPE_STR(BFORMAT, "bformat"),
- REG_TYPE_STR(BCHUNK, "bchunk"),
- REG_TYPE_STR(EFI_FORMAT, "efi_format"),
- REG_TYPE_STR(EFD_FORMAT, "efd_format"),
- REG_TYPE_STR(IFORMAT, "iformat"),
- REG_TYPE_STR(ICORE, "icore"),
- REG_TYPE_STR(IEXT, "iext"),
- REG_TYPE_STR(IBROOT, "ibroot"),
- REG_TYPE_STR(ILOCAL, "ilocal"),
- REG_TYPE_STR(IATTR_EXT, "iattr_ext"),
- REG_TYPE_STR(IATTR_BROOT, "iattr_broot"),
- REG_TYPE_STR(IATTR_LOCAL, "iattr_local"),
- REG_TYPE_STR(QFORMAT, "qformat"),
- REG_TYPE_STR(DQUOT, "dquot"),
- REG_TYPE_STR(QUOTAOFF, "quotaoff"),
- REG_TYPE_STR(LRHEADER, "LR header"),
- REG_TYPE_STR(UNMOUNT, "unmount"),
- REG_TYPE_STR(COMMIT, "commit"),
- REG_TYPE_STR(TRANSHDR, "trans header"),
- REG_TYPE_STR(ICREATE, "inode create"),
- REG_TYPE_STR(RUI_FORMAT, "rui_format"),
- REG_TYPE_STR(RUD_FORMAT, "rud_format"),
- REG_TYPE_STR(CUI_FORMAT, "cui_format"),
- REG_TYPE_STR(CUD_FORMAT, "cud_format"),
- REG_TYPE_STR(BUI_FORMAT, "bui_format"),
- REG_TYPE_STR(BUD_FORMAT, "bud_format"),
- };
- BUILD_BUG_ON(ARRAY_SIZE(res_type_str) != XLOG_REG_TYPE_MAX + 1);
-#undef REG_TYPE_STR
-
xfs_warn(mp, "ticket reservation summary:");
- xfs_warn(mp, " unit res = %d bytes",
- ticket->t_unit_res);
- xfs_warn(mp, " current res = %d bytes",
- ticket->t_curr_res);
- xfs_warn(mp, " total reg = %u bytes (o/flow = %u bytes)",
- ticket->t_res_arr_sum, ticket->t_res_o_flow);
- xfs_warn(mp, " ophdrs = %u (ophdr space = %u bytes)",
- ticket->t_res_num_ophdrs, ophdr_spc);
- xfs_warn(mp, " ophdr + reg = %u bytes",
- ticket->t_res_arr_sum + ticket->t_res_o_flow + ophdr_spc);
- xfs_warn(mp, " num regions = %u",
- ticket->t_res_num);
-
- for (i = 0; i < ticket->t_res_num; i++) {
- uint r_type = ticket->t_res_arr[i].r_type;
- xfs_warn(mp, "region[%u]: %s - %u bytes", i,
- ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
- "bad-rtype" : res_type_str[r_type]),
- ticket->t_res_arr[i].r_len);
- }
+ xfs_warn(mp, " unit res = %d bytes", ticket->t_unit_res);
+ xfs_warn(mp, " current res = %d bytes", ticket->t_curr_res);
+ xfs_warn(mp, " original count = %d", ticket->t_ocnt);
+ xfs_warn(mp, " remaining count = %d", ticket->t_cnt);
}
/*
@@ -2220,187 +2200,226 @@ xlog_print_trans(
}
}
+static inline void
+xlog_write_iovec(
+ struct xlog_in_core *iclog,
+ uint32_t *log_offset,
+ void *data,
+ uint32_t write_len,
+ int *bytes_left,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
+{
+ ASSERT(*log_offset < iclog->ic_log->l_iclog_size);
+ ASSERT(*log_offset % sizeof(int32_t) == 0);
+ ASSERT(write_len % sizeof(int32_t) == 0);
+
+ memcpy(iclog->ic_datap + *log_offset, data, write_len);
+ *log_offset += write_len;
+ *bytes_left -= write_len;
+ (*record_cnt)++;
+ *data_cnt += write_len;
+}
+
/*
- * Calculate the potential space needed by the log vector. We may need a start
- * record, and each region gets its own struct xlog_op_header and may need to be
- * double word aligned.
+ * Write log vectors into a single iclog which is guaranteed by the caller
+ * to have enough space to write the entire log vector into.
*/
-static int
-xlog_write_calc_vec_length(
+static void
+xlog_write_full(
+ struct xfs_log_vec *lv,
struct xlog_ticket *ticket,
- struct xfs_log_vec *log_vector,
- uint optype)
+ struct xlog_in_core *iclog,
+ uint32_t *log_offset,
+ uint32_t *len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
{
- struct xfs_log_vec *lv;
- int headers = 0;
- int len = 0;
- int i;
-
- if (optype & XLOG_START_TRANS)
- headers++;
-
- for (lv = log_vector; lv; lv = lv->lv_next) {
- /* we don't write ordered log vectors */
- if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED)
- continue;
+ int index;
- headers += lv->lv_niovecs;
+ ASSERT(*log_offset + *len <= iclog->ic_size ||
+ iclog->ic_state == XLOG_STATE_WANT_SYNC);
- for (i = 0; i < lv->lv_niovecs; i++) {
- struct xfs_log_iovec *vecp = &lv->lv_iovecp[i];
+ /*
+ * Ordered log vectors have no regions to write so this
+ * loop will naturally skip them.
+ */
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
+ struct xlog_op_header *ophdr = reg->i_addr;
- len += vecp->i_len;
- xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type);
- }
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ xlog_write_iovec(iclog, log_offset, reg->i_addr,
+ reg->i_len, len, record_cnt, data_cnt);
}
-
- ticket->t_res_num_ophdrs += headers;
- len += headers * sizeof(struct xlog_op_header);
-
- return len;
-}
-
-static void
-xlog_write_start_rec(
- struct xlog_op_header *ophdr,
- struct xlog_ticket *ticket)
-{
- ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
- ophdr->oh_clientid = ticket->t_clientid;
- ophdr->oh_len = 0;
- ophdr->oh_flags = XLOG_START_TRANS;
- ophdr->oh_res2 = 0;
}
-static xlog_op_header_t *
-xlog_write_setup_ophdr(
- struct xlog *log,
- struct xlog_op_header *ophdr,
+static int
+xlog_write_get_more_iclog_space(
struct xlog_ticket *ticket,
- uint flags)
+ struct xlog_in_core **iclogp,
+ uint32_t *log_offset,
+ uint32_t len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
{
- ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
- ophdr->oh_clientid = ticket->t_clientid;
- ophdr->oh_res2 = 0;
-
- /* are we copying a commit or unmount record? */
- ophdr->oh_flags = flags;
+ struct xlog_in_core *iclog = *iclogp;
+ struct xlog *log = iclog->ic_log;
+ int error;
- /*
- * We've seen logs corrupted with bad transaction client ids. This
- * makes sure that XFS doesn't generate them on. Turn this into an EIO
- * and shut down the filesystem.
- */
- switch (ophdr->oh_clientid) {
- case XFS_TRANSACTION:
- case XFS_VOLUME:
- case XFS_LOG:
- break;
- default:
- xfs_warn(log->l_mp,
- "Bad XFS transaction clientid 0x%x in ticket "PTR_FMT,
- ophdr->oh_clientid, ticket);
- return NULL;
- }
+ spin_lock(&log->l_icloglock);
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC);
+ xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
+ error = xlog_state_release_iclog(log, iclog);
+ spin_unlock(&log->l_icloglock);
+ if (error)
+ return error;
- return ophdr;
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ log_offset);
+ if (error)
+ return error;
+ *record_cnt = 0;
+ *data_cnt = 0;
+ *iclogp = iclog;
+ return 0;
}
/*
- * Set up the parameters of the region copy into the log. This has
- * to handle region write split across multiple log buffers - this
- * state is kept external to this function so that this code can
- * be written in an obvious, self documenting manner.
+ * Write log vectors into a single iclog which is smaller than the current chain
+ * length. We write until we cannot fit a full record into the remaining space
+ * and then stop. We return the log vector that is to be written that cannot
+ * wholly fit in the iclog.
*/
static int
-xlog_write_setup_copy(
+xlog_write_partial(
+ struct xfs_log_vec *lv,
struct xlog_ticket *ticket,
- struct xlog_op_header *ophdr,
- int space_available,
- int space_required,
- int *copy_off,
- int *copy_len,
- int *last_was_partial_copy,
- int *bytes_consumed)
-{
- int still_to_copy;
-
- still_to_copy = space_required - *bytes_consumed;
- *copy_off = *bytes_consumed;
-
- if (still_to_copy <= space_available) {
- /* write of region completes here */
- *copy_len = still_to_copy;
- ophdr->oh_len = cpu_to_be32(*copy_len);
- if (*last_was_partial_copy)
- ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS);
- *last_was_partial_copy = 0;
- *bytes_consumed = 0;
- return 0;
- }
+ struct xlog_in_core **iclogp,
+ uint32_t *log_offset,
+ uint32_t *len,
+ uint32_t *record_cnt,
+ uint32_t *data_cnt)
+{
+ struct xlog_in_core *iclog = *iclogp;
+ struct xlog_op_header *ophdr;
+ int index = 0;
+ uint32_t rlen;
+ int error;
- /* partial write of region, needs extra log op header reservation */
- *copy_len = space_available;
- ophdr->oh_len = cpu_to_be32(*copy_len);
- ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
- if (*last_was_partial_copy)
- ophdr->oh_flags |= XLOG_WAS_CONT_TRANS;
- *bytes_consumed += *copy_len;
- (*last_was_partial_copy)++;
+ /* walk the logvec, copying until we run out of space in the iclog */
+ for (index = 0; index < lv->lv_niovecs; index++) {
+ struct xfs_log_iovec *reg = &lv->lv_iovecp[index];
+ uint32_t reg_offset = 0;
- /* account for new log op header */
- ticket->t_curr_res -= sizeof(struct xlog_op_header);
- ticket->t_res_num_ophdrs++;
+ /*
+ * The first region of a continuation must have a non-zero
+ * length otherwise log recovery will just skip over it and
+ * start recovering from the next opheader it finds. Because we
+ * mark the next opheader as a continuation, recovery will then
+ * incorrectly add the continuation to the previous region and
+ * that breaks stuff.
+ *
+ * Hence if there isn't space for region data after the
+ * opheader, then we need to start afresh with a new iclog.
+ */
+ if (iclog->ic_size - *log_offset <=
+ sizeof(struct xlog_op_header)) {
+ error = xlog_write_get_more_iclog_space(ticket,
+ &iclog, log_offset, *len, record_cnt,
+ data_cnt);
+ if (error)
+ return error;
+ }
- return sizeof(struct xlog_op_header);
-}
+ ophdr = reg->i_addr;
+ rlen = min_t(uint32_t, reg->i_len, iclog->ic_size - *log_offset);
-static int
-xlog_write_copy_finish(
- struct xlog *log,
- struct xlog_in_core *iclog,
- uint flags,
- int *record_cnt,
- int *data_cnt,
- int *partial_copy,
- int *partial_copy_len,
- int log_offset)
-{
- int error;
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_len = cpu_to_be32(rlen - sizeof(struct xlog_op_header));
+ if (rlen != reg->i_len)
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
+
+ xlog_write_iovec(iclog, log_offset, reg->i_addr,
+ rlen, len, record_cnt, data_cnt);
+
+ /* If we wrote the whole region, move to the next. */
+ if (rlen == reg->i_len)
+ continue;
- if (*partial_copy) {
/*
- * This iclog has already been marked WANT_SYNC by
- * xlog_state_get_iclog_space.
+ * We now have a partially written iovec, but it can span
+ * multiple iclogs so we loop here. First we release the iclog
+ * we currently have, then we get a new iclog and add a new
+ * opheader. Then we continue copying from where we were until
+ * we either complete the iovec or fill the iclog. If we
+ * complete the iovec, then we increment the index and go right
+ * back to the top of the outer loop. If we fill the iclog, we
+ * run the inner loop again.
+ *
+ * This is complicated by the tail of a region using all the
+ * space in an iclog and hence requiring us to release the iclog
+ * and get a new one before returning to the outer loop. We must
+ * always guarantee that we exit this inner loop with at least
+ * space for log transaction opheaders left in the current
+ * iclog, hence we cannot just terminate the loop at the end
+ * of the continuation. So we loop while there is no
+ * space left in the current iclog, and check for the end of the
+ * continuation after getting a new iclog.
*/
- spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
- *record_cnt = 0;
- *data_cnt = 0;
- goto release_iclog;
- }
+ do {
+ /*
+ * Ensure we include the continuation opheader in the
+ * space we need in the new iclog by adding that size
+ * to the length we require. This continuation opheader
+ * needs to be accounted to the ticket as the space it
+ * consumes hasn't been accounted to the lv we are
+ * writing.
+ */
+ error = xlog_write_get_more_iclog_space(ticket,
+ &iclog, log_offset,
+ *len + sizeof(struct xlog_op_header),
+ record_cnt, data_cnt);
+ if (error)
+ return error;
- *partial_copy = 0;
- *partial_copy_len = 0;
+ ophdr = iclog->ic_datap + *log_offset;
+ ophdr->oh_tid = cpu_to_be32(ticket->t_tid);
+ ophdr->oh_clientid = XFS_TRANSACTION;
+ ophdr->oh_res2 = 0;
+ ophdr->oh_flags = XLOG_WAS_CONT_TRANS;
- if (iclog->ic_size - log_offset > sizeof(xlog_op_header_t))
- return 0;
+ ticket->t_curr_res -= sizeof(struct xlog_op_header);
+ *log_offset += sizeof(struct xlog_op_header);
+ *data_cnt += sizeof(struct xlog_op_header);
- /* no more space in this iclog - push it. */
- spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
- *record_cnt = 0;
- *data_cnt = 0;
+ /*
+ * If rlen fits in the iclog, then end the region
+ * continuation. Otherwise we're going around again.
+ */
+ reg_offset += rlen;
+ rlen = reg->i_len - reg_offset;
+ if (rlen <= iclog->ic_size - *log_offset)
+ ophdr->oh_flags |= XLOG_END_TRANS;
+ else
+ ophdr->oh_flags |= XLOG_CONTINUE_TRANS;
- if (iclog->ic_state == XLOG_STATE_ACTIVE)
- xlog_state_switch_iclogs(log, iclog, 0);
- else
- ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
- xlog_is_shutdown(log));
-release_iclog:
- error = xlog_state_release_iclog(log, iclog);
- spin_unlock(&log->l_icloglock);
- return error;
+ rlen = min_t(uint32_t, rlen, iclog->ic_size - *log_offset);
+ ophdr->oh_len = cpu_to_be32(rlen);
+
+ xlog_write_iovec(iclog, log_offset,
+ reg->i_addr + reg_offset,
+ rlen, len, record_cnt, data_cnt);
+
+ } while (ophdr->oh_flags & XLOG_CONTINUE_TRANS);
+ }
+
+ /*
+ * No more iovecs remain in this logvec so hand the current iclog back to
+ * the caller so it can go back to fast path copying.
+ */
+ *iclogp = iclog;
+ return 0;
}
/*
@@ -2449,27 +2468,16 @@ xlog_write(
struct xfs_cil_ctx *ctx,
struct xfs_log_vec *log_vector,
struct xlog_ticket *ticket,
- uint optype)
+ uint32_t len)
+
{
struct xlog_in_core *iclog = NULL;
struct xfs_log_vec *lv = log_vector;
- struct xfs_log_iovec *vecp = lv->lv_iovecp;
- int index = 0;
- int len;
- int partial_copy = 0;
- int partial_copy_len = 0;
- int contwr = 0;
- int record_cnt = 0;
- int data_cnt = 0;
+ uint32_t record_cnt = 0;
+ uint32_t data_cnt = 0;
int error = 0;
+ int log_offset;
- /*
- * If this is a commit or unmount transaction, we don't need a start
- * record to be written. We do, however, have to account for the
- * commit or unmount header that gets written. Hence we always have
- * to account for an extra xlog_op_header here.
- */
- ticket->t_curr_res -= sizeof(struct xlog_op_header);
if (ticket->t_curr_res < 0) {
xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
"ctx ticket reservation ran out. Need to up reservation");
@@ -2477,144 +2485,54 @@ xlog_write(
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
}
- len = xlog_write_calc_vec_length(ticket, log_vector, optype);
- while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
- void *ptr;
- int log_offset;
-
- error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
- &contwr, &log_offset);
- if (error)
- return error;
+ error = xlog_state_get_iclog_space(log, len, &iclog, ticket,
+ &log_offset);
+ if (error)
+ return error;
- ASSERT(log_offset <= iclog->ic_size - 1);
- ptr = iclog->ic_datap + log_offset;
+ ASSERT(log_offset <= iclog->ic_size - 1);
- /*
- * If we have a context pointer, pass it the first iclog we are
- * writing to so it can record state needed for iclog write
- * ordering.
- */
- if (ctx) {
- xlog_cil_set_ctx_write_state(ctx, iclog);
- ctx = NULL;
- }
+ /*
+ * If we have a context pointer, pass it the first iclog we are
+ * writing to so it can record state needed for iclog write
+ * ordering.
+ */
+ if (ctx)
+ xlog_cil_set_ctx_write_state(ctx, iclog);
+ while (lv) {
/*
- * This loop writes out as many regions as can fit in the amount
- * of space which was allocated by xlog_state_get_iclog_space().
+ * If the entire log vec does not fit in the iclog, punt it to
+ * the partial copy loop which can handle this case.
*/
- while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) {
- struct xfs_log_iovec *reg;
- struct xlog_op_header *ophdr;
- int copy_len;
- int copy_off;
- bool ordered = false;
- bool wrote_start_rec = false;
-
- /* ordered log vectors have no regions to write */
- if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) {
- ASSERT(lv->lv_niovecs == 0);
- ordered = true;
- goto next_lv;
- }
-
- reg = &vecp[index];
- ASSERT(reg->i_len % sizeof(int32_t) == 0);
- ASSERT((unsigned long)ptr % sizeof(int32_t) == 0);
-
- /*
- * Before we start formatting log vectors, we need to
- * write a start record. Only do this for the first
- * iclog we write to.
- */
- if (optype & XLOG_START_TRANS) {
- xlog_write_start_rec(ptr, ticket);
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- sizeof(struct xlog_op_header));
- optype &= ~XLOG_START_TRANS;
- wrote_start_rec = true;
- }
-
- ophdr = xlog_write_setup_ophdr(log, ptr, ticket, optype);
- if (!ophdr)
- return -EIO;
-
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- sizeof(struct xlog_op_header));
-
- len += xlog_write_setup_copy(ticket, ophdr,
- iclog->ic_size-log_offset,
- reg->i_len,
- &copy_off, &copy_len,
- &partial_copy,
- &partial_copy_len);
- xlog_verify_dest_ptr(log, ptr);
-
- /*
- * Copy region.
- *
- * Unmount records just log an opheader, so can have
- * empty payloads with no data region to copy. Hence we
- * only copy the payload if the vector says it has data
- * to copy.
- */
- ASSERT(copy_len >= 0);
- if (copy_len > 0) {
- memcpy(ptr, reg->i_addr + copy_off, copy_len);
- xlog_write_adv_cnt(&ptr, &len, &log_offset,
- copy_len);
- }
- copy_len += sizeof(struct xlog_op_header);
- record_cnt++;
- if (wrote_start_rec) {
- copy_len += sizeof(struct xlog_op_header);
- record_cnt++;
- }
- data_cnt += contwr ? copy_len : 0;
-
- error = xlog_write_copy_finish(log, iclog, optype,
- &record_cnt, &data_cnt,
- &partial_copy,
- &partial_copy_len,
- log_offset);
- if (error)
+ if (lv->lv_niovecs &&
+ lv->lv_bytes > iclog->ic_size - log_offset) {
+ error = xlog_write_partial(lv, ticket, &iclog,
+ &log_offset, &len, &record_cnt,
+ &data_cnt);
+ if (error) {
+ /*
+ * We have no iclog to release, so just return
+ * the error immediately.
+ */
return error;
-
- /*
- * if we had a partial copy, we need to get more iclog
- * space but we don't want to increment the region
- * index because there is still more is this region to
- * write.
- *
- * If we completed writing this region, and we flushed
- * the iclog (indicated by resetting of the record
- * count), then we also need to get more log space. If
- * this was the last record, though, we are done and
- * can just return.
- */
- if (partial_copy)
- break;
-
- if (++index == lv->lv_niovecs) {
-next_lv:
- lv = lv->lv_next;
- index = 0;
- if (lv)
- vecp = lv->lv_iovecp;
- }
- if (record_cnt == 0 && !ordered) {
- if (!lv)
- return 0;
- break;
}
+ } else {
+ xlog_write_full(lv, ticket, iclog, &log_offset,
+ &len, &record_cnt, &data_cnt);
}
+ lv = lv->lv_next;
}
-
ASSERT(len == 0);
+ /*
+ * We've already been guaranteed that the last writes will fit inside
+ * the current iclog, and hence it will already have the space used by
+ * those writes accounted to it. Hence we do not need to update the
+ * iclog with the number of bytes written here.
+ */
spin_lock(&log->l_icloglock);
- xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
+ xlog_state_finish_copy(log, iclog, record_cnt, 0);
error = xlog_state_release_iclog(log, iclog);
spin_unlock(&log->l_icloglock);
@@ -2971,7 +2889,6 @@ xlog_state_get_iclog_space(
int len,
struct xlog_in_core **iclogp,
struct xlog_ticket *ticket,
- int *continued_write,
int *logoffsetp)
{
int log_offset;
@@ -3008,9 +2925,6 @@ restart:
*/
if (log_offset == 0) {
ticket->t_curr_res -= log->l_iclog_hsize;
- xlog_tic_add_region(ticket,
- log->l_iclog_hsize,
- XLOG_REG_TYPE_LRHEADER);
head->h_cycle = cpu_to_be32(log->l_curr_cycle);
head->h_lsn = cpu_to_be64(
xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block));
@@ -3052,13 +2966,10 @@ restart:
* iclogs (to mark it taken), this particular iclog will release/sync
* to disk in xlog_write().
*/
- if (len <= iclog->ic_size - iclog->ic_offset) {
- *continued_write = 0;
+ if (len <= iclog->ic_size - iclog->ic_offset)
iclog->ic_offset += len;
- } else {
- *continued_write = 1;
+ else
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
- }
*iclogp = iclog;
ASSERT(iclog->ic_offset <= iclog->ic_size);
@@ -3090,7 +3001,6 @@ xfs_log_ticket_regrant(
xlog_grant_sub_space(log, &log->l_write_head.grant,
ticket->t_curr_res);
ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
trace_xfs_log_ticket_regrant_sub(log, ticket);
@@ -3101,7 +3011,6 @@ xfs_log_ticket_regrant(
trace_xfs_log_ticket_regrant_exit(log, ticket);
ticket->t_curr_res = ticket->t_unit_res;
- xlog_tic_reset_res(ticket);
}
xfs_log_ticket_put(ticket);
@@ -3591,7 +3500,6 @@ xlog_ticket_alloc(
struct xlog *log,
int unit_bytes,
int cnt,
- char client,
bool permanent)
{
struct xlog_ticket *tic;
@@ -3609,40 +3517,14 @@ xlog_ticket_alloc(
tic->t_cnt = cnt;
tic->t_ocnt = cnt;
tic->t_tid = prandom_u32();
- tic->t_clientid = client;
if (permanent)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
- xlog_tic_reset_res(tic);
-
return tic;
}
#if defined(DEBUG)
/*
- * Make sure that the destination ptr is within the valid data region of
- * one of the iclogs. This uses backup pointers stored in a different
- * part of the log in case we trash the log structure.
- */
-STATIC void
-xlog_verify_dest_ptr(
- struct xlog *log,
- void *ptr)
-{
- int i;
- int good_ptr = 0;
-
- for (i = 0; i < log->l_iclog_bufs; i++) {
- if (ptr >= log->l_iclog_bak[i] &&
- ptr <= log->l_iclog_bak[i] + log->l_iclog_size)
- good_ptr++;
- }
-
- if (!good_ptr)
- xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
-}
-
-/*
* Check to make sure the grant write head didn't just over lap the tail. If
* the cycles are the same, we can't be overlapping. Otherwise, make sure that
* the cycles differ by exactly one and check the byte count.
@@ -3769,7 +3651,7 @@ xlog_verify_iclog(
if (field_offset & 0x1ff) {
clientid = ophead->oh_clientid;
} else {
- idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap);
+ idx = BTOBBT((void *)&ophead->oh_clientid - iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3780,11 +3662,12 @@ xlog_verify_iclog(
iclog->ic_header.h_cycle_data[idx]);
}
}
- if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
+ if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) {
xfs_warn(log->l_mp,
- "%s: invalid clientid %d op "PTR_FMT" offset 0x%lx",
- __func__, clientid, ophead,
+ "%s: op %d invalid clientid %d op "PTR_FMT" offset 0x%lx",
+ __func__, i, clientid, ophead,
(unsigned long)field_offset);
+ }
/* check length */
p = &ophead->oh_len;
@@ -3792,8 +3675,7 @@ xlog_verify_iclog(
if (field_offset & 0x1ff) {
op_len = be32_to_cpu(ophead->oh_len);
} else {
- idx = BTOBBT((uintptr_t)&ophead->oh_len -
- (uintptr_t)iclog->ic_datap);
+ idx = BTOBBT((void *)&ophead->oh_len - iclog->ic_datap);
if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) {
j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3829,7 +3711,7 @@ xlog_verify_iclog(
bool
xlog_force_shutdown(
struct xlog *log,
- int shutdown_flags)
+ uint32_t shutdown_flags)
{
bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);
@@ -3995,3 +3877,44 @@ xlog_drop_incompat_feat(
{
up_read(&log->l_incompat_users);
}
+
+/*
+ * Get permission to use log-assisted extended attribute updates.
+ *
+ * Callers must not be running any transactions or hold any inode locks, and
+ * they must release the permission by calling xlog_drop_incompat_feat
+ * when they're done.
+ */
+int
+xfs_attr_use_log_assist(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+
+ /*
+ * Protect ourselves from an idle log clearing the logged xattrs log
+ * incompat feature bit.
+ */
+ xlog_use_incompat_feat(mp->m_log);
+
+ /*
+ * If log-assisted xattrs are already enabled, the caller can use the
+ * logged xattr operations with the log-incompat reference we got.
+ */
+ if (xfs_sb_version_haslogxattrs(&mp->m_sb))
+ return 0;
+
+ /* Enable log-assisted xattrs. */
+ error = xfs_add_incompat_log_feature(mp,
+ XFS_SB_FEAT_INCOMPAT_LOG_XATTRS);
+ if (error)
+ goto drop_incompat;
+
+ xfs_warn_once(mp,
+"EXPERIMENTAL logged extended attributes feature added. Use at your own risk!");
+
+ return 0;
+drop_incompat:
+ xlog_drop_incompat_feat(mp->m_log);
+ return error;
+}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index dc1b77b92fc1..252b098cde1f 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -21,46 +21,59 @@ struct xfs_log_vec {
#define XFS_LOG_VEC_ORDERED (-1)
-static inline void *
-xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
- uint type)
+/*
+ * Calculate the log iovec length for a given user buffer length. Intended to be
+ * used by ->iop_size implementations when sizing buffers of arbitrary
+ * alignments.
+ */
+static inline int
+xlog_calc_iovec_len(int len)
{
- struct xfs_log_iovec *vec = *vecp;
+ return roundup(len, sizeof(uint32_t));
+}
- if (vec) {
- ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
- vec++;
- } else {
- vec = &lv->lv_iovecp[0];
+void *xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ uint type);
+
+static inline void
+xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec,
+ int data_len)
+{
+ struct xlog_op_header *oph = vec->i_addr;
+ int len;
+
+ /*
+ * Always round up the length to the correct alignment so callers don't
+ * need to know anything about this log vec layout requirement. This
+ * means we have to zero the area the data to be written does not cover.
+ * This is complicated by the fact that the payload region is offset into
+ * the logvec region by the opheader that tracks the payload.
+ */
+ len = xlog_calc_iovec_len(data_len);
+ if (len - data_len != 0) {
+ char *buf = vec->i_addr + sizeof(struct xlog_op_header);
+
+ memset(buf + data_len, 0, len - data_len);
}
- vec->i_type = type;
- vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+ /*
+ * The opheader tracks aligned payload length, whilst the logvec tracks
+ * the overall region length.
+ */
+ oph->oh_len = cpu_to_be32(len);
- ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t)));
+ len += sizeof(struct xlog_op_header);
+ lv->lv_buf_len += len;
+ lv->lv_bytes += len;
+ vec->i_len = len;
- *vecp = vec;
- return vec->i_addr;
+ /* Catch buffer overruns */
+ ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size);
}
/*
- * We need to make sure the next buffer is naturally aligned for the biggest
- * basic data type we put into it. We already accounted for this padding when
- * sizing the buffer.
- *
- * However, this padding does not get written into the log, and hence we have to
- * track the space used by the log vectors separately to prevent log space hangs
- * due to inaccurate accounting (i.e. a leak) of the used log space through the
- * CIL context ticket.
+ * Copy the amount of data requested by the caller into a new log iovec.
*/
-static inline void
-xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
-{
- lv->lv_buf_len += round_up(len, sizeof(uint64_t));
- lv->lv_bytes += len;
- vec->i_len = len;
-}
-
static inline void *
xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
uint type, void *data, int len)
@@ -117,15 +130,11 @@ int xfs_log_mount_finish(struct xfs_mount *mp);
void xfs_log_mount_cancel(struct xfs_mount *);
xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
-void xfs_log_space_wake(struct xfs_mount *mp);
-int xfs_log_reserve(struct xfs_mount *mp,
- int length,
- int count,
- struct xlog_ticket **ticket,
- uint8_t clientid,
- bool permanent);
-int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
-void xfs_log_unmount(struct xfs_mount *mp);
+void xfs_log_space_wake(struct xfs_mount *mp);
+int xfs_log_reserve(struct xfs_mount *mp, int length, int count,
+ struct xlog_ticket **ticket, bool permanent);
+int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
+void xfs_log_unmount(struct xfs_mount *mp);
bool xfs_log_writable(struct xfs_mount *mp);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
@@ -140,9 +149,10 @@ void xfs_log_clean(struct xfs_mount *mp);
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
-bool xlog_force_shutdown(struct xlog *log, int shutdown_flags);
+bool xlog_force_shutdown(struct xlog *log, uint32_t shutdown_flags);
void xlog_use_incompat_feat(struct xlog *log);
void xlog_drop_incompat_feat(struct xlog *log);
+int xfs_attr_use_log_assist(struct xfs_mount *mp);
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index c9f55e4f0957..db6cb7800251 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -37,7 +37,7 @@ xlog_cil_ticket_alloc(
{
struct xlog_ticket *tic;
- tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0);
+ tic = xlog_ticket_alloc(log, 0, 1, 0);
/*
* set the current reservation to zero so we know to steal the basic
@@ -48,6 +48,38 @@ xlog_cil_ticket_alloc(
}
/*
+ * Check if the current log item was first committed in this sequence.
+ * We can't rely on just the log item being in the CIL, we have to check
+ * the recorded commit sequence number.
+ *
+ * Note: for this to be used in a non-racy manner, it has to be called with
+ * CIL flushing locked out. As a result, it should only be used during the
+ * transaction commit process when deciding what to format into the item.
+ */
+static bool
+xlog_item_in_current_chkpt(
+ struct xfs_cil *cil,
+ struct xfs_log_item *lip)
+{
+ if (list_empty(&lip->li_cil))
+ return false;
+
+ /*
+ * li_seq is written on the first commit of a log item to record the
+ * first checkpoint it is written to. Hence if it is different to the
+ * current sequence, we're in a new checkpoint.
+ */
+ return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
+}
+
+bool
+xfs_log_item_in_current_chkpt(
+ struct xfs_log_item *lip)
+{
+ return xlog_item_in_current_chkpt(lip->li_log->l_cilp, lip);
+}
+
+/*
* Unavoidable forward declaration - xlog_cil_push_work() calls
* xlog_cil_ctx_alloc() itself.
*/
@@ -103,39 +135,6 @@ xlog_cil_iovec_space(
}
/*
- * shadow buffers can be large, so we need to use kvmalloc() here to ensure
- * success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall
- * back to vmalloc, so we can't actually do anything useful with gfp flags to
- * control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() will do
- * direct reclaim and compaction in the slow path, both of which are
- * horrendously expensive. We just want kmalloc to fail fast and fall back to
- * vmalloc if it can't get somethign straight away from the free lists or buddy
- * allocator. Hence we have to open code kvmalloc outselves here.
- *
- * Also, we are in memalloc_nofs_save task context here, so despite the use of
- * GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations. This
- * is actually the only way to make vmalloc() do GFP_NOFS allocations, so lets
- * just all pretend this is a GFP_KERNEL context operation....
- */
-static inline void *
-xlog_cil_kvmalloc(
- size_t buf_size)
-{
- gfp_t flags = GFP_KERNEL;
- void *p;
-
- flags &= ~__GFP_DIRECT_RECLAIM;
- flags |= __GFP_NOWARN | __GFP_NORETRY;
- do {
- p = kmalloc(buf_size, flags);
- if (!p)
- p = vmalloc(buf_size);
- } while (!p);
-
- return p;
-}
-
-/*
* Allocate or pin log vector buffers for CIL insertion.
*
* The CIL currently uses disposable buffers for copying a snapshot of the
@@ -214,13 +213,20 @@ xlog_cil_alloc_shadow_bufs(
}
/*
- * We 64-bit align the length of each iovec so that the start
- * of the next one is naturally aligned. We'll need to
- * account for that slack space here. Then round nbytes up
- * to 64-bit alignment so that the initial buffer alignment is
- * easy to calculate and verify.
+ * We 64-bit align the length of each iovec so that the start of
+ * the next one is naturally aligned. We'll need to account for
+ * that slack space here.
+ *
+ * We also add the xlog_op_header to each region when
+ * formatting, but that's not accounted to the size of the item
+ * at this point. Hence we'll need an additional number of bytes
+ * for each vector to hold an opheader.
+ *
+ * Then round nbytes up to 64-bit alignment so that the initial
+ * buffer alignment is easy to calculate and verify.
*/
- nbytes += niovecs * sizeof(uint64_t);
+ nbytes += niovecs *
+ (sizeof(uint64_t) + sizeof(struct xlog_op_header));
nbytes = round_up(nbytes, sizeof(uint64_t));
/*
@@ -244,7 +250,7 @@ xlog_cil_alloc_shadow_bufs(
* storage.
*/
kmem_free(lip->li_lv_shadow);
- lv = xlog_cil_kvmalloc(buf_size);
+ lv = xlog_kvmalloc(buf_size);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
@@ -277,22 +283,18 @@ xlog_cil_alloc_shadow_bufs(
/*
* Prepare the log item for insertion into the CIL. Calculate the difference in
- * log space and vectors it will consume, and if it is a new item pin it as
- * well.
+ * log space it will consume, and if it is a new item pin it as well.
*/
STATIC void
xfs_cil_prepare_item(
struct xlog *log,
struct xfs_log_vec *lv,
struct xfs_log_vec *old_lv,
- int *diff_len,
- int *diff_iovecs)
+ int *diff_len)
{
/* Account for the new LV being passed in */
- if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
*diff_len += lv->lv_bytes;
- *diff_iovecs += lv->lv_niovecs;
- }
/*
* If there is no old LV, this is the first time we've seen the item in
@@ -309,7 +311,6 @@ xfs_cil_prepare_item(
ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
*diff_len -= old_lv->lv_bytes;
- *diff_iovecs -= old_lv->lv_niovecs;
lv->lv_item->li_lv_shadow = old_lv;
}
@@ -358,12 +359,10 @@ static void
xlog_cil_insert_format_items(
struct xlog *log,
struct xfs_trans *tp,
- int *diff_len,
- int *diff_iovecs)
+ int *diff_len)
{
struct xfs_log_item *lip;
-
/* Bail out if we didn't find a log item. */
if (list_empty(&tp->t_items)) {
ASSERT(0);
@@ -406,7 +405,6 @@ xlog_cil_insert_format_items(
* set the item up as though it is a new insertion so
* that the space reservation accounting is correct.
*/
- *diff_iovecs -= lv->lv_niovecs;
*diff_len -= lv->lv_bytes;
/* Ensure the lv is set up according to ->iop_size */
@@ -431,7 +429,7 @@ xlog_cil_insert_format_items(
ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
lip->li_ops->iop_format(lip, lv);
insert:
- xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
+ xfs_cil_prepare_item(log, lv, old_lv, diff_len);
}
}
@@ -445,13 +443,13 @@ insert:
static void
xlog_cil_insert_items(
struct xlog *log,
- struct xfs_trans *tp)
+ struct xfs_trans *tp,
+ uint32_t released_space)
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_cil_ctx *ctx = cil->xc_ctx;
struct xfs_log_item *lip;
int len = 0;
- int diff_iovecs = 0;
int iclog_space;
int iovhdr_res = 0, split_res = 0, ctx_res = 0;
@@ -461,15 +459,10 @@ xlog_cil_insert_items(
* We can do this safely because the context can't checkpoint until we
* are done so it doesn't matter exactly how we update the CIL.
*/
- xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs);
+ xlog_cil_insert_format_items(log, tp, &len);
spin_lock(&cil->xc_cil_lock);
- /* account for space used by new iovec headers */
- iovhdr_res = diff_iovecs * sizeof(xlog_op_header_t);
- len += iovhdr_res;
- ctx->nvecs += diff_iovecs;
-
/* attach the transaction to the CIL if it has any busy extents */
if (!list_empty(&tp->t_busy))
list_splice_init(&tp->t_busy, &ctx->busy_extents);
@@ -500,7 +493,9 @@ xlog_cil_insert_items(
ASSERT(tp->t_ticket->t_curr_res >= len);
}
tp->t_ticket->t_curr_res -= len;
+ tp->t_ticket->t_curr_res += released_space;
ctx->space_used += len;
+ ctx->space_used -= released_space;
/*
* If we've overrun the reservation, dump the tx details before we move
@@ -822,7 +817,8 @@ restart:
static int
xlog_cil_write_chain(
struct xfs_cil_ctx *ctx,
- struct xfs_log_vec *chain)
+ struct xfs_log_vec *chain,
+ uint32_t chain_len)
{
struct xlog *log = ctx->cil->xc_log;
int error;
@@ -830,7 +826,7 @@ xlog_cil_write_chain(
error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD);
if (error)
return error;
- return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS);
+ return xlog_write(log, ctx, chain, ctx->ticket, chain_len);
}
/*
@@ -844,9 +840,14 @@ xlog_cil_write_commit_record(
struct xfs_cil_ctx *ctx)
{
struct xlog *log = ctx->cil->xc_log;
+ struct xlog_op_header ophdr = {
+ .oh_clientid = XFS_TRANSACTION,
+ .oh_tid = cpu_to_be32(ctx->ticket->t_tid),
+ .oh_flags = XLOG_COMMIT_TRANS,
+ };
struct xfs_log_iovec reg = {
- .i_addr = NULL,
- .i_len = 0,
+ .i_addr = &ophdr,
+ .i_len = sizeof(struct xlog_op_header),
.i_type = XLOG_REG_TYPE_COMMIT,
};
struct xfs_log_vec vec = {
@@ -862,12 +863,138 @@ xlog_cil_write_commit_record(
if (error)
return error;
- error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS);
+ /* account for space used by record data */
+ ctx->ticket->t_curr_res -= reg.i_len;
+ error = xlog_write(log, ctx, &vec, ctx->ticket, reg.i_len);
if (error)
xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
return error;
}
+struct xlog_cil_trans_hdr {
+ struct xlog_op_header oph[2];
+ struct xfs_trans_header thdr;
+ struct xfs_log_iovec lhdr[2];
+};
+
+/*
+ * Build a checkpoint transaction header to begin the journal transaction. We
+ * need to account for the space used by the transaction header here as it is
+ * not accounted for in xlog_write().
+ *
+ * This is the only place we write a transaction header, so we also build the
+ * log opheaders that indicate the start of a log transaction and wrap the
+ * transaction header. We keep the start record in its own log vector rather
+ * than compacting them into a single region as this ends up making the logic
+ * in xlog_write() for handling empty opheaders for start, commit and unmount
+ * records much simpler.
+ */
+static void
+xlog_cil_build_trans_hdr(
+ struct xfs_cil_ctx *ctx,
+ struct xlog_cil_trans_hdr *hdr,
+ struct xfs_log_vec *lvhdr,
+ int num_iovecs)
+{
+ struct xlog_ticket *tic = ctx->ticket;
+ __be32 tid = cpu_to_be32(tic->t_tid);
+
+ memset(hdr, 0, sizeof(*hdr));
+
+ /* Log start record */
+ hdr->oph[0].oh_tid = tid;
+ hdr->oph[0].oh_clientid = XFS_TRANSACTION;
+ hdr->oph[0].oh_flags = XLOG_START_TRANS;
+
+ /* log iovec region pointer */
+ hdr->lhdr[0].i_addr = &hdr->oph[0];
+ hdr->lhdr[0].i_len = sizeof(struct xlog_op_header);
+ hdr->lhdr[0].i_type = XLOG_REG_TYPE_LRHEADER;
+
+ /* log opheader */
+ hdr->oph[1].oh_tid = tid;
+ hdr->oph[1].oh_clientid = XFS_TRANSACTION;
+ hdr->oph[1].oh_len = cpu_to_be32(sizeof(struct xfs_trans_header));
+
+ /* transaction header in host byte order format */
+ hdr->thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+ hdr->thdr.th_type = XFS_TRANS_CHECKPOINT;
+ hdr->thdr.th_tid = tic->t_tid;
+ hdr->thdr.th_num_items = num_iovecs;
+
+ /* log iovec region pointer */
+ hdr->lhdr[1].i_addr = &hdr->oph[1];
+ hdr->lhdr[1].i_len = sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_trans_header);
+ hdr->lhdr[1].i_type = XLOG_REG_TYPE_TRANSHDR;
+
+ lvhdr->lv_niovecs = 2;
+ lvhdr->lv_iovecp = &hdr->lhdr[0];
+ lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len;
+ lvhdr->lv_next = ctx->lv_chain;
+
+ tic->t_curr_res -= lvhdr->lv_bytes;
+}
+
+/*
+ * Pull all the log vectors off the items in the CIL, and remove the items from
+ * the CIL. We don't need the CIL lock here because it's only needed on the
+ * transaction commit side which is currently locked out by the flush lock.
+ *
+ * If a log item is marked with a whiteout, we do not need to write it to the
+ * journal and so we just move them to the whiteout list for the caller to
+ * dispose of appropriately.
+ */
+static void
+xlog_cil_build_lv_chain(
+ struct xfs_cil *cil,
+ struct xfs_cil_ctx *ctx,
+ struct list_head *whiteouts,
+ uint32_t *num_iovecs,
+ uint32_t *num_bytes)
+{
+ struct xfs_log_vec *lv = NULL;
+
+ while (!list_empty(&cil->xc_cil)) {
+ struct xfs_log_item *item;
+
+ item = list_first_entry(&cil->xc_cil,
+ struct xfs_log_item, li_cil);
+
+ if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) {
+ list_move(&item->li_cil, whiteouts);
+ trace_xfs_cil_whiteout_skip(item);
+ continue;
+ }
+
+ list_del_init(&item->li_cil);
+ if (!ctx->lv_chain)
+ ctx->lv_chain = item->li_lv;
+ else
+ lv->lv_next = item->li_lv;
+ lv = item->li_lv;
+ item->li_lv = NULL;
+ *num_iovecs += lv->lv_niovecs;
+
+ /* we don't write ordered log vectors */
+ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
+ *num_bytes += lv->lv_bytes;
+ }
+}
+
+static void
+xlog_cil_cleanup_whiteouts(
+ struct list_head *whiteouts)
+{
+ while (!list_empty(whiteouts)) {
+ struct xfs_log_item *item = list_first_entry(whiteouts,
+ struct xfs_log_item, li_cil);
+ list_del_init(&item->li_cil);
+ trace_xfs_cil_whiteout_unpin(item);
+ item->li_ops->iop_unpin(item, 1);
+ }
+}
+
/*
* Push the Committed Item List to the log.
*
@@ -890,16 +1017,15 @@ xlog_cil_push_work(
container_of(work, struct xfs_cil_ctx, push_work);
struct xfs_cil *cil = ctx->cil;
struct xlog *log = cil->xc_log;
- struct xfs_log_vec *lv;
struct xfs_cil_ctx *new_ctx;
- struct xlog_ticket *tic;
- int num_iovecs;
+ int num_iovecs = 0;
+ int num_bytes = 0;
int error = 0;
- struct xfs_trans_header thdr;
- struct xfs_log_iovec lhdr;
+ struct xlog_cil_trans_hdr thdr;
struct xfs_log_vec lvhdr = { NULL };
xfs_csn_t push_seq;
bool push_commit_stable;
+ LIST_HEAD (whiteouts);
new_ctx = xlog_cil_ctx_alloc();
new_ctx->ticket = xlog_cil_ticket_alloc(log);
@@ -968,28 +1094,7 @@ xlog_cil_push_work(
list_add(&ctx->committing, &cil->xc_committing);
spin_unlock(&cil->xc_push_lock);
- /*
- * Pull all the log vectors off the items in the CIL, and remove the
- * items from the CIL. We don't need the CIL lock here because it's only
- * needed on the transaction commit side which is currently locked out
- * by the flush lock.
- */
- lv = NULL;
- num_iovecs = 0;
- while (!list_empty(&cil->xc_cil)) {
- struct xfs_log_item *item;
-
- item = list_first_entry(&cil->xc_cil,
- struct xfs_log_item, li_cil);
- list_del_init(&item->li_cil);
- if (!ctx->lv_chain)
- ctx->lv_chain = item->li_lv;
- else
- lv->lv_next = item->li_lv;
- lv = item->li_lv;
- item->li_lv = NULL;
- num_iovecs += lv->lv_niovecs;
- }
+ xlog_cil_build_lv_chain(cil, ctx, &whiteouts, &num_iovecs, &num_bytes);
/*
* Switch the contexts so we can drop the context lock and move out
@@ -1025,26 +1130,11 @@ xlog_cil_push_work(
* Build a checkpoint transaction header and write it to the log to
* begin the transaction. We need to account for the space used by the
* transaction header here as it is not accounted for in xlog_write().
- *
- * The LSN we need to pass to the log items on transaction commit is
- * the LSN reported by the first log vector write. If we use the commit
- * record lsn then we can move the tail beyond the grant write head.
*/
- tic = ctx->ticket;
- thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
- thdr.th_type = XFS_TRANS_CHECKPOINT;
- thdr.th_tid = tic->t_tid;
- thdr.th_num_items = num_iovecs;
- lhdr.i_addr = &thdr;
- lhdr.i_len = sizeof(xfs_trans_header_t);
- lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
- tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
-
- lvhdr.lv_niovecs = 1;
- lvhdr.lv_iovecp = &lhdr;
- lvhdr.lv_next = ctx->lv_chain;
-
- error = xlog_cil_write_chain(ctx, &lvhdr);
+ xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs);
+ num_bytes += lvhdr.lv_bytes;
+
+ error = xlog_cil_write_chain(ctx, &lvhdr, num_bytes);
if (error)
goto out_abort_free_ticket;
@@ -1052,7 +1142,7 @@ xlog_cil_push_work(
if (error)
goto out_abort_free_ticket;
- xfs_log_ticket_ungrant(log, tic);
+ xfs_log_ticket_ungrant(log, ctx->ticket);
/*
* If the checkpoint spans multiple iclogs, wait for all previous iclogs
@@ -1107,6 +1197,7 @@ xlog_cil_push_work(
/* Not safe to reference ctx now! */
spin_unlock(&log->l_icloglock);
+ xlog_cil_cleanup_whiteouts(&whiteouts);
return;
out_skip:
@@ -1116,8 +1207,9 @@ out_skip:
return;
out_abort_free_ticket:
- xfs_log_ticket_ungrant(log, tic);
+ xfs_log_ticket_ungrant(log, ctx->ticket);
ASSERT(xlog_is_shutdown(log));
+ xlog_cil_cleanup_whiteouts(&whiteouts);
if (!ctx->commit_iclog) {
xlog_cil_committed(ctx);
return;
@@ -1267,6 +1359,43 @@ xlog_cil_empty(
}
/*
+ * If there are intent done items in this transaction and the related intent was
+ * committed in the current (same) CIL checkpoint, we don't need to write either
+ * the intent or intent done item to the journal as the change will be
+ * journalled atomically within this checkpoint. As we cannot remove items from
+ * the CIL here, mark the related intent with a whiteout so that the CIL push
+ * can remove it rather than writing it to the journal. Then remove the intent
+ * done item from the current transaction and release it so it doesn't get put
+ * into the CIL at all.
+ */
+static uint32_t
+xlog_cil_process_intents(
+ struct xfs_cil *cil,
+ struct xfs_trans *tp)
+{
+ struct xfs_log_item *lip, *ilip, *next;
+ uint32_t len = 0;
+
+ list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+ if (!(lip->li_ops->flags & XFS_ITEM_INTENT_DONE))
+ continue;
+
+ ilip = lip->li_ops->iop_intent(lip);
+ if (!ilip || !xlog_item_in_current_chkpt(cil, ilip))
+ continue;
+ set_bit(XFS_LI_WHITEOUT, &ilip->li_flags);
+ trace_xfs_cil_whiteout_mark(ilip);
+ len += ilip->li_lv->lv_bytes;
+ kmem_free(ilip->li_lv);
+ ilip->li_lv = NULL;
+
+ xfs_trans_del_item(lip);
+ lip->li_ops->iop_release(lip);
+ }
+ return len;
+}
+
+/*
* Commit a transaction with the given vector to the Committed Item List.
*
* To do this, we need to format the item, pin it in memory if required and
@@ -1288,6 +1417,7 @@ xlog_cil_commit(
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_log_item *lip, *next;
+ uint32_t released_space = 0;
/*
* Do all necessary memory allocation before we lock the CIL.
@@ -1299,7 +1429,10 @@ xlog_cil_commit(
/* lock out background commit */
down_read(&cil->xc_ctx_lock);
- xlog_cil_insert_items(log, tp);
+ if (tp->t_flags & XFS_TRANS_HAS_INTENT_DONE)
+ released_space = xlog_cil_process_intents(cil, tp);
+
+ xlog_cil_insert_items(log, tp, released_space);
if (regrant && !xlog_is_shutdown(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
@@ -1456,32 +1589,6 @@ out_shutdown:
}
/*
- * Check if the current log item was first committed in this sequence.
- * We can't rely on just the log item being in the CIL, we have to check
- * the recorded commit sequence number.
- *
- * Note: for this to be used in a non-racy manner, it has to be called with
- * CIL flushing locked out. As a result, it should only be used during the
- * transaction commit process when deciding what to format into the item.
- */
-bool
-xfs_log_item_in_current_chkpt(
- struct xfs_log_item *lip)
-{
- struct xfs_cil *cil = lip->li_log->l_cilp;
-
- if (list_empty(&lip->li_cil))
- return false;
-
- /*
- * li_seq is written on the first commit of a log item to record the
- * first checkpoint it is written to. Hence if it is different to the
- * current sequence, we're in a new checkpoint.
- */
- return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
-}
-
-/*
* Perform initial CIL structure initialisation.
*/
int
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 401cdc400980..67fd9789e69a 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -51,8 +51,8 @@ enum xlog_iclog_state {
/*
* In core log flags
*/
-#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */
-#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */
+#define XLOG_ICL_NEED_FLUSH (1u << 0) /* iclog needs REQ_PREFLUSH */
+#define XLOG_ICL_NEED_FUA (1u << 1) /* iclog needs REQ_FUA */
#define XLOG_ICL_STRINGS \
{ XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \
@@ -62,7 +62,7 @@ enum xlog_iclog_state {
/*
* Log ticket flags
*/
-#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */
+#define XLOG_TIC_PERM_RESERV (1u << 0) /* permanent reservation */
#define XLOG_TIC_FLAGS \
{ XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
@@ -142,19 +142,6 @@ enum xlog_iclog_state {
#define XLOG_COVER_OPS 5
-/* Ticket reservation region accounting */
-#define XLOG_TIC_LEN_MAX 15
-
-/*
- * Reservation region
- * As would be stored in xfs_log_iovec but without the i_addr which
- * we don't care about.
- */
-typedef struct xlog_res {
- uint r_len; /* region length :4 */
- uint r_type; /* region's transaction type :4 */
-} xlog_res_t;
-
typedef struct xlog_ticket {
struct list_head t_queue; /* reserve/write queue */
struct task_struct *t_task; /* task that owns this ticket */
@@ -164,15 +151,7 @@ typedef struct xlog_ticket {
int t_unit_res; /* unit reservation in bytes : 4 */
char t_ocnt; /* original count : 1 */
char t_cnt; /* current count : 1 */
- char t_clientid; /* who does this belong to; : 1 */
- char t_flags; /* properties of reservation : 1 */
-
- /* reservation array fields */
- uint t_res_num; /* num in array : 4 */
- uint t_res_num_ophdrs; /* num op hdrs : 4 */
- uint t_res_arr_sum; /* array sum : 4 */
- uint t_res_o_flow; /* sum overflow : 4 */
- xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */
+ uint8_t t_flags; /* properties of reservation : 1 */
} xlog_ticket_t;
/*
@@ -211,7 +190,7 @@ typedef struct xlog_in_core {
u32 ic_offset;
enum xlog_iclog_state ic_state;
unsigned int ic_flags;
- char *ic_datap; /* pointer to iclog data */
+ void *ic_datap; /* pointer to iclog data */
struct list_head ic_callbacks;
/* reference counts need their own cacheline */
@@ -242,7 +221,6 @@ struct xfs_cil_ctx {
xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
struct xlog_in_core *commit_iclog;
struct xlog_ticket *ticket; /* chkpt ticket */
- int nvecs; /* number of regions */
int space_used; /* aggregate size of regions */
struct list_head busy_extents; /* busy extents in chkpt */
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
@@ -441,10 +419,6 @@ struct xlog {
struct xfs_kobj l_kobj;
- /* The following field are used for debugging; need to hold icloglock */
-#ifdef DEBUG
- void *l_iclog_bak[XLOG_MAX_ICLOGS];
-#endif
/* log recovery lsn tracking (for buffer submission */
xfs_lsn_t l_recovery_lsn;
@@ -509,27 +483,14 @@ extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
char *dp, int size);
extern struct kmem_cache *xfs_log_ticket_cache;
-struct xlog_ticket *
-xlog_ticket_alloc(
- struct xlog *log,
- int unit_bytes,
- int count,
- char client,
- bool permanent);
-
-static inline void
-xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
-{
- *ptr += bytes;
- *len -= bytes;
- *off += bytes;
-}
+struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,
+ int count, bool permanent);
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx,
struct xfs_log_vec *log_vector, struct xlog_ticket *tic,
- uint optype);
+ uint32_t len);
void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
@@ -690,4 +651,38 @@ xlog_valid_lsn(
return valid;
}
+/*
+ * Log vector and shadow buffers can be large, so we need to use kvmalloc() here
+ * to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts
+ * to fall back to vmalloc, so we can't actually do anything useful with gfp
+ * flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc()
+ * will do direct reclaim and compaction in the slow path, both of which are
+ * horrendously expensive. We just want kmalloc to fail fast and fall back to
+ * vmalloc if it can't get something straight away from the free lists or
+ * buddy allocator. Hence we have to open code kvmalloc ourselves here.
+ *
+ * This assumes that the caller uses memalloc_nofs_save task context here, so
+ * despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS
+ * allocations. This is actually the only way to make vmalloc() do GFP_NOFS
+ * allocations, so let's just all pretend this is a GFP_KERNEL context
+ * operation....
+ */
+static inline void *
+xlog_kvmalloc(
+ size_t buf_size)
+{
+ gfp_t flags = GFP_KERNEL;
+ void *p;
+
+ flags &= ~__GFP_DIRECT_RECLAIM;
+ flags |= __GFP_NOWARN | __GFP_NORETRY;
+ do {
+ p = kmalloc(buf_size, flags);
+ if (!p)
+ p = vmalloc(buf_size);
+ } while (!p);
+
+ return p;
+}
+
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c4ad4296c540..97b941c07957 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1800,6 +1800,8 @@ static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
&xlog_cud_item_ops,
&xlog_bui_item_ops,
&xlog_bud_item_ops,
+ &xlog_attri_item_ops,
+ &xlog_attrd_item_ops,
};
static const struct xlog_recover_item_ops *
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index bc66d95c8d4c..8f495cc23903 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -27,42 +27,34 @@ __xfs_printk(
printk("%sXFS: %pV\n", level, vaf);
}
-#define define_xfs_printk_level(func, kern_level) \
-void func(const struct xfs_mount *mp, const char *fmt, ...) \
-{ \
- struct va_format vaf; \
- va_list args; \
- int level; \
- \
- va_start(args, fmt); \
- \
- vaf.fmt = fmt; \
- vaf.va = &args; \
- \
- __xfs_printk(kern_level, mp, &vaf); \
- va_end(args); \
- \
- if (!kstrtoint(kern_level, 0, &level) && \
- level <= LOGLEVEL_ERR && \
- xfs_error_level >= XFS_ERRLEVEL_HIGH) \
- xfs_stack_trace(); \
-} \
-
-define_xfs_printk_level(xfs_emerg, KERN_EMERG);
-define_xfs_printk_level(xfs_alert, KERN_ALERT);
-define_xfs_printk_level(xfs_crit, KERN_CRIT);
-define_xfs_printk_level(xfs_err, KERN_ERR);
-define_xfs_printk_level(xfs_warn, KERN_WARNING);
-define_xfs_printk_level(xfs_notice, KERN_NOTICE);
-define_xfs_printk_level(xfs_info, KERN_INFO);
-#ifdef DEBUG
-define_xfs_printk_level(xfs_debug, KERN_DEBUG);
-#endif
+void
+xfs_printk_level(
+ const char *kern_level,
+ const struct xfs_mount *mp,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ int level;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ __xfs_printk(kern_level, mp, &vaf);
+
+ va_end(args);
+
+ if (!kstrtoint(kern_level, 0, &level) &&
+ level <= LOGLEVEL_ERR &&
+ xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+}
void
-xfs_alert_tag(
+_xfs_alert_tag(
const struct xfs_mount *mp,
- int panic_tag,
+ uint32_t panic_tag,
const char *fmt, ...)
{
struct va_format vaf;
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index bb9860ec9a93..55ee464ab59f 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -6,33 +6,46 @@
struct xfs_mount;
-extern __printf(2, 3)
-void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...);
extern __printf(3, 4)
-void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_err(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...);
-extern __printf(2, 3)
-void xfs_info(const struct xfs_mount *mp, const char *fmt, ...);
+void xfs_printk_level(const char *kern_level, const struct xfs_mount *mp,
+ const char *fmt, ...);
+#define xfs_printk_index_wrap(kern_level, mp, fmt, ...) \
+({ \
+ printk_index_subsys_emit("%sXFS%s: ", kern_level, fmt); \
+ xfs_printk_level(kern_level, mp, fmt, ##__VA_ARGS__); \
+})
+#define xfs_emerg(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_EMERG, mp, fmt, ##__VA_ARGS__)
+#define xfs_alert(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_ALERT, mp, fmt, ##__VA_ARGS__)
+#define xfs_crit(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_CRIT, mp, fmt, ##__VA_ARGS__)
+#define xfs_err(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_ERR, mp, fmt, ##__VA_ARGS__)
+#define xfs_warn(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_WARNING, mp, fmt, ##__VA_ARGS__)
+#define xfs_notice(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_NOTICE, mp, fmt, ##__VA_ARGS__)
+#define xfs_info(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_INFO, mp, fmt, ##__VA_ARGS__)
#ifdef DEBUG
-extern __printf(2, 3)
-void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...);
+#define xfs_debug(mp, fmt, ...) \
+ xfs_printk_index_wrap(KERN_DEBUG, mp, fmt, ##__VA_ARGS__)
#else
-static inline __printf(2, 3)
-void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
-{
-}
+#define xfs_debug(mp, fmt, ...) do {} while (0)
#endif
+#define xfs_alert_tag(mp, tag, fmt, ...) \
+({ \
+ printk_index_subsys_emit("%sXFS%s: ", KERN_ALERT, fmt); \
+ _xfs_alert_tag(mp, tag, fmt, ##__VA_ARGS__); \
+})
+
+extern __printf(3, 4)
+void _xfs_alert_tag(const struct xfs_mount *mp, uint32_t tag,
+ const char *fmt, ...);
+
#define xfs_printk_ratelimited(func, dev, fmt, ...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c5f153c3693f..0c0bcbd4949d 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -468,6 +468,8 @@ STATIC int
xfs_check_summary_counts(
struct xfs_mount *mp)
{
+ int error = 0;
+
/*
* The AG0 superblock verifier rejects in-progress filesystems,
* so we should never see the flag set this far into mounting.
@@ -506,11 +508,32 @@ xfs_check_summary_counts(
* superblock to be correct and we don't need to do anything here.
* Otherwise, recalculate the summary counters.
*/
- if ((!xfs_has_lazysbcount(mp) || xfs_is_clean(mp)) &&
- !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS))
- return 0;
+ if ((xfs_has_lazysbcount(mp) && !xfs_is_clean(mp)) ||
+ xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) {
+ error = xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+ if (error)
+ return error;
+ }
- return xfs_initialize_perag_data(mp, mp->m_sb.sb_agcount);
+ /*
+ * Older kernels misused sb_frextents to reflect both incore
+ * reservations made by running transactions and the actual count of
+ * free rt extents in the ondisk metadata. Transactions committed
+ * during runtime can therefore contain a superblock update that
+ * undercounts the number of free rt extents tracked in the rt bitmap.
+ * A clean unmount record will have the correct frextents value since
+ * there can be no other transactions running at that point.
+ *
+ * If we're mounting the rt volume after recovering the log, recompute
+ * frextents from the rtbitmap file to fix the inconsistency.
+ */
+ if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
+ error = xfs_rtalloc_reinit_frextents(mp);
+ if (error)
+ return error;
+ }
+
+ return 0;
}
/*
@@ -784,11 +807,6 @@ xfs_mountfs(
goto out_inodegc_shrinker;
}
- /* Make sure the summary counts are ok. */
- error = xfs_check_summary_counts(mp);
- if (error)
- goto out_log_dealloc;
-
/* Enable background inode inactivation workers. */
xfs_inodegc_start(mp);
xfs_blockgc_start(mp);
@@ -844,6 +862,11 @@ xfs_mountfs(
goto out_rele_rip;
}
+ /* Make sure the summary counts are ok. */
+ error = xfs_check_summary_counts(mp);
+ if (error)
+ goto out_rtunmount;
+
/*
* If this is a read-only mount defer the superblock updates until
* the next remount into writeable mode. Otherwise we would never
@@ -1087,24 +1110,33 @@ xfs_fs_writable(
return true;
}
+/* Adjust m_fdblocks or m_frextents. */
int
-xfs_mod_fdblocks(
+xfs_mod_freecounter(
struct xfs_mount *mp,
+ struct percpu_counter *counter,
int64_t delta,
bool rsvd)
{
int64_t lcounter;
long long res_used;
+ uint64_t set_aside = 0;
s32 batch;
- uint64_t set_aside;
+ bool has_resv_pool;
+
+ ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
+ has_resv_pool = (counter == &mp->m_fdblocks);
+ if (rsvd)
+ ASSERT(has_resv_pool);
if (delta > 0) {
/*
* If the reserve pool is depleted, put blocks back into it
* first. Most of the time the pool is full.
*/
- if (likely(mp->m_resblks == mp->m_resblks_avail)) {
- percpu_counter_add(&mp->m_fdblocks, delta);
+ if (likely(!has_resv_pool ||
+ mp->m_resblks == mp->m_resblks_avail)) {
+ percpu_counter_add(counter, delta);
return 0;
}
@@ -1116,7 +1148,7 @@ xfs_mod_fdblocks(
} else {
delta -= res_used;
mp->m_resblks_avail = mp->m_resblks;
- percpu_counter_add(&mp->m_fdblocks, delta);
+ percpu_counter_add(counter, delta);
}
spin_unlock(&mp->m_sb_lock);
return 0;
@@ -1130,7 +1162,7 @@ xfs_mod_fdblocks(
* then make everything serialise as we are real close to
* ENOSPC.
*/
- if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH,
+ if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
XFS_FDBLOCKS_BATCH) < 0)
batch = 1;
else
@@ -1147,9 +1179,10 @@ xfs_mod_fdblocks(
* problems (i.e. transaction abort, pagecache discards, etc.) than
* slightly premature -ENOSPC.
*/
- set_aside = xfs_fdblocks_unavailable(mp);
- percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
- if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
+ if (has_resv_pool)
+ set_aside = xfs_fdblocks_unavailable(mp);
+ percpu_counter_add_batch(counter, delta, batch);
+ if (__percpu_counter_compare(counter, set_aside,
XFS_FDBLOCKS_BATCH) >= 0) {
/* we had space! */
return 0;
@@ -1160,8 +1193,8 @@ xfs_mod_fdblocks(
* that took us to ENOSPC.
*/
spin_lock(&mp->m_sb_lock);
- percpu_counter_add(&mp->m_fdblocks, -delta);
- if (!rsvd)
+ percpu_counter_add(counter, -delta);
+ if (!has_resv_pool || !rsvd)
goto fdblocks_enospc;
lcounter = (long long)mp->m_resblks_avail + delta;
@@ -1178,24 +1211,6 @@ fdblocks_enospc:
return -ENOSPC;
}
-int
-xfs_mod_frextents(
- struct xfs_mount *mp,
- int64_t delta)
-{
- int64_t lcounter;
- int ret = 0;
-
- spin_lock(&mp->m_sb_lock);
- lcounter = mp->m_sb.sb_frextents + delta;
- if (lcounter < 0)
- ret = -ENOSPC;
- else
- mp->m_sb.sb_frextents = lcounter;
- spin_unlock(&mp->m_sb_lock);
- return ret;
-}
-
/*
* Used to free the superblock along various error paths.
*/
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f6dc19de8322..8c42786e4942 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -183,6 +183,8 @@ typedef struct xfs_mount {
struct percpu_counter m_icount; /* allocated inodes counter */
struct percpu_counter m_ifree; /* free inodes counter */
struct percpu_counter m_fdblocks; /* free block counter */
+ struct percpu_counter m_frextents; /* free rt extent counter */
+
/*
* Count of data device blocks reserved for delayed allocations,
* including indlen blocks. Does not include allocated CoW staging
@@ -276,6 +278,7 @@ typedef struct xfs_mount {
#define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */
#define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */
#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */
+#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
/* Mount features */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
@@ -338,6 +341,7 @@ __XFS_HAS_FEAT(realtime, REALTIME)
__XFS_HAS_FEAT(inobtcounts, INOBTCNT)
__XFS_HAS_FEAT(bigtime, BIGTIME)
__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
+__XFS_HAS_FEAT(large_extent_counts, NREXT64)
/*
* Mount features
@@ -425,16 +429,15 @@ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
#define XFS_MAX_IO_LOG 30 /* 1G */
#define XFS_MIN_IO_LOG PAGE_SHIFT
-#define xfs_is_shutdown(mp) xfs_is_shutdown(mp)
-void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
+void xfs_do_force_shutdown(struct xfs_mount *mp, uint32_t flags, char *fname,
int lnnum);
#define xfs_force_shutdown(m,f) \
xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
-#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
-#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
-#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
-#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
+#define SHUTDOWN_META_IO_ERROR (1u << 0) /* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR (1u << 1) /* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT (1u << 2) /* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE (1u << 3) /* corrupt in-memory structures */
#define XFS_SHUTDOWN_STRINGS \
{ SHUTDOWN_META_IO_ERROR, "metadata_io" }, \
@@ -494,9 +497,20 @@ xfs_fdblocks_unavailable(
return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
}
-extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
- bool reserved);
-extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
+int xfs_mod_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+ int64_t delta, bool rsvd);
+
+static inline int
+xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved)
+{
+ return xfs_mod_freecounter(mp, &mp->m_fdblocks, delta, reserved);
+}
+
+static inline int
+xfs_mod_frextents(struct xfs_mount *mp, int64_t delta)
+{
+ return xfs_mod_freecounter(mp, &mp->m_frextents, delta, false);
+}
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 25991923c1a8..758702b9495f 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -132,6 +132,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_STRUCT_SIZE(struct xfs_inode_log_format, 56);
XFS_CHECK_STRUCT_SIZE(struct xfs_qoff_logformat, 20);
XFS_CHECK_STRUCT_SIZE(struct xfs_trans_header, 16);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attri_log_format, 40);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attrd_log_format, 16);
/*
* The v5 superblock format extended several v4 header structures with
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index f165d1a3de1d..8fc813cb6011 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -582,9 +582,6 @@ xfs_qm_init_timelimits(
defq->blk.time = XFS_QM_BTIMELIMIT;
defq->ino.time = XFS_QM_ITIMELIMIT;
defq->rtb.time = XFS_QM_RTBTIMELIMIT;
- defq->blk.warn = XFS_QM_BWARNLIMIT;
- defq->ino.warn = XFS_QM_IWARNLIMIT;
- defq->rtb.warn = XFS_QM_RTBWARNLIMIT;
/*
* We try to get the limits from the superuser's limits fields.
@@ -608,12 +605,6 @@ xfs_qm_init_timelimits(
defq->ino.time = dqp->q_ino.timer;
if (dqp->q_rtb.timer)
defq->rtb.time = dqp->q_rtb.timer;
- if (dqp->q_blk.warnings)
- defq->blk.warn = dqp->q_blk.warnings;
- if (dqp->q_ino.warnings)
- defq->ino.warn = dqp->q_ino.warnings;
- if (dqp->q_rtb.warnings)
- defq->rtb.warn = dqp->q_rtb.warnings;
xfs_qm_dqdestroy(dqp);
}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 5bb12717ea28..9683f0457d19 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -34,7 +34,6 @@ struct xfs_quota_limits {
xfs_qcnt_t hard; /* default hard limit */
xfs_qcnt_t soft; /* default soft limit */
time64_t time; /* limit for timers */
- xfs_qwarncnt_t warn; /* limit for warnings */
};
/* Defaults for each quota type: time limits, warn limits, usage limits */
@@ -134,10 +133,6 @@ struct xfs_dquot_acct {
#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */
#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */
-#define XFS_QM_BWARNLIMIT 5
-#define XFS_QM_IWARNLIMIT 5
-#define XFS_QM_RTBWARNLIMIT 5
-
extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
/* quota ops */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 7d5a31827681..74ac9ca9e119 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -217,8 +217,7 @@ xfs_qm_scall_quotaon(
return 0;
}
-#define XFS_QC_MASK \
- (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
+#define XFS_QC_MASK (QC_LIMIT_MASK | QC_TIMER_MASK)
/*
* Adjust limits of this quota, and the defaults if passed in. Returns true
@@ -251,17 +250,6 @@ xfs_setqlim_limits(
}
static inline void
-xfs_setqlim_warns(
- struct xfs_dquot_res *res,
- struct xfs_quota_limits *qlim,
- int warns)
-{
- res->warnings = warns;
- if (qlim)
- qlim->warn = warns;
-}
-
-static inline void
xfs_setqlim_timer(
struct xfs_mount *mp,
struct xfs_dquot_res *res,
@@ -354,8 +342,6 @@ xfs_qm_scall_setqlim(
if (xfs_setqlim_limits(mp, res, qlim, hard, soft, "blk"))
xfs_dquot_set_prealloc_limits(dqp);
- if (newlim->d_fieldmask & QC_SPC_WARNS)
- xfs_setqlim_warns(res, qlim, newlim->d_spc_warns);
if (newlim->d_fieldmask & QC_SPC_TIMER)
xfs_setqlim_timer(mp, res, qlim, newlim->d_spc_timer);
@@ -370,8 +356,6 @@ xfs_qm_scall_setqlim(
qlim = id == 0 ? &defq->rtb : NULL;
xfs_setqlim_limits(mp, res, qlim, hard, soft, "rtb");
- if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
- xfs_setqlim_warns(res, qlim, newlim->d_rt_spc_warns);
if (newlim->d_fieldmask & QC_RT_SPC_TIMER)
xfs_setqlim_timer(mp, res, qlim, newlim->d_rt_spc_timer);
@@ -386,8 +370,6 @@ xfs_qm_scall_setqlim(
qlim = id == 0 ? &defq->ino : NULL;
xfs_setqlim_limits(mp, res, qlim, hard, soft, "ino");
- if (newlim->d_fieldmask & QC_INO_WARNS)
- xfs_setqlim_warns(res, qlim, newlim->d_ino_warns);
if (newlim->d_fieldmask & QC_INO_TIMER)
xfs_setqlim_timer(mp, res, qlim, newlim->d_ino_timer);
@@ -428,13 +410,13 @@ xfs_qm_scall_getquota_fill_qc(
dst->d_ino_count = dqp->q_ino.reserved;
dst->d_spc_timer = dqp->q_blk.timer;
dst->d_ino_timer = dqp->q_ino.timer;
- dst->d_ino_warns = dqp->q_ino.warnings;
- dst->d_spc_warns = dqp->q_blk.warnings;
+ dst->d_ino_warns = 0;
+ dst->d_spc_warns = 0;
dst->d_rt_spc_hardlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.hardlimit);
dst->d_rt_spc_softlimit = XFS_FSB_TO_B(mp, dqp->q_rtb.softlimit);
dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_rtb.reserved);
dst->d_rt_spc_timer = dqp->q_rtb.timer;
- dst->d_rt_spc_warns = dqp->q_rtb.warnings;
+ dst->d_rt_spc_warns = 0;
/*
* Internally, we don't reset all the timers when quota enforcement
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 07989bd67728..9c162e69976b 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -40,9 +40,9 @@ xfs_qm_fill_state(
tstate->spc_timelimit = (u32)defq->blk.time;
tstate->ino_timelimit = (u32)defq->ino.time;
tstate->rt_spc_timelimit = (u32)defq->rtb.time;
- tstate->spc_warnlimit = defq->blk.warn;
- tstate->ino_warnlimit = defq->ino.warn;
- tstate->rt_spc_warnlimit = defq->rtb.warn;
+ tstate->spc_warnlimit = 0;
+ tstate->ino_warnlimit = 0;
+ tstate->rt_spc_warnlimit = 0;
if (tempqip)
xfs_irele(ip);
}
@@ -98,7 +98,7 @@ xfs_quota_type(int type)
}
}
-#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK)
+#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK)
/*
* Adjust quota timers & warnings
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 0d868c93144d..7e97bf19793d 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_cui_item_free(
struct xfs_cui_log_item *cuip)
{
+ kmem_free(cuip->cui_item.li_lv_shadow);
if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
kmem_free(cuip);
else
@@ -53,10 +54,11 @@ xfs_cui_release(
struct xfs_cui_log_item *cuip)
{
ASSERT(atomic_read(&cuip->cui_refcount) > 0);
- if (atomic_dec_and_test(&cuip->cui_refcount)) {
- xfs_trans_ail_delete(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_cui_item_free(cuip);
- }
+ if (!atomic_dec_and_test(&cuip->cui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&cuip->cui_item, 0);
+ xfs_cui_item_free(cuip);
}
@@ -204,14 +206,24 @@ xfs_cud_item_release(
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
xfs_cui_release(cudp->cud_cuip);
+ kmem_free(cudp->cud_item.li_lv_shadow);
kmem_cache_free(xfs_cud_cache, cudp);
}
+static struct xfs_log_item *
+xfs_cud_item_intent(
+ struct xfs_log_item *lip)
+{
+ return &CUD_ITEM(lip)->cud_cuip->cui_item;
+}
+
static const struct xfs_item_ops xfs_cud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_cud_item_size,
.iop_format = xfs_cud_item_format,
.iop_release = xfs_cud_item_release,
+ .iop_intent = xfs_cud_item_intent,
};
static struct xfs_cud_log_item *
@@ -259,7 +271,7 @@ xfs_trans_log_finish_refcount_update(
* 1.) releases the CUI and frees the CUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
return error;
@@ -600,6 +612,7 @@ xfs_cui_item_relog(
}
static const struct xfs_item_ops xfs_cui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_cui_item_size,
.iop_format = xfs_cui_item_format,
.iop_unpin = xfs_cui_item_unpin,
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 54e68e5693fd..e7a7c00d93be 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -586,21 +586,21 @@ out:
STATIC int
xfs_reflink_end_cow_extent(
struct xfs_inode *ip,
- xfs_fileoff_t offset_fsb,
- xfs_fileoff_t *end_fsb)
+ xfs_fileoff_t *offset_fsb,
+ xfs_fileoff_t end_fsb)
{
- struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got, del, data;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
- xfs_filblks_t rlen;
unsigned int resblks;
+ int nmaps;
int error;
/* No COW extents? That's easy! */
if (ifp->if_bytes == 0) {
- *end_fsb = offset_fsb;
+ *offset_fsb = end_fsb;
return 0;
}
@@ -620,6 +620,9 @@ xfs_reflink_end_cow_extent(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_REFLINK_END_COW_CNT);
if (error)
goto out_cancel;
@@ -628,42 +631,66 @@ xfs_reflink_end_cow_extent(
* left by the time I/O completes for the loser of the race. In that
* case we are done.
*/
- if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
- got.br_startoff + got.br_blockcount <= offset_fsb) {
- *end_fsb = offset_fsb;
+ if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
+ got.br_startoff >= end_fsb) {
+ *offset_fsb = end_fsb;
goto out_cancel;
}
/*
- * Structure copy @got into @del, then trim @del to the range that we
- * were asked to remap. We preserve @got for the eventual CoW fork
- * deletion; from now on @del represents the mapping that we're
- * actually remapping.
- */
- del = got;
- xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);
-
- ASSERT(del.br_blockcount > 0);
-
- /*
* Only remap real extents that contain data. With AIO, speculative
* preallocations can leak into the range we are called upon, and we
- * need to skip them.
+ * need to skip them. Preserve @got for the eventual CoW fork
+ * deletion; from now on @del represents the mapping that we're
+ * actually remapping.
*/
- if (!xfs_bmap_is_written_extent(&got)) {
- *end_fsb = del.br_startoff;
- goto out_cancel;
+ while (!xfs_bmap_is_written_extent(&got)) {
+ if (!xfs_iext_next_extent(ifp, &icur, &got) ||
+ got.br_startoff >= end_fsb) {
+ *offset_fsb = end_fsb;
+ goto out_cancel;
+ }
}
+ del = got;
- /* Unmap the old blocks in the data fork. */
- rlen = del.br_blockcount;
- error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
+ /* Grab the corresponding mapping in the data fork. */
+ nmaps = 1;
+ error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
+ &nmaps, 0);
if (error)
goto out_cancel;
- /* Trim the extent to whatever got unmapped. */
- xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
- trace_xfs_reflink_cow_remap(ip, &del);
+ /* We can only remap the smaller of the two extent sizes. */
+ data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
+ del.br_blockcount = data.br_blockcount;
+
+ trace_xfs_reflink_cow_remap_from(ip, &del);
+ trace_xfs_reflink_cow_remap_to(ip, &data);
+
+ if (xfs_bmap_is_real_extent(&data)) {
+ /*
+ * If the extent we're remapping is backed by storage (written
+ * or not), unmap the extent and drop its refcount.
+ */
+ xfs_bmap_unmap_extent(tp, ip, &data);
+ xfs_refcount_decrease_extent(tp, &data);
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
+ -data.br_blockcount);
+ } else if (data.br_startblock == DELAYSTARTBLOCK) {
+ int done;
+
+ /*
+ * If the extent we're remapping is a delalloc reservation,
+ * we can use the regular bunmapi function to release the
+ * incore state. Dropping the delalloc reservation takes care
+ * of the quota reservation for us.
+ */
+ error = xfs_bunmapi(NULL, ip, data.br_startoff,
+ data.br_blockcount, 0, 1, &done);
+ if (error)
+ goto out_cancel;
+ ASSERT(done);
+ }
/* Free the CoW orphan record. */
xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
@@ -684,7 +711,7 @@ xfs_reflink_end_cow_extent(
return error;
/* Update the caller about how much progress we made. */
- *end_fsb = del.br_startoff;
+ *offset_fsb = del.br_startoff + del.br_blockcount;
return 0;
out_cancel:
@@ -712,7 +739,7 @@ xfs_reflink_end_cow(
end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
/*
- * Walk backwards until we're out of the I/O range. The loop function
+ * Walk forwards until we've remapped the I/O range. The loop function
* repeatedly cycles the ILOCK to allocate one transaction per remapped
* extent.
*
@@ -744,7 +771,7 @@ xfs_reflink_end_cow(
* blocks will be remapped.
*/
while (end_fsb > offset_fsb && !error)
- error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);
+ error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
if (error)
trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
@@ -1121,6 +1148,8 @@ xfs_reflink_remap_extent(
++iext_delta;
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto out_cancel;
@@ -1133,7 +1162,7 @@ xfs_reflink_remap_extent(
xfs_refcount_decrease_extent(tp, &smap);
qdelta -= smap.br_blockcount;
} else if (smap.br_startblock == DELAYSTARTBLOCK) {
- xfs_filblks_t len = smap.br_blockcount;
+ int done;
/*
* If the extent we're unmapping is a delalloc reservation,
@@ -1141,10 +1170,11 @@ xfs_reflink_remap_extent(
* incore state. Dropping the delalloc reservation takes care
* of the quota reservation for us.
*/
- error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
+ error = xfs_bunmapi(NULL, ip, smap.br_startoff,
+ smap.br_blockcount, 0, 1, &done);
if (error)
goto out_cancel;
- ASSERT(len == 0);
+ ASSERT(done);
}
/*
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index a22b2d19ef91..fef92e02f3bb 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -35,6 +35,7 @@ STATIC void
xfs_rui_item_free(
struct xfs_rui_log_item *ruip)
{
+ kmem_free(ruip->rui_item.li_lv_shadow);
if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
kmem_free(ruip);
else
@@ -53,10 +54,11 @@ xfs_rui_release(
struct xfs_rui_log_item *ruip)
{
ASSERT(atomic_read(&ruip->rui_refcount) > 0);
- if (atomic_dec_and_test(&ruip->rui_refcount)) {
- xfs_trans_ail_delete(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_rui_item_free(ruip);
- }
+ if (!atomic_dec_and_test(&ruip->rui_refcount))
+ return;
+
+ xfs_trans_ail_delete(&ruip->rui_item, 0);
+ xfs_rui_item_free(ruip);
}
STATIC void
@@ -227,14 +229,24 @@ xfs_rud_item_release(
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
xfs_rui_release(rudp->rud_ruip);
+ kmem_free(rudp->rud_item.li_lv_shadow);
kmem_cache_free(xfs_rud_cache, rudp);
}
+/* Return the RUI log item referenced by this RUD (intent-done) item. */
+static struct xfs_log_item *
+xfs_rud_item_intent(
+ struct xfs_log_item *lip)
+{
+ struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
+
+ return &rudp->rud_ruip->rui_item;
+}
+
static const struct xfs_item_ops xfs_rud_item_ops = {
- .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED,
+ .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED |
+ XFS_ITEM_INTENT_DONE,
.iop_size = xfs_rud_item_size,
.iop_format = xfs_rud_item_format,
.iop_release = xfs_rud_item_release,
+ .iop_intent = xfs_rud_item_intent,
};
static struct xfs_rud_log_item *
@@ -327,7 +339,7 @@ xfs_trans_log_finish_rmap_update(
* 1.) releases the RUI and frees the RUD
* 2.) shuts down the filesystem
*/
- tp->t_flags |= XFS_TRANS_DIRTY;
+ tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
return error;
@@ -630,6 +642,7 @@ xfs_rui_item_relog(
}
static const struct xfs_item_ops xfs_rui_item_ops = {
+ .flags = XFS_ITEM_INTENT,
.iop_size = xfs_rui_item_size,
.iop_format = xfs_rui_item_format,
.iop_unpin = xfs_rui_item_unpin,
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index b8c79ee791af..292d5e54a92c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -806,6 +806,9 @@ xfs_growfs_rt_alloc(
error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
XFS_IEXT_ADD_NOSPLIT_CNT);
+ if (error == -EFBIG)
+ error = xfs_iext_count_upgrade(tp, ip,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
if (error)
goto out_trans_cancel;
@@ -1284,6 +1287,44 @@ xfs_rtmount_init(
return 0;
}
+/*
+ * xfs_rtalloc_query_range_fn callback: add rec->ar_extcount to the uint64_t
+ * accumulator that priv points to. Always returns 0; mp and tp are unused.
+ */
+static int
+xfs_rtalloc_count_frextent(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ uint64_t *total = priv;
+
+ *total = *total + rec->ar_extcount;
+ return 0;
+}
+
+/*
+ * Reinitialize the number of free realtime extents from the realtime bitmap.
+ * Callers must ensure that there is no other activity in the filesystem.
+ *
+ * Sums every record returned by xfs_rtalloc_query_all() while holding the
+ * ILOCK on mp->m_rbmip, then stores the total in the incore superblock
+ * (under m_sb_lock) and in the m_frextents percpu counter. Returns 0 on
+ * success or the error reported by the query.
+ */
+int
+xfs_rtalloc_reinit_frextents(
+ struct xfs_mount *mp)
+{
+ uint64_t val = 0;
+ int error;
+
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent,
+ &val);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_frextents = val;
+ spin_unlock(&mp->m_sb_lock);
+ /* Seed from the local sum instead of re-reading sb_frextents unlocked. */
+ percpu_counter_set(&mp->m_frextents, val);
+ return 0;
+}
+
/*
* Get the bitmap and summary inodes and the summary cache into the mount
* structure at mount time.
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 91b00289509b..62c7ad79cbb6 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -22,6 +22,7 @@ struct xfs_rtalloc_rec {
};
typedef int (*xfs_rtalloc_query_range_fn)(
+ struct xfs_mount *mp,
struct xfs_trans *tp,
const struct xfs_rtalloc_rec *rec,
void *priv);
@@ -123,27 +124,29 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-int xfs_rtalloc_query_range(struct xfs_trans *tp,
+int xfs_rtalloc_query_range(struct xfs_mount *mp, struct xfs_trans *tp,
const struct xfs_rtalloc_rec *low_rec,
const struct xfs_rtalloc_rec *high_rec,
xfs_rtalloc_query_range_fn fn, void *priv);
-int xfs_rtalloc_query_all(struct xfs_trans *tp,
+int xfs_rtalloc_query_all(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtalloc_query_range_fn fn,
void *priv);
bool xfs_verify_rtbno(struct xfs_mount *mp, xfs_rtblock_t rtbno);
int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_rtblock_t start, xfs_extlen_t len,
bool *is_free);
+int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
#else
# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS)
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
# define xfs_growfs_rt(mp,in) (ENOSYS)
# define xfs_rtalloc_query_range(m,t,l,h,f,p) (ENOSYS)
-# define xfs_rtalloc_query_all(t,f,p) (ENOSYS)
+# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS)
# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS)
# define xfs_verify_rtbno(m, r) (false)
# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS)
+# define xfs_rtalloc_reinit_frextents(m) (0)
static inline int /* error */
xfs_rtmount_init(
xfs_mount_t *mp) /* file system mount structure */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index a276b8111f63..8495ef076ffc 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -843,9 +843,11 @@ xfs_fs_statfs(
if (XFS_IS_REALTIME_MOUNT(mp) &&
(ip->i_diflags & (XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_REALTIME))) {
+ s64 freertx;
+
statp->f_blocks = sbp->sb_rblocks;
- statp->f_bavail = statp->f_bfree =
- sbp->sb_frextents * sbp->sb_rextsize;
+ freertx = percpu_counter_sum_positive(&mp->m_frextents);
+ statp->f_bavail = statp->f_bfree = freertx * sbp->sb_rextsize;
}
return 0;
@@ -1015,8 +1017,14 @@ xfs_init_percpu_counters(
if (error)
goto free_fdblocks;
+ error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
+ if (error)
+ goto free_delalloc;
+
return 0;
+free_delalloc:
+ percpu_counter_destroy(&mp->m_delalloc_blks);
free_fdblocks:
percpu_counter_destroy(&mp->m_fdblocks);
free_ifree:
@@ -1033,6 +1041,7 @@ xfs_reinit_percpu_counters(
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
+ percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
}
static void
@@ -1045,6 +1054,7 @@ xfs_destroy_percpu_counters(
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
+ percpu_counter_destroy(&mp->m_frextents);
}
static int
@@ -1635,6 +1645,10 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
+ if (xfs_has_large_extent_counts(mp))
+ xfs_warn(mp,
+ "EXPERIMENTAL Large extent counts feature in use. Use at your own risk!");
+
error = xfs_mountfs(mp);
if (error)
goto out_filestream_unmount;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index affbedf78160..4145ba872547 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -226,11 +226,6 @@ xfs_symlink(
goto out_trans_cancel;
}
- error = xfs_iext_count_may_overflow(dp, XFS_DATA_FORK,
- XFS_IEXT_DIR_MANIP_CNT(mp));
- if (error)
- goto out_trans_cancel;
-
/*
* Allocate an inode for the symlink.
*/
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 7692e76ead33..f78ad6b10ea5 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -83,6 +83,7 @@ extern xfs_param_t xfs_params;
struct xfs_globals {
#ifdef DEBUG
int pwork_threads; /* parallel workqueue threads */
+ bool larp; /* log attribute replay */
#endif
int log_recovery_delay; /* log recovery delay (secs) */
int mount_delay; /* mount setup delay (secs) */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 574b80c29fe1..f7faf6e70d7f 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -228,6 +228,29 @@ pwork_threads_show(
return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads);
}
XFS_SYSFS_ATTR_RW(pwork_threads);
+
+/*
+ * Sysfs store handler: parse buf as a boolean into xfs_globals.larp (log
+ * attribute replay). Returns count on success or the negative error from
+ * kstrtobool().
+ */
+static ssize_t
+larp_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int error = kstrtobool(buf, &xfs_globals.larp);
+
+ if (error < 0)
+ return error;
+ return count;
+}
+
+/* Sysfs show handler: emit the current xfs_globals.larp setting as 0 or 1. */
+STATIC ssize_t
+larp_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return sysfs_emit(buf, "%d\n", xfs_globals.larp);
+}
+XFS_SYSFS_ATTR_RW(larp);
#endif /* DEBUG */
static struct attribute *xfs_dbg_attrs[] = {
@@ -237,6 +260,7 @@ static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(always_cow),
#ifdef DEBUG
ATTR_LIST(pwork_threads),
+ ATTR_LIST(larp),
#endif
NULL,
};
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index b141ef78c755..d32026585c1b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -418,6 +418,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__field(unsigned, lockval)
__field(unsigned, flags)
__field(unsigned long, caller_ip)
+ __field(const void *, buf_ops)
),
TP_fast_assign(
__entry->dev = bp->b_target->bt_dev;
@@ -428,9 +429,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->lockval = bp->b_sema.count;
__entry->flags = bp->b_flags;
__entry->caller_ip = caller_ip;
+ __entry->buf_ops = bp->b_ops;
),
TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d "
- "lock %d flags %s caller %pS",
+ "lock %d flags %s bufops %pS caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->bno,
__entry->nblks,
@@ -438,6 +440,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
__entry->pincount,
__entry->lockval,
__print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+ __entry->buf_ops,
(void *)__entry->caller_ip)
)
@@ -1096,22 +1099,6 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_done);
DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_before);
DEFINE_DQUOT_EVENT(xfs_trans_apply_dquot_deltas_after);
-#define XFS_QMOPT_FLAGS \
- { XFS_QMOPT_UQUOTA, "UQUOTA" }, \
- { XFS_QMOPT_PQUOTA, "PQUOTA" }, \
- { XFS_QMOPT_FORCE_RES, "FORCE_RES" }, \
- { XFS_QMOPT_SBVERSION, "SBVERSION" }, \
- { XFS_QMOPT_GQUOTA, "GQUOTA" }, \
- { XFS_QMOPT_INHERIT, "INHERIT" }, \
- { XFS_QMOPT_RES_REGBLKS, "RES_REGBLKS" }, \
- { XFS_QMOPT_RES_RTBLKS, "RES_RTBLKS" }, \
- { XFS_QMOPT_BCOUNT, "BCOUNT" }, \
- { XFS_QMOPT_ICOUNT, "ICOUNT" }, \
- { XFS_QMOPT_RTBCOUNT, "RTBCOUNT" }, \
- { XFS_QMOPT_DELBCOUNT, "DELBCOUNT" }, \
- { XFS_QMOPT_DELRTBCOUNT, "DELRTBCOUNT" }, \
- { XFS_QMOPT_RES_INOS, "RES_INOS" }
-
TRACE_EVENT(xfs_trans_mod_dquot,
TP_PROTO(struct xfs_trans *tp, struct xfs_dquot *dqp,
unsigned int field, int64_t delta),
@@ -1348,6 +1335,9 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip);
+DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin);
DECLARE_EVENT_CLASS(xfs_ail_class,
TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
@@ -1924,7 +1914,7 @@ DECLARE_EVENT_CLASS(xfs_da_class,
__field(int, namelen)
__field(xfs_dahash_t, hashval)
__field(xfs_ino_t, inumber)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
),
TP_fast_assign(
__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
@@ -1990,7 +1980,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__field(xfs_dahash_t, hashval)
__field(unsigned int, attr_filter)
__field(unsigned int, attr_flags)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
),
TP_fast_assign(
__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
@@ -2097,7 +2087,7 @@ DECLARE_EVENT_CLASS(xfs_dir2_space_class,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
__field(int, idx)
),
TP_fast_assign(
@@ -2128,7 +2118,7 @@ TRACE_EVENT(xfs_dir2_leafn_moveents,
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
- __field(int, op_flags)
+ __field(uint32_t, op_flags)
__field(int, src_idx)
__field(int, dst_idx)
__field(int, count)
@@ -2169,7 +2159,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__field(int, which)
__field(xfs_ino_t, ino)
__field(int, format)
- __field(int, nex)
+ __field(xfs_extnum_t, nex)
__field(int, broot_size)
__field(int, fork_off)
),
@@ -2182,7 +2172,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
__entry->broot_size = ip->i_df.if_broot_bytes;
__entry->fork_off = XFS_IFORK_BOFF(ip);
),
- TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
+ TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %llu, "
"broot size %d, forkoff 0x%x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
@@ -3418,7 +3408,8 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
@@ -3513,7 +3504,7 @@ DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key);
DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key);
DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping);
-TRACE_EVENT(xfs_trans_resv_calc,
+DECLARE_EVENT_CLASS(xfs_trans_resv_class,
TP_PROTO(struct xfs_mount *mp, unsigned int type,
struct xfs_trans_res *res),
TP_ARGS(mp, type, res),
@@ -3537,6 +3528,33 @@ TRACE_EVENT(xfs_trans_resv_calc,
__entry->logres,
__entry->logcount,
__entry->logflags)
+)
+
+#define DEFINE_TRANS_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_trans_resv_class, name, \
+ TP_PROTO(struct xfs_mount *mp, unsigned int type, \
+ struct xfs_trans_res *res), \
+ TP_ARGS(mp, type, res))
+DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc);
+DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc_minlogsize);
+
+TRACE_EVENT(xfs_log_get_max_trans_res,
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_trans_res *res),
+ TP_ARGS(mp, res),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint, logres)
+ __field(int, logcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->logres = res->tr_logres;
+ __entry->logcount = res->tr_logcount;
+ ),
+ TP_printk("dev %d:%d logres %u logcount %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->logres,
+ __entry->logcount)
);
DECLARE_EVENT_CLASS(xfs_trans_class,
@@ -4111,6 +4129,27 @@ DEFINE_ICLOG_EVENT(xlog_iclog_want_sync);
DEFINE_ICLOG_EVENT(xlog_iclog_wait_on);
DEFINE_ICLOG_EVENT(xlog_iclog_write);
+TRACE_DEFINE_ENUM(XFS_DAS_UNINIT);
+TRACE_DEFINE_ENUM(XFS_DAS_SF_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_SF_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_ADD);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_SET_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_ALLOC_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REPLACE);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_OLD);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_LEAF_REMOVE_ATTR);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_SET_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_ALLOC_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REPLACE);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_OLD);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_RMT);
+TRACE_DEFINE_ENUM(XFS_DAS_NODE_REMOVE_ATTR);
+TRACE_DEFINE_ENUM(XFS_DAS_DONE);
+
DECLARE_EVENT_CLASS(xfs_das_state_class,
TP_PROTO(int das, struct xfs_inode *ip),
TP_ARGS(das, ip),
@@ -4122,8 +4161,9 @@ DECLARE_EVENT_CLASS(xfs_das_state_class,
__entry->das = das;
__entry->ino = ip->i_ino;
),
- TP_printk("state change %d ino 0x%llx",
- __entry->das, __entry->ino)
+ TP_printk("state change %s ino 0x%llx",
+ __print_symbolic(__entry->das, XFS_DAS_STRINGS),
+ __entry->ino)
)
#define DEFINE_DAS_STATE_EVENT(name) \
@@ -4132,9 +4172,15 @@ DEFINE_EVENT(xfs_das_state_class, name, \
TP_ARGS(das, ip))
DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace);
+DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove);
+
TRACE_EVENT(xfs_force_shutdown,
TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 0ac717aad380..82cf0189c0db 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -32,7 +32,6 @@ static void
xfs_trans_trace_reservations(
struct xfs_mount *mp)
{
- struct xfs_trans_res resv;
struct xfs_trans_res *res;
struct xfs_trans_res *end_res;
int i;
@@ -41,8 +40,6 @@ xfs_trans_trace_reservations(
end_res = (struct xfs_trans_res *)(M_RES(mp) + 1);
for (i = 0; res < end_res; i++, res++)
trace_xfs_trans_resv_calc(mp, i, res);
- xfs_log_get_max_trans_res(mp, &resv);
- trace_xfs_trans_resv_calc(mp, -1, &resv);
}
#else
# define xfs_trans_trace_reservations(mp)
@@ -194,11 +191,9 @@ xfs_trans_reserve(
ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
error = xfs_log_regrant(mp, tp->t_ticket);
} else {
- error = xfs_log_reserve(mp,
- resp->tr_logres,
+ error = xfs_log_reserve(mp, resp->tr_logres,
resp->tr_logcount,
- &tp->t_ticket, XFS_TRANSACTION,
- permanent);
+ &tp->t_ticket, permanent);
}
if (error)
@@ -498,10 +493,31 @@ xfs_trans_apply_sb_deltas(
be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta);
}
- if (tp->t_frextents_delta)
- be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta);
- if (tp->t_res_frextents_delta)
- be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta);
+ /*
+ * Updating frextents requires careful handling because it does not
+ * behave like the lazysb counters because we cannot rely on log
+ * recovery in older kernels to recompute the value from the rtbitmap.
+ * This means that the ondisk frextents must be consistent with the
+ * rtbitmap.
+ *
+ * Therefore, log the frextents change to the ondisk superblock and
+ * update the incore superblock so that future calls to xfs_log_sb
+ * write the correct value ondisk.
+ *
+ * Don't touch m_frextents because it includes incore reservations,
+ * and those are handled by the unreserve function.
+ */
+ if (tp->t_frextents_delta || tp->t_res_frextents_delta) {
+ struct xfs_mount *mp = tp->t_mountp;
+ int64_t rtxdelta;
+
+ rtxdelta = tp->t_frextents_delta + tp->t_res_frextents_delta;
+
+ spin_lock(&mp->m_sb_lock);
+ be64_add_cpu(&sbp->sb_frextents, rtxdelta);
+ mp->m_sb.sb_frextents += rtxdelta;
+ spin_unlock(&mp->m_sb_lock);
+ }
if (tp->t_dblocks_delta) {
be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta);
@@ -614,7 +630,12 @@ xfs_trans_unreserve_and_mod_sb(
if (ifreedelta)
percpu_counter_add(&mp->m_ifree, ifreedelta);
- if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
+ if (rtxdelta) {
+ error = xfs_mod_frextents(mp, rtxdelta);
+ ASSERT(!error);
+ }
+
+ if (!(tp->t_flags & XFS_TRANS_SB_DIRTY))
return;
/* apply remaining deltas */
@@ -622,7 +643,12 @@ xfs_trans_unreserve_and_mod_sb(
mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta;
mp->m_sb.sb_icount += idelta;
mp->m_sb.sb_ifree += ifreedelta;
- mp->m_sb.sb_frextents += rtxdelta;
+ /*
+ * Do not touch sb_frextents here because we are dealing with incore
+ * reservation. sb_frextents is not part of the lazy sb counters so it
+ * must be consistent with the ondisk rtbitmap and must never include
+ * incore reservations.
+ */
mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
mp->m_sb.sb_agcount += tp->t_agcount_delta;
mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 0c82673238f4..9561f193e7e1 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -55,13 +55,15 @@ struct xfs_log_item {
#define XFS_LI_IN_AIL 0
#define XFS_LI_ABORTED 1
#define XFS_LI_FAILED 2
-#define XFS_LI_DIRTY 3 /* log item dirty in transaction */
+#define XFS_LI_DIRTY 3
+#define XFS_LI_WHITEOUT 4
#define XFS_LI_FLAGS \
- { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \
- { (1 << XFS_LI_ABORTED), "ABORTED" }, \
- { (1 << XFS_LI_FAILED), "FAILED" }, \
- { (1 << XFS_LI_DIRTY), "DIRTY" }
+ { (1u << XFS_LI_IN_AIL), "IN_AIL" }, \
+ { (1u << XFS_LI_ABORTED), "ABORTED" }, \
+ { (1u << XFS_LI_FAILED), "FAILED" }, \
+ { (1u << XFS_LI_DIRTY), "DIRTY" }, \
+ { (1u << XFS_LI_WHITEOUT), "WHITEOUT" }
struct xfs_item_ops {
unsigned flags;
@@ -78,30 +80,32 @@ struct xfs_item_ops {
bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
struct xfs_trans *tp);
+ struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done);
};
-/* Is this log item a deferred action intent? */
+/*
+ * Log item ops flags
+ */
+/*
+ * Release the log item when the journal commits instead of inserting into the
+ * AIL for writeback tracking and/or log tail pinning.
+ */
+#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0)
+#define XFS_ITEM_INTENT (1 << 1)
+#define XFS_ITEM_INTENT_DONE (1 << 2)
+
static inline bool
xlog_item_is_intent(struct xfs_log_item *lip)
{
- return lip->li_ops->iop_recover != NULL &&
- lip->li_ops->iop_match != NULL;
+ return lip->li_ops->flags & XFS_ITEM_INTENT;
}
-/* Is this a log intent-done item? */
static inline bool
xlog_item_is_intent_done(struct xfs_log_item *lip)
{
- return lip->li_ops->iop_unpin == NULL &&
- lip->li_ops->iop_push == NULL;
+ return lip->li_ops->flags & XFS_ITEM_INTENT_DONE;
}
-/*
- * Release the log item as soon as committed. This is for items just logging
- * intents that never need to be written back in place.
- */
-#define XFS_ITEM_RELEASE_WHEN_COMMITTED (1 << 0)
-
void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
int type, const struct xfs_item_ops *ops);
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 9ba7e6b9bed3..aa00cf67ad72 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -597,13 +597,11 @@ xfs_dqresv_check(
if (softlimit && total_count > softlimit) {
time64_t now = ktime_get_real_seconds();
- if ((res->timer != 0 && now > res->timer) ||
- (res->warnings != 0 && res->warnings >= qlim->warn)) {
+ if (res->timer != 0 && now > res->timer) {
*fatal = true;
return QUOTA_NL_ISOFTLONGWARN;
}
- res->warnings++;
return QUOTA_NL_ISOFTWARN;
}
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 0d050f8829ef..7a044afd4c46 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -12,9 +12,9 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
+#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_acl.h"
-#include "xfs_da_btree.h"
#include <linux/posix_acl_xattr.h>