author    Linus Torvalds <torvalds@linux-foundation.org>  2020-08-07 10:57:29 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2020-08-07 10:57:29 -0700
commit    5631c5e0eb9035d92ceb20fcd9cdb7779a3f5cc7 (patch)
tree      4c293828f0220bb009d0fe18e66e209bef35e4ef  /fs/xfs/xfs_reflink.c
parent    e51418191f5a741b5f94764798c81bf69dec4806 (diff)
parent    818d5a91559ffe1e1f2095dcbbdb96c13fdb94ec (diff)
download  linux-5631c5e0eb9035d92ceb20fcd9cdb7779a3f5cc7.tar.bz2
Merge tag 'xfs-5.9-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
Pull xfs updates from Darrick Wong:
 "There are quite a few changes in this release, the most notable of which
  is that we've made inode flushing fully asynchronous, and we no longer
  block memory reclaim on this.

  Furthermore, we have fixed a long-standing bug in the quota code where
  soft limit warnings and inode limits were never tracked properly.

  Moving further down the line, the reflink control loops have been
  redesigned to behave more efficiently; and numerous small bugs have been
  fixed (see below). The xattr and quota code have been extensively
  refactored in preparation for more new features coming down the line.

  Finally, the behavior of DAX between ext4 and xfs has been stabilized,
  which gets us a step closer to removing the experimental tag from that
  feature.

  We have a few new contributors this time around. Welcome, all!

  I anticipate a second pull request next week for a few small bugfixes
  that have been trickling in, but this is it for big changes.

  Summary:

   - Fix some btree block pingponging problems when swapping extents

   - Redesign the reflink copy loop so that we only run one remapping
     operation per transaction. This helps us avoid running out of block
     reservation on highly deduped filesystems.

   - Take the MMAPLOCK around filemap_map_pages.

   - Make inode reclaim fully async so that we avoid stalling processes
     on flushing inodes to disk.

   - Reduce inode cluster buffer RMW cycles by attaching the buffer to
     dirty inodes so we won't let go of the cluster buffer when we know
     we're going to need it soon.

   - Add some more checks to the realtime bitmap file scrubber.

   - Don't trip false lockdep warnings in fs freeze.

   - Remove various redundant lines of code.

   - Remove unnecessary calls to xfs_perag_{get,put}.

   - Preserve I_VERSION state across remounts.

   - Fix an unmount hang due to AIL going to sleep with a non-empty
     delwri buffer list.

   - Fix an error in the inode allocation space reservation macro that
     caused regressions in generic/531.

   - Fix a potential livelock when dquot flush fails because the dquot
     buffer is locked.

   - Fix a miscalculation when reserving inode quota that could cause
     users to exceed a hardlimit.

   - Refactor struct xfs_dquot to use native types for incore fields
     instead of abusing the ondisk struct for this purpose. This will
     eventually enable proper y2038+ support, but for now it merely
     cleans up the quota function declarations.

   - Actually increment the quota softlimit warning counter so that soft
     failures turn into hard(er) failures when they exceed the softlimit
     warning counter limits set by the administrator.

   - Split incore dquot state flags into their own field and namespace,
     to avoid mixing them with quota type flags.

   - Create a new quota type flags namespace so that we can make it
     obvious when a quota function takes a quota type (user, group,
     project) as an argument.

   - Rename the ondisk dquot flags field to type, as that more accurately
     represents what we store in it.

   - Drop our bespoke memory allocation flags in favor of GFP_*.

   - Rearrange the xattr functions so that we no longer mix metadata
     updates and transaction management (e.g. rolling complex
     transactions) in the same functions. This work will prepare us for
     atomic xattr operations (itself a prerequisite for directory
     backrefs) in future release cycles.

   - Support FS_DAX_FL (aka FS_XFLAG_DAX) via GETFLAGS/SETFLAGS"

* tag 'xfs-5.9-merge-7' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (117 commits)
  fs/xfs: Support that ioctl(SETXFLAGS/GETXFLAGS) can set/get inode DAX on XFS.
  xfs: Lift -ENOSPC handler from xfs_attr_leaf_addname
  xfs: Simplify xfs_attr_node_addname
  xfs: Simplify xfs_attr_leaf_addname
  xfs: Add helper function xfs_attr_node_removename_rmt
  xfs: Add helper function xfs_attr_node_removename_setup
  xfs: Add remote block helper functions
  xfs: Add helper function xfs_attr_leaf_mark_incomplete
  xfs: Add helpers xfs_attr_is_shortform and xfs_attr_set_shortform
  xfs: Remove xfs_trans_roll in xfs_attr_node_removename
  xfs: Remove unneeded xfs_trans_roll_inode calls
  xfs: Add helper function xfs_attr_node_shrink
  xfs: Pull up xfs_attr_rmtval_invalidate
  xfs: Refactor xfs_attr_rmtval_remove
  xfs: Pull up trans roll in xfs_attr3_leaf_clearflag
  xfs: Factor out xfs_attr_rmtval_invalidate
  xfs: Pull up trans roll from xfs_attr3_leaf_setflag
  xfs: Refactor xfs_attr_try_sf_addname
  xfs: Split apart xfs_attr_leaf_addname
  xfs: Pull up trans handling in xfs_attr3_leaf_flipflags
  ...
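
The last item in the summary above ("Support FS_DAX_FL (aka FS_XFLAG_DAX) via
GETFLAGS/SETFLAGS") is wired up by the first commit in the shortlog. For readers
unfamiliar with that interface, here is a minimal userspace sketch, not part of
the series; the file path is a placeholder:

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>   /* FS_IOC_GETFLAGS, FS_IOC_SETFLAGS, FS_DAX_FL */

    int main(void)
    {
        /* Hypothetical file on an XFS mount. */
        int fd = open("/mnt/xfs/testfile", O_RDWR);
        int flags;

        if (fd < 0)
            return 1;
        if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
            return 1;
        flags |= FS_DAX_FL;     /* request DAX access for this inode */
        if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
            return 1;
        /*
         * The on-disk flag changes immediately; the new access mode takes
         * effect once the inode is evicted from cache and reloaded.
         */
        return 0;
    }

This is the same ioctl pair that chattr uses, so the flag can also be flipped
with chattr +x / chattr -x on kernels that carry this series.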
Diffstat (limited to 'fs/xfs/xfs_reflink.c')
-rw-r--r--  fs/xfs/xfs_reflink.c | 355
 1 file changed, 163 insertions(+), 192 deletions(-)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 107bf2a2f344..aac83f9d6107 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -179,7 +179,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
*shared = false;
return 0;
}
@@ -655,7 +655,7 @@ xfs_reflink_end_cow_extent(
* preallocations can leak into the range we are called upon, and we
* need to skip them.
*/
- if (!xfs_bmap_is_real_extent(&got)) {
+ if (!xfs_bmap_is_written_extent(&got)) {
*end_fsb = del.br_startoff;
goto out_cancel;
}
@@ -984,40 +984,28 @@ xfs_reflink_ag_has_free_space(
}
/*
- * Unmap a range of blocks from a file, then map other blocks into the hole.
- * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
- * The extent irec is mapped into dest at irec->br_startoff.
+ * Remap the given extent into the file. The dmap blockcount will be set to
+ * the number of blocks that were actually remapped.
*/
STATIC int
xfs_reflink_remap_extent(
struct xfs_inode *ip,
- struct xfs_bmbt_irec *irec,
- xfs_fileoff_t destoff,
+ struct xfs_bmbt_irec *dmap,
xfs_off_t new_isize)
{
+ struct xfs_bmbt_irec smap;
struct xfs_mount *mp = ip->i_mount;
- bool real_extent = xfs_bmap_is_real_extent(irec);
struct xfs_trans *tp;
- unsigned int resblks;
- struct xfs_bmbt_irec uirec;
- xfs_filblks_t rlen;
- xfs_filblks_t unmap_len;
xfs_off_t newlen;
+ int64_t qres, qdelta;
+ unsigned int resblks;
+ bool smap_real;
+ bool dmap_written = xfs_bmap_is_written_extent(dmap);
+ int nimaps;
int error;
- unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
- trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
-
- /* No reflinking if we're low on space */
- if (real_extent) {
- error = xfs_reflink_ag_has_free_space(mp,
- XFS_FSB_TO_AGNO(mp, irec->br_startblock));
- if (error)
- goto out;
- }
-
/* Start a rolling transaction to switch the mappings */
- resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
if (error)
goto out;
@@ -1025,87 +1013,147 @@ xfs_reflink_remap_extent(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- /* If we're not just clearing space, then do we have enough quota? */
- if (real_extent) {
- error = xfs_trans_reserve_quota_nblks(tp, ip,
- irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
+ /*
+ * Read what's currently mapped in the destination file into smap.
+ * If smap isn't a hole, we will have to remove it before we can add
+ * dmap to the destination file.
+ */
+ nimaps = 1;
+ error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
+ &smap, &nimaps, 0);
+ if (error)
+ goto out_cancel;
+ ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
+ smap_real = xfs_bmap_is_real_extent(&smap);
+
+ /*
+ * We can only remap as many blocks as the smaller of the two extent
+ * maps, because we can only remap one extent at a time.
+ */
+ dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
+ ASSERT(dmap->br_blockcount == smap.br_blockcount);
+
+ trace_xfs_reflink_remap_extent_dest(ip, &smap);
+
+ /*
+ * Two extents mapped to the same physical block must not have
+ * different states; that's filesystem corruption. Move on to the next
+ * extent if they're both holes or both the same physical extent.
+ */
+ if (dmap->br_startblock == smap.br_startblock) {
+ if (dmap->br_state != smap.br_state)
+ error = -EFSCORRUPTED;
+ goto out_cancel;
+ }
+
+ /* If both extents are unwritten, leave them alone. */
+ if (dmap->br_state == XFS_EXT_UNWRITTEN &&
+ smap.br_state == XFS_EXT_UNWRITTEN)
+ goto out_cancel;
+
+ /* No reflinking if the AG of the dest mapping is low on space. */
+ if (dmap_written) {
+ error = xfs_reflink_ag_has_free_space(mp,
+ XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
if (error)
goto out_cancel;
}
- trace_xfs_reflink_remap(ip, irec->br_startoff,
- irec->br_blockcount, irec->br_startblock);
-
- /* Unmap the old blocks in the data fork. */
- rlen = unmap_len;
- while (rlen) {
- ASSERT(tp->t_firstblock == NULLFSBLOCK);
- error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
+ /*
+ * Compute quota reservation if we think the quota block counter for
+ * this file could increase.
+ *
+ * Adding a written extent to the extent map can cause a bmbt split,
+ * and removing a mapped extent from the extent map can cause a bmbt split.
+ * The two operations cannot both cause a split since they operate on
+ * the same index in the bmap btree, so we only need a reservation for
+ * one bmbt split if either thing is happening.
+ *
+ * If we are mapping a written extent into the file, we need to have
+ * enough quota block count reservation to handle the blocks in that
+ * extent. We log only the delta to the quota block counts, so if the
+ * extent we're unmapping also has blocks allocated to it, we don't
+ * need a quota reservation for the extent itself.
+ *
+ * Note that if we're replacing a delalloc reservation with a written
+ * extent, we have to take the full quota reservation because removing
+ * the delalloc reservation gives the block count back to the quota
+ * count. This is suboptimal, but the VFS flushed the dest range
+ * before we started. That should have removed all the delalloc
+ * reservations, but we code defensively.
+ */
+ qres = qdelta = 0;
+ if (smap_real || dmap_written)
+ qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ if (!smap_real && dmap_written)
+ qres += dmap->br_blockcount;
+ if (qres > 0) {
+ error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
+ XFS_QMOPT_RES_REGBLKS);
if (error)
goto out_cancel;
+ }
+ if (smap_real) {
/*
- * Trim the extent to whatever got unmapped.
- * Remember, bunmapi works backwards.
+ * If the extent we're unmapping is backed by storage (written
+ * or not), unmap the extent and drop its refcount.
*/
- uirec.br_startblock = irec->br_startblock + rlen;
- uirec.br_startoff = irec->br_startoff + rlen;
- uirec.br_blockcount = unmap_len - rlen;
- uirec.br_state = irec->br_state;
- unmap_len = rlen;
-
- /* If this isn't a real mapping, we're done. */
- if (!real_extent || uirec.br_blockcount == 0)
- goto next_extent;
+ xfs_bmap_unmap_extent(tp, ip, &smap);
+ xfs_refcount_decrease_extent(tp, &smap);
+ qdelta -= smap.br_blockcount;
+ } else if (smap.br_startblock == DELAYSTARTBLOCK) {
+ xfs_filblks_t len = smap.br_blockcount;
- trace_xfs_reflink_remap(ip, uirec.br_startoff,
- uirec.br_blockcount, uirec.br_startblock);
-
- /* Update the refcount tree */
- xfs_refcount_increase_extent(tp, &uirec);
-
- /* Map the new blocks into the data fork. */
- xfs_bmap_map_extent(tp, ip, &uirec);
+ /*
+ * If the extent we're unmapping is a delalloc reservation,
+ * we can use the regular bunmapi function to release the
+ * incore state. Dropping the delalloc reservation takes care
+ * of the quota reservation for us.
+ */
+ error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
+ if (error)
+ goto out_cancel;
+ ASSERT(len == 0);
+ }
- /* Update quota accounting. */
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
- uirec.br_blockcount);
+ /*
+ * If the extent we're sharing is backed by written storage, increase
+ * its refcount and map it into the file.
+ */
+ if (dmap_written) {
+ xfs_refcount_increase_extent(tp, dmap);
+ xfs_bmap_map_extent(tp, ip, dmap);
+ qdelta += dmap->br_blockcount;
+ }
- /* Update dest isize if needed. */
- newlen = XFS_FSB_TO_B(mp,
- uirec.br_startoff + uirec.br_blockcount);
- newlen = min_t(xfs_off_t, newlen, new_isize);
- if (newlen > i_size_read(VFS_I(ip))) {
- trace_xfs_reflink_update_inode_size(ip, newlen);
- i_size_write(VFS_I(ip), newlen);
- ip->i_d.di_size = newlen;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- }
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
-next_extent:
- /* Process all the deferred stuff. */
- error = xfs_defer_finish(&tp);
- if (error)
- goto out_cancel;
+ /* Update dest isize if needed. */
+ newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
+ newlen = min_t(xfs_off_t, newlen, new_isize);
+ if (newlen > i_size_read(VFS_I(ip))) {
+ trace_xfs_reflink_update_inode_size(ip, newlen);
+ i_size_write(VFS_I(ip), newlen);
+ ip->i_d.di_size = newlen;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
+ /* Commit everything and unlock. */
error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- goto out;
- return 0;
+ goto out_unlock;
out_cancel:
xfs_trans_cancel(tp);
+out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
- trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
+ if (error)
+ trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
return error;
}
-/*
- * Iteratively remap one file's extents (and holes) to another's.
- */
+/* Remap a range of one file to the other. */
int
xfs_reflink_remap_blocks(
struct xfs_inode *src,
@@ -1116,25 +1164,22 @@ xfs_reflink_remap_blocks(
loff_t *remapped)
{
struct xfs_bmbt_irec imap;
- xfs_fileoff_t srcoff;
- xfs_fileoff_t destoff;
+ struct xfs_mount *mp = src->i_mount;
+ xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in);
+ xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out);
xfs_filblks_t len;
- xfs_filblks_t range_len;
xfs_filblks_t remapped_len = 0;
xfs_off_t new_isize = pos_out + remap_len;
int nimaps;
int error = 0;
- destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
- srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
- len = XFS_B_TO_FSB(src->i_mount, remap_len);
+ len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
+ XFS_MAX_FILEOFF);
- /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
- while (len) {
- uint lock_mode;
+ trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
- trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
- dest, destoff);
+ while (len > 0) {
+ unsigned int lock_mode;
/* Read extent from the source file */
nimaps = 1;
@@ -1143,18 +1188,25 @@ xfs_reflink_remap_blocks(
xfs_iunlock(src, lock_mode);
if (error)
break;
- ASSERT(nimaps == 1);
-
- trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
- &imap);
+ /*
+ * The caller supposedly flushed all dirty pages in the source
+ * file range, which means that writeback should have allocated
+ * or deleted all delalloc reservations in that range. If we
+ * find one, that's a good sign that something is seriously
+ * wrong here.
+ */
+ ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
+ if (imap.br_startblock == DELAYSTARTBLOCK) {
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ error = -EFSCORRUPTED;
+ break;
+ }
- /* Translate imap into the destination file. */
- range_len = imap.br_startoff + imap.br_blockcount - srcoff;
- imap.br_startoff += destoff - srcoff;
+ trace_xfs_reflink_remap_extent_src(src, &imap);
- /* Clear dest from destoff to the end of imap and map it in. */
- error = xfs_reflink_remap_extent(dest, &imap, destoff,
- new_isize);
+ /* Remap into the destination file at the given offset. */
+ imap.br_startoff = destoff;
+ error = xfs_reflink_remap_extent(dest, &imap, new_isize);
if (error)
break;
@@ -1164,10 +1216,10 @@ xfs_reflink_remap_blocks(
}
/* Advance drange/srange */
- srcoff += range_len;
- destoff += range_len;
- len -= range_len;
- remapped_len += range_len;
+ srcoff += imap.br_blockcount;
+ destoff += imap.br_blockcount;
+ len -= imap.br_blockcount;
+ remapped_len += imap.br_blockcount;
}
if (error)
@@ -1178,81 +1230,6 @@ xfs_reflink_remap_blocks(
}
/*
- * Grab the exclusive iolock for a data copy from src to dest, making sure to
- * abide vfs locking order (lowest pointer value goes first) and breaking the
- * layout leases before proceeding. The loop is needed because we cannot call
- * the blocking break_layout() with the iolocks held, and therefore have to
- * back out both locks.
- */
-static int
-xfs_iolock_two_inodes_and_break_layout(
- struct inode *src,
- struct inode *dest)
-{
- int error;
-
- if (src > dest)
- swap(src, dest);
-
-retry:
- /* Wait to break both inodes' layouts before we start locking. */
- error = break_layout(src, true);
- if (error)
- return error;
- if (src != dest) {
- error = break_layout(dest, true);
- if (error)
- return error;
- }
-
- /* Lock one inode and make sure nobody got in and leased it. */
- inode_lock(src);
- error = break_layout(src, false);
- if (error) {
- inode_unlock(src);
- if (error == -EWOULDBLOCK)
- goto retry;
- return error;
- }
-
- if (src == dest)
- return 0;
-
- /* Lock the other inode and make sure nobody got in and leased it. */
- inode_lock_nested(dest, I_MUTEX_NONDIR2);
- error = break_layout(dest, false);
- if (error) {
- inode_unlock(src);
- inode_unlock(dest);
- if (error == -EWOULDBLOCK)
- goto retry;
- return error;
- }
-
- return 0;
-}
-
-/* Unlock both inodes after they've been prepped for a range clone. */
-void
-xfs_reflink_remap_unlock(
- struct file *file_in,
- struct file *file_out)
-{
- struct inode *inode_in = file_inode(file_in);
- struct xfs_inode *src = XFS_I(inode_in);
- struct inode *inode_out = file_inode(file_out);
- struct xfs_inode *dest = XFS_I(inode_out);
- bool same_inode = (inode_in == inode_out);
-
- xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
- if (!same_inode)
- xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
- inode_unlock(inode_out);
- if (!same_inode)
- inode_unlock(inode_in);
-}
-
-/*
* If we're reflinking to a point past the destination file's EOF, we must
* zero any speculative post-EOF preallocations that sit between the old EOF
* and the destination file offset.
@@ -1314,18 +1291,12 @@ xfs_reflink_remap_prep(
struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
- bool same_inode = (inode_in == inode_out);
- ssize_t ret;
+ int ret;
/* Lock both files against IO */
- ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
+ ret = xfs_ilock2_io_mmap(src, dest);
if (ret)
return ret;
- if (same_inode)
- xfs_ilock(src, XFS_MMAPLOCK_EXCL);
- else
- xfs_lock_two_inodes(src, XFS_MMAPLOCK_EXCL, dest,
- XFS_MMAPLOCK_EXCL);
/* Check file eligibility and prepare for block sharing. */
ret = -EINVAL;
@@ -1339,7 +1310,7 @@ xfs_reflink_remap_prep(
ret = generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
len, remap_flags);
- if (ret < 0 || *len == 0)
+ if (ret || *len == 0)
goto out_unlock;
/* Attach dquots to dest inode before changing block map */
@@ -1374,9 +1345,9 @@ xfs_reflink_remap_prep(
if (ret)
goto out_unlock;
- return 1;
+ return 0;
out_unlock:
- xfs_reflink_remap_unlock(file_in, file_out);
+ xfs_iunlock2_io_mmap(src, dest);
return ret;
}
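
For context, the xfs_reflink_remap_blocks()/xfs_reflink_remap_extent() path
reworked above is what services remap_file_range(), i.e. the FICLONE and
FICLONERANGE ioctls used by tools such as cp --reflink. A minimal userspace
sketch that exercises it (illustrative only; the paths and length are
placeholders, and the length must be block-aligned unless it runs to EOF):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>   /* FICLONERANGE, struct file_clone_range */

    int main(void)
    {
        struct file_clone_range fcr = {
            .src_offset  = 0,
            .src_length  = 1024 * 1024,   /* share the first 1 MiB */
            .dest_offset = 0,
        };
        /* Hypothetical files on the same XFS filesystem. */
        int src = open("/mnt/xfs/src", O_RDONLY);
        int dst = open("/mnt/xfs/dst", O_RDWR | O_CREAT, 0644);

        if (src < 0 || dst < 0)
            return 1;
        fcr.src_fd = src;
        /*
         * XFS handles this in xfs_reflink_remap_prep() and then
         * xfs_reflink_remap_blocks(), which with this series calls
         * xfs_reflink_remap_extent() once per extent, each in its own
         * transaction.
         */
        return ioctl(dst, FICLONERANGE, &fcr) ? 1 : 0;
    }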