From dc3ffbb14060c943469d5e12900db3a60bc3fa64 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 20 May 2020 13:17:10 -0700 Subject: xfs: gut error handling in xfs_trans_unreserve_and_mod_sb() xfs: gut error handling in xfs_trans_unreserve_and_mod_sb() From: Dave Chinner The error handling in xfs_trans_unreserve_and_mod_sb() is largely incorrect - rolling back the changes in the transaction if only one counter underruns makes all the other counters incorrect. We still allow the change to proceed and committing the transaction, except now we have multiple incorrect counters instead of a single underflow. Further, we don't actually report the error to the caller, so this is completely silent except on debug kernels that will assert on failure before we even get to the rollback code. Hence this error handling is broken, untested, and largely unnecessary complexity. Just remove it. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_trans.c | 163 +++++++---------------------------------------------- 1 file changed, 20 insertions(+), 143 deletions(-) (limited to 'fs/xfs/xfs_trans.c') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 28b983ff8b11..e4e29135ad1b 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -534,57 +534,9 @@ xfs_trans_apply_sb_deltas( sizeof(sbp->sb_frextents) - 1); } -STATIC int -xfs_sb_mod8( - uint8_t *field, - int8_t delta) -{ - int8_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - -STATIC int -xfs_sb_mod32( - uint32_t *field, - int32_t delta) -{ - int32_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - -STATIC int -xfs_sb_mod64( - uint64_t *field, - int64_t delta) -{ - int64_t counter = *field; - - counter += delta; - if (counter < 0) { - ASSERT(0); - return -EINVAL; - } - *field = counter; - return 0; -} - /* - * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations - * and apply superblock counter changes to the in-core superblock. The + * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations and + * apply superblock counter changes to the in-core superblock. The * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT * applied to the in-core superblock. The idea is that that has already been * done. @@ -629,20 +581,17 @@ xfs_trans_unreserve_and_mod_sb( /* apply the per-cpu counters */ if (blkdelta) { error = xfs_mod_fdblocks(mp, blkdelta, rsvd); - if (error) - goto out; + ASSERT(!error); } if (idelta) { error = xfs_mod_icount(mp, idelta); - if (error) - goto out_undo_fdblocks; + ASSERT(!error); } if (ifreedelta) { error = xfs_mod_ifree(mp, ifreedelta); - if (error) - goto out_undo_icount; + ASSERT(!error); } if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) @@ -650,95 +599,23 @@ xfs_trans_unreserve_and_mod_sb( /* apply remaining deltas */ spin_lock(&mp->m_sb_lock); - if (rtxdelta) { - error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta); - if (error) - goto out_undo_ifree; - } - - if (tp->t_dblocks_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta); - if (error) - goto out_undo_frextents; - } - if (tp->t_agcount_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta); - if (error) - goto out_undo_dblocks; - } - if (tp->t_imaxpct_delta != 0) { - error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta); - if (error) - goto out_undo_agcount; - } - if (tp->t_rextsize_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_rextsize, - tp->t_rextsize_delta); - if (error) - goto out_undo_imaxpct; - } - if (tp->t_rbmblocks_delta != 0) { - error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, - tp->t_rbmblocks_delta); - if (error) - goto out_undo_rextsize; - } - if (tp->t_rblocks_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta); - if (error) - goto out_undo_rbmblocks; - } - if (tp->t_rextents_delta != 0) { - error = xfs_sb_mod64(&mp->m_sb.sb_rextents, - tp->t_rextents_delta); - if (error) - goto out_undo_rblocks; - } - if (tp->t_rextslog_delta != 0) { - error = xfs_sb_mod8(&mp->m_sb.sb_rextslog, - tp->t_rextslog_delta); - if (error) - goto out_undo_rextents; - } + mp->m_sb.sb_frextents += rtxdelta; + mp->m_sb.sb_dblocks += tp->t_dblocks_delta; + mp->m_sb.sb_agcount += tp->t_agcount_delta; + mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta; + mp->m_sb.sb_rextsize += tp->t_rextsize_delta; + mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta; + mp->m_sb.sb_rblocks += tp->t_rblocks_delta; + mp->m_sb.sb_rextents += tp->t_rextents_delta; + mp->m_sb.sb_rextslog += tp->t_rextslog_delta; spin_unlock(&mp->m_sb_lock); - return; -out_undo_rextents: - if (tp->t_rextents_delta) - xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta); -out_undo_rblocks: - if (tp->t_rblocks_delta) - xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta); -out_undo_rbmblocks: - if (tp->t_rbmblocks_delta) - xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta); -out_undo_rextsize: - if (tp->t_rextsize_delta) - xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta); -out_undo_imaxpct: - if (tp->t_rextsize_delta) - xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta); -out_undo_agcount: - if (tp->t_agcount_delta) - xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta); -out_undo_dblocks: - if (tp->t_dblocks_delta) - xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta); -out_undo_frextents: - if (rtxdelta) - xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta); -out_undo_ifree: - spin_unlock(&mp->m_sb_lock); - if (ifreedelta) - xfs_mod_ifree(mp, -ifreedelta); -out_undo_icount: - if (idelta) - xfs_mod_icount(mp, -idelta); -out_undo_fdblocks: - if (blkdelta) - xfs_mod_fdblocks(mp, -blkdelta, rsvd); -out: - ASSERT(error == 0); + /* + * Debug checks outside of the spinlock so they don't lock up the + * machine if they fail. + */ + ASSERT(mp->m_sb.sb_imax_pct >= 0); + ASSERT(mp->m_sb.sb_rextslog >= 0); return; } -- cgit v1.2.3 From f18c9a9030972d892a244968c653aceb98e27c70 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 20 May 2020 13:17:11 -0700 Subject: xfs: reduce free inode accounting overhead Shaokun Zhang reported that XFS was using substantial CPU time in percpu_count_sum() when running a single threaded benchmark on a high CPU count (128p) machine from xfs_mod_ifree(). The issue is that the filesystem is empty when the benchmark runs, so inode allocation is running with a very low inode free count. With the percpu counter batching, this means comparisons when the counter is less that 128 * 256 = 32768 use the slow path of adding up all the counters across the CPUs, and this is expensive on high CPU count machines. The summing in xfs_mod_ifree() is only used to fire an assert if an underrun occurs. The error is ignored by the higher level code. Hence this is really just debug code and we don't need to run it on production kernels, nor do we need such debug checks to return error values just to trigger an assert. Finally, xfs_mod_icount/xfs_mod_ifree are only called from xfs_trans_unreserve_and_mod_sb(), so get rid of them and just directly call the percpu_counter_add/percpu_counter_compare functions. The compare functions are now run only on debug builds as they are internal to ASSERT() checks and so only compiled in when ASSERTs are active (CONFIG_XFS_DEBUG=y or CONFIG_XFS_WARN=y). Reported-by: Shaokun Zhang Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_mount.c | 33 --------------------------------- fs/xfs/xfs_mount.h | 2 -- fs/xfs/xfs_trans.c | 17 +++++++++++++---- 3 files changed, 13 insertions(+), 39 deletions(-) (limited to 'fs/xfs/xfs_trans.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bb91f04266b9..d5dcf9869860 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1189,39 +1189,6 @@ xfs_log_sbcount(xfs_mount_t *mp) return xfs_sync_sb(mp, true); } -/* - * Deltas for the inode count are +/-64, hence we use a large batch size - * of 128 so we don't need to take the counter lock on every update. - */ -#define XFS_ICOUNT_BATCH 128 -int -xfs_mod_icount( - struct xfs_mount *mp, - int64_t delta) -{ - percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH); - if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) { - ASSERT(0); - percpu_counter_add(&mp->m_icount, -delta); - return -EINVAL; - } - return 0; -} - -int -xfs_mod_ifree( - struct xfs_mount *mp, - int64_t delta) -{ - percpu_counter_add(&mp->m_ifree, delta); - if (percpu_counter_compare(&mp->m_ifree, 0) < 0) { - ASSERT(0); - percpu_counter_add(&mp->m_ifree, -delta); - return -EINVAL; - } - return 0; -} - /* * Deltas for the block count can vary from 1 to very large, but lock contention * only occurs on frequent small block count updates such as in the delayed diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index aba5a1579279..4835581f3eb0 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -392,8 +392,6 @@ extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, xfs_agnumber_t *maxagi); extern void xfs_unmountfs(xfs_mount_t *); -extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta); -extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta); extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index e4e29135ad1b..2222a0ed3155 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -545,7 +545,12 @@ xfs_trans_apply_sb_deltas( * used block counts are not updated in the on disk superblock. In this case, * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we * still need to update the incore superblock with the changes. + * + * Deltas for the inode count are +/-64, hence we use a large batch size of 128 + * so we don't need to take the counter lock on every update. */ +#define XFS_ICOUNT_BATCH 128 + void xfs_trans_unreserve_and_mod_sb( struct xfs_trans *tp) @@ -585,13 +590,17 @@ xfs_trans_unreserve_and_mod_sb( } if (idelta) { - error = xfs_mod_icount(mp, idelta); - ASSERT(!error); + percpu_counter_add_batch(&mp->m_icount, idelta, + XFS_ICOUNT_BATCH); + if (idelta < 0) + ASSERT(__percpu_counter_compare(&mp->m_icount, 0, + XFS_ICOUNT_BATCH) >= 0); } if (ifreedelta) { - error = xfs_mod_ifree(mp, ifreedelta); - ASSERT(!error); + percpu_counter_add(&mp->m_ifree, ifreedelta); + if (ifreedelta < 0) + ASSERT(percpu_counter_compare(&mp->m_ifree, 0) >= 0); } if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) -- cgit v1.2.3 From b41b46c20c0bd32cd0a3795fcd2b892213cb6f5e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 20 May 2020 13:17:11 -0700 Subject: xfs: remove the m_active_trans counter It's a global atomic counter, and we are hitting it at a rate of half a million transactions a second, so it's bouncing the counter cacheline all over the place on large machines. We don't actually need it anymore - it used to be required because the VFS freeze code could not track/prevent filesystem transactions that were running, but that problem no longer exists. Hence to remove the counter, we simply have to ensure that nothing calls xfs_sync_sb() while we are trying to quiesce the filesytem. That only happens if the log worker is still running when we call xfs_quiesce_attr(). The log worker is cancelled at the end of xfs_quiesce_attr() by calling xfs_log_quiesce(), so just call it early here and then we can remove the counter altogether. Concurrent create, 50 million inodes, identical 16p/16GB virtual machines on different physical hosts. Machine A has twice the CPU cores per socket of machine B: unpatched patched machine A: 3m16s 2m00s machine B: 4m04s 4m05s Create rates: unpatched patched machine A: 282k+/-31k 468k+/-21k machine B: 231k+/-8k 233k+/-11k Concurrent rm of same 50 million inodes: unpatched patched machine A: 6m42s 2m33s machine B: 4m47s 4m47s The transaction rate on the fast machine went from just under 300k/sec to 700k/sec, which indicates just how much of a bottleneck this atomic counter was. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_mount.h | 1 - fs/xfs/xfs_super.c | 17 +++++------------ fs/xfs/xfs_trans.c | 27 +++++++++++---------------- 3 files changed, 16 insertions(+), 29 deletions(-) (limited to 'fs/xfs/xfs_trans.c') diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index c1f92c1847bb..3725d25ad97e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -176,7 +176,6 @@ typedef struct xfs_mount { uint64_t m_resblks; /* total reserved blocks */ uint64_t m_resblks_avail;/* available reserved blocks */ uint64_t m_resblks_save; /* reserved blks @ remount,ro */ - atomic_t m_active_trans; /* number trans frozen */ struct delayed_work m_reclaim_work; /* background inode reclaim */ struct delayed_work m_eofblocks_work; /* background eof blocks trimming */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index aae469f73efe..fa58cb07c8fd 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -874,8 +874,10 @@ xfs_restore_resvblks(struct xfs_mount *mp) * there is no log replay required to write the inodes to disk - this is the * primary difference between a sync and a quiesce. * - * Note: xfs_log_quiesce() stops background log work - the callers must ensure - * it is started again when appropriate. + * We cancel log work early here to ensure all transactions the log worker may + * run have finished before we clean up and log the superblock and write an + * unmount record. The unfreeze process is responsible for restarting the log + * worker correctly. */ void xfs_quiesce_attr( @@ -883,9 +885,7 @@ xfs_quiesce_attr( { int error = 0; - /* wait for all modifications to complete */ - while (atomic_read(&mp->m_active_trans) > 0) - delay(100); + cancel_delayed_work_sync(&mp->m_log->l_work); /* force the log to unpin objects from the now complete transactions */ xfs_log_force(mp, XFS_LOG_SYNC); @@ -899,12 +899,6 @@ xfs_quiesce_attr( if (error) xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " "Frozen image may not be consistent."); - /* - * Just warn here till VFS can correctly support - * read-only remount without racing. - */ - WARN_ON(atomic_read(&mp->m_active_trans) != 0); - xfs_log_quiesce(mp); } @@ -1793,7 +1787,6 @@ static int xfs_init_fs_context( INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); spin_lock_init(&mp->m_perag_lock); mutex_init(&mp->m_growlock); - atomic_set(&mp->m_active_trans, 0); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 2222a0ed3155..3c94e5ff4316 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -68,7 +68,6 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); trace_xfs_trans_free(tp, _RET_IP_); - atomic_dec(&tp->t_mountp->m_active_trans); if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); @@ -125,8 +124,6 @@ xfs_trans_dup( xfs_defer_move(ntp, tp); xfs_trans_dup_dqinfo(tp, ntp); - - atomic_inc(&tp->t_mountp->m_active_trans); return ntp; } @@ -275,7 +272,6 @@ xfs_trans_alloc( */ WARN_ON(resp->tr_logres > 0 && mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); - atomic_inc(&mp->m_active_trans); tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; @@ -299,20 +295,19 @@ xfs_trans_alloc( /* * Create an empty transaction with no reservation. This is a defensive - * mechanism for routines that query metadata without actually modifying - * them -- if the metadata being queried is somehow cross-linked (think a - * btree block pointer that points higher in the tree), we risk deadlock. - * However, blocks grabbed as part of a transaction can be re-grabbed. - * The verifiers will notice the corrupt block and the operation will fail - * back to userspace without deadlocking. + * mechanism for routines that query metadata without actually modifying them -- + * if the metadata being queried is somehow cross-linked (think a btree block + * pointer that points higher in the tree), we risk deadlock. However, blocks + * grabbed as part of a transaction can be re-grabbed. The verifiers will + * notice the corrupt block and the operation will fail back to userspace + * without deadlocking. * - * Note the zero-length reservation; this transaction MUST be cancelled - * without any dirty data. + * Note the zero-length reservation; this transaction MUST be cancelled without + * any dirty data. * - * Callers should obtain freeze protection to avoid two conflicts with fs - * freezing: (1) having active transactions trip the m_active_trans ASSERTs; - * and (2) grabbing buffers at the same time that freeze is trying to drain - * the buffer LRU list. + * Callers should obtain freeze protection to avoid a conflict with fs freezing + * where we can be grabbing buffers at the same time that freeze is trying to + * drain the buffer LRU list. */ int xfs_trans_alloc_empty( -- cgit v1.2.3