diff options
author | Josef Bacik <josef@toxicpanda.com> | 2020-09-03 11:16:51 -0400 |
---|---|---|
committer | David Sterba <dsterba@suse.com> | 2020-10-07 12:06:57 +0200 |
commit | 0eb79294dbe328debae67961df28113141825d7b (patch) | |
tree | 07e0dc9ba3d74ab5e418a2b313265be2cce0af60 | |
parent | f85781fb505ec02891734e77f7c69a9c85c99ec3 (diff) | |
download | linux-0eb79294dbe328debae67961df28113141825d7b.tar.bz2 |
btrfs: dio iomap DSYNC workaround
iomap dio will run generic_write_sync() for us if the iocb is DSYNC.
This is problematic for us because of 2 reasons:
1. we hold the inode_lock() during this operation, and we take it in
generic_write_sync()
2. we hold a read lock on the dio_sem but take the write lock in fsync
Since we don't want to rip out this code right now, and reworking the
locking is a bit much to do at this point, work around this problem with
this masterpiece of a patch.
First, we clear DSYNC on the iocb so that the iomap stuff doesn't know
that it needs to handle the sync. We save this fact in
current->journal_info, because we need to do special things once
we're in iomap_begin, and we have no way to pass private information
into iomap_dio_rw().
Next we specify a separate iomap_dio_ops for sync, which implements an
->end_io() callback that gets called when the dio completes. This is
important for AIO, because we really do need to run generic_write_sync()
if we complete asynchronously. However if we're still in the submitting
context when we enter ->end_io() we clear the flag so that the submitter
knows they're the one that needs to run generic_write_sync().
This is meant to be temporary. We need to work out how to eliminate the
inode_lock() and the dio_sem in our fsync and use another mechanism to
protect these operations.
Tested-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
-rw-r--r-- | fs/btrfs/file.c | 33 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 62 | ||||
-rw-r--r-- | fs/btrfs/transaction.h | 1 |
3 files changed, 94 insertions, 2 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b62679382799..4bf15846efdc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2023,7 +2023,40 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) { + /* + * 1. We must always clear IOCB_DSYNC in order to not deadlock + * in iomap, as it calls generic_write_sync() in this case. + * 2. If we are async, we can call iomap_dio_complete() either + * in + * + * 2.1. A worker thread from the last bio completed. In this + * case we need to mark the btrfs_dio_data that it is + * async in order to call generic_write_sync() properly. + * This is handled by setting BTRFS_DIO_SYNC_STUB in the + * current->journal_info. + * 2.2 The submitter context, because all IO completed + * before we exited iomap_dio_rw(). In this case we can + * just re-set the IOCB_DSYNC on the iocb and we'll do + * the sync below. If our ->end_io() gets called and + * current->journal_info is set, then we know we're in + * our current context and we will clear + * current->journal_info to indicate that we need to + * sync below. + */ + if (sync) { + ASSERT(current->journal_info == NULL); + iocb->ki_flags &= ~IOCB_DSYNC; + current->journal_info = BTRFS_DIO_SYNC_STUB; + } num_written = __btrfs_direct_write(iocb, from); + + /* + * As stated above, we cleared journal_info, so we need to do + * the sync ourselves. 
+ */ + if (sync && current->journal_info == NULL) + iocb->ki_flags |= IOCB_DSYNC; + current->journal_info = NULL; } else { num_written = btrfs_buffered_write(iocb, from); if (num_written > 0) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4f0b1dbd3240..366d5b768f86 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -62,6 +62,7 @@ struct btrfs_dio_data { loff_t length; ssize_t submitted; struct extent_changeset *data_reserved; + bool sync; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -7337,6 +7338,17 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, int ret = 0; u64 len = length; bool unlock_extents = false; + bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB); + + /* + * We used current->journal_info here to see if we were sync, but + * there's a lot of tests in the enospc machinery to not do flushing if + * we have a journal_info set, so we need to clear this out and re-set + * it in iomap_end. + */ + ASSERT(current->journal_info == NULL || + current->journal_info == BTRFS_DIO_SYNC_STUB); + current->journal_info = NULL; if (!write) len = min_t(u64, len, fs_info->sectorsize); @@ -7362,6 +7374,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (!dio_data) return -ENOMEM; + dio_data->sync = sync; dio_data->length = length; if (write) { dio_data->reserve = round_up(length, fs_info->sectorsize); @@ -7509,6 +7522,14 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, extent_changeset_free(dio_data->data_reserved); } out: + /* + * We're all done, we can re-set the current->journal_info now safely + * for our endio. 
+ */ + if (dio_data->sync) { + ASSERT(current->journal_info == NULL); + current->journal_info = BTRFS_DIO_SYNC_STUB; + } kfree(dio_data); iomap->private = NULL; @@ -7917,6 +7938,30 @@ out: return retval; } +static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned flags) +{ + /* + * Now if we're still in the context of our submitter we know we can't + * safely run generic_write_sync(), so clear our flag here so that the + * caller knows to follow up with a sync. + */ + if (current->journal_info == BTRFS_DIO_SYNC_STUB) { + current->journal_info = NULL; + return error; + } + + if (error) + return error; + + if (size) { + iocb->ki_flags |= IOCB_DSYNC; + return generic_write_sync(iocb, size); + } + + return 0; +} + static const struct iomap_ops btrfs_dio_iomap_ops = { .iomap_begin = btrfs_dio_iomap_begin, .iomap_end = btrfs_dio_iomap_end, @@ -7926,6 +7971,11 @@ static const struct iomap_dio_ops btrfs_dio_ops = { .submit_io = btrfs_submit_direct, }; +static const struct iomap_dio_ops btrfs_sync_dops = { + .submit_io = btrfs_submit_direct, + .end_io = btrfs_maybe_fsync_end_io, +}; + ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; @@ -7954,8 +8004,16 @@ ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) down_read(&BTRFS_I(inode)->dio_sem); } - ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - is_sync_kiocb(iocb)); + /* + * We have are actually a sync iocb, so we need our fancy endio to know + * if we need to sync. 
+ */ + if (current->journal_info) + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_sync_dops, is_sync_kiocb(iocb)); + else + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_dio_ops, is_sync_kiocb(iocb)); if (ret == -ENOTBLK) ret = 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 8241c050ba71..858d9153a1cd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -112,6 +112,7 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) #define BTRFS_SEND_TRANS_STUB ((void *)1) +#define BTRFS_DIO_SYNC_STUB ((void *)2) struct btrfs_trans_handle { u64 transid; |