diff options
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 379 |
1 files changed, 241 insertions, 138 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 31ac8c682f19..d04c82c88418 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5,6 +5,7 @@ #include <linux/kernel.h> #include <linux/bio.h> +#include <linux/buffer_head.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -57,9 +58,9 @@ struct btrfs_iget_args { struct btrfs_dio_data { u64 reserve; - loff_t length; - ssize_t submitted; - struct extent_changeset *data_reserved; + u64 unsubmitted_oe_range_start; + u64 unsubmitted_oe_range_end; + int overwrite; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -4810,7 +4811,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) truncate_setsize(inode, newsize); + /* Disable nonlocked read DIO to avoid the endless truncate */ + btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); + btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); ret = btrfs_truncate(inode, newsize == oldsize); if (ret && inode->i_nlink) { @@ -7041,7 +7045,7 @@ out: } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, bool writing) + struct extent_state **cached_state, int writing) { struct btrfs_ordered_extent *ordered; int ret = 0; @@ -7179,7 +7183,30 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, } +static int btrfs_get_blocks_direct_read(struct extent_map *em, + struct buffer_head *bh_result, + struct inode *inode, + u64 start, u64 len) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + + if (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + return -ENOENT; + + len = min(len, em->len - (start - em->start)); + + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = fs_info->fs_devices->latest_bdev; + set_buffer_mapped(bh_result); + + return 0; +} + static int btrfs_get_blocks_direct_write(struct extent_map **map, + struct buffer_head *bh_result, struct inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 len) @@ -7241,6 +7268,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, } /* this will cow the extent */ + len = bh_result->b_size; free_extent_map(em); *map = em = btrfs_new_extent_direct(inode, start, len); if (IS_ERR(em)) { @@ -7251,73 +7279,64 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); skip_cow: + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = fs_info->fs_devices->latest_bdev; + set_buffer_mapped(bh_result); + + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. */ - if (start + len > i_size_read(inode)) + if (!dio_data->overwrite && start + len > i_size_read(inode)) i_size_write(inode, start + len); + WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; + dio_data->unsubmitted_oe_range_end = start + len; + current->journal_info = dio_data; out: return ret; } -static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, - loff_t length, unsigned flags, struct iomap *iomap, - struct iomap *srcmap) +static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = NULL; + u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; - const bool write = !!(flags & IOMAP_WRITE); + u64 len = bh_result->b_size; int ret = 0; - u64 len = length; - bool unlock_extents = false; - if (!write) + if (!create) len = min_t(u64, len, fs_info->sectorsize); lockstart = start; lockend = start + len - 1; - /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so we - * need to flush the dirty pages again to make absolutely sure that any - * outstanding dirty pages are on disk. - */ - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - ret = filemap_fdatawrite_range(inode->i_mapping, start, - start + length - 1); - - dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); - if (!dio_data) - return -ENOMEM; - - dio_data->length = length; - if (write) { - dio_data->reserve = round_up(length, fs_info->sectorsize); - ret = btrfs_delalloc_reserve_space(inode, - &dio_data->data_reserved, - start, dio_data->reserve); - if (ret) { - extent_changeset_free(dio_data->data_reserved); - kfree(dio_data); - return ret; - } + if (current->journal_info) { + /* + * Need to pull our outstanding extents and set journal_info to NULL so + * that anything that needs to check if there's a transaction doesn't get + * confused. + */ + dio_data = current->journal_info; + current->journal_info = NULL; } - iomap->private = dio_data; - /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. */ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, + create)) { ret = -ENOTBLK; goto err; } @@ -7349,47 +7368,35 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, goto unlock_err; } - len = min(len, em->len - (start - em->start)); - if (write) { - ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, - start, len); + if (create) { + ret = btrfs_get_blocks_direct_write(&em, bh_result, inode, + dio_data, start, len); if (ret < 0) goto unlock_err; - unlock_extents = true; - /* Recalc len in case the new em is smaller than requested */ - len = min(len, em->len - (start - em->start)); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); } else { + ret = btrfs_get_blocks_direct_read(em, bh_result, inode, + start, len); + /* Can be negative only if we read from a hole */ + if (ret < 0) { + ret = 0; + free_extent_map(em); + goto unlock_err; + } /* * We need to unlock only the end area that we aren't using. * The rest is going to be unlocked by the endio routine. */ - lockstart = start + len; - if (lockstart < lockend) - unlock_extents = true; - } - - if (unlock_extents) - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); - else - free_extent_state(cached_state); - - /* - * Translate extent map information to iomap. - * We trim the extents (and move the addr) even though iomap code does - * that, since we have locked only the parts we are performing I/O in. - */ - if ((em->block_start == EXTENT_MAP_HOLE) || - (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { - iomap->addr = IOMAP_NULL_ADDR; - iomap->type = IOMAP_HOLE; - } else { - iomap->addr = em->block_start + (start - em->start); - iomap->type = IOMAP_MAPPED; + lockstart = start + bh_result->b_size; + if (lockstart < lockend) { + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state); + } else { + free_extent_state(cached_state); + } } - iomap->offset = start; - iomap->bdev = fs_info->fs_devices->latest_bdev; - iomap->length = len; free_extent_map(em); @@ -7399,53 +7406,8 @@ unlock_err: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) { - btrfs_delalloc_release_space(inode, dio_data->data_reserved, - start, dio_data->reserve, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); - extent_changeset_free(dio_data->data_reserved); - kfree(dio_data); - } - return ret; -} - -static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, - ssize_t written, unsigned flags, struct iomap *iomap) -{ - int ret = 0; - struct btrfs_dio_data *dio_data = iomap->private; - size_t submitted = dio_data->submitted; - const bool write = !!(flags & IOMAP_WRITE); - - if (!write && (iomap->type == IOMAP_HOLE)) { - /* If reading from a hole, unlock and return */ - unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); - goto out; - } - - if (submitted < length) { - pos += submitted; - length -= submitted; - if (write) - __endio_write_update_ordered(inode, pos, length, false); - else - unlock_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1); - ret = -ENOTBLK; - } - - if (write) { - if (dio_data->reserve) - btrfs_delalloc_release_space(inode, - dio_data->data_reserved, pos, - dio_data->reserve, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); - extent_changeset_free(dio_data->data_reserved); - } -out: - kfree(dio_data); - iomap->private = NULL; - + if (dio_data) + current->journal_info = dio_data; return ret; } @@ -7468,7 +7430,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) dip->logical_offset + dip->bytes - 1); } - bio_endio(dip->dio_bio); + dio_end_io(dip->dio_bio); kfree(dip); } @@ -7704,11 +7666,24 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; refcount_set(&dip->refs, 1); + + if (write) { + struct btrfs_dio_data *dio_data = current->journal_info; + + /* + * Setting range start and end to the same value means that + * no cleanup will happen in btrfs_direct_IO + */ + dio_data->unsubmitted_oe_range_end = dip->logical_offset + + dip->bytes; + dio_data->unsubmitted_oe_range_start = + dio_data->unsubmitted_oe_range_end; + } return dip; } -static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, - struct bio *dio_bio, loff_t file_offset) +static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, + loff_t file_offset) { const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); @@ -7725,7 +7700,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, int ret; blk_status_t status; struct btrfs_io_geometry geom; - struct btrfs_dio_data *dio_data = iomap->private; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); if (!dip) { @@ -7734,8 +7708,8 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, file_offset + dio_bio->bi_iter.bi_size - 1); } dio_bio->bi_status = BLK_STS_RESOURCE; - bio_endio(dio_bio); - return BLK_QC_T_NONE; + dio_end_io(dio_bio); + return; } if (!write && csum) { @@ -7806,27 +7780,156 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, goto out_err; } - dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; } while (submit_len > 0); - return BLK_QC_T_NONE; + return; out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); - return BLK_QC_T_NONE; } -const struct iomap_ops btrfs_dio_iomap_ops = { - .iomap_begin = btrfs_dio_iomap_begin, - .iomap_end = btrfs_dio_iomap_end, -}; +static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, + const struct iov_iter *iter, loff_t offset) +{ + int seg; + int i; + unsigned int blocksize_mask = fs_info->sectorsize - 1; + ssize_t retval = -EINVAL; -const struct iomap_dio_ops btrfs_dops = { - .submit_io = btrfs_submit_direct, -}; + if (offset & blocksize_mask) + goto out; + + if (iov_iter_alignment(iter) & blocksize_mask) + goto out; + + /* If this is a write we don't need to check anymore */ + if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter)) + return 0; + /* + * Check to make sure we don't have duplicate iov_base's in this + * iovec, if so return EINVAL, otherwise we'll get csum errors + * when reading back. + */ + for (seg = 0; seg < iter->nr_segs; seg++) { + for (i = seg + 1; i < iter->nr_segs; i++) { + if (iter->iov[seg].iov_base == iter->iov[i].iov_base) + goto out; + } + } + retval = 0; +out: + return retval; +} + +static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_dio_data dio_data = { 0 }; + struct extent_changeset *data_reserved = NULL; + loff_t offset = iocb->ki_pos; + size_t count = 0; + int flags = 0; + bool wakeup = true; + bool relock = false; + ssize_t ret; + + if (check_direct_IO(fs_info, iter, offset)) + return 0; + + inode_dio_begin(inode); + + /* + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so + * we need to flush the dirty pages again to make absolutely sure + * that any outstanding dirty pages are on disk. + */ + count = iov_iter_count(iter); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_fdatawrite_range(inode->i_mapping, offset, + offset + count - 1); + + if (iov_iter_rw(iter) == WRITE) { + /* + * If the write DIO is beyond the EOF, we need update + * the isize, but it is protected by i_mutex. So we can + * not unlock the i_mutex at this case. + */ + if (offset + count <= inode->i_size) { + dio_data.overwrite = 1; + inode_unlock(inode); + relock = true; + } else if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; + } + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + offset, count); + if (ret) + goto out; + + /* + * We need to know how many extents we reserved so that we can + * do the accounting properly if we go over the number we + * originally calculated. Abuse current->journal_info for this. + */ + dio_data.reserve = round_up(count, + fs_info->sectorsize); + dio_data.unsubmitted_oe_range_start = (u64)offset; + dio_data.unsubmitted_oe_range_end = (u64)offset; + current->journal_info = &dio_data; + down_read(&BTRFS_I(inode)->dio_sem); + } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags)) { + inode_dio_end(inode); + flags = DIO_LOCKING | DIO_SKIP_HOLES; + wakeup = false; + } + + ret = __blockdev_direct_IO(iocb, inode, + fs_info->fs_devices->latest_bdev, + iter, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, flags); + if (iov_iter_rw(iter) == WRITE) { + up_read(&BTRFS_I(inode)->dio_sem); + current->journal_info = NULL; + if (ret < 0 && ret != -EIOCBQUEUED) { + if (dio_data.reserve) + btrfs_delalloc_release_space(inode, data_reserved, + offset, dio_data.reserve, true); + /* + * On error we might have left some ordered extents + * without submitting corresponding bios for them, so + * cleanup them up to avoid other tasks getting them + * and waiting for them to complete forever. + */ + if (dio_data.unsubmitted_oe_range_start < + dio_data.unsubmitted_oe_range_end) + __endio_write_update_ordered(inode, + dio_data.unsubmitted_oe_range_start, + dio_data.unsubmitted_oe_range_end - + dio_data.unsubmitted_oe_range_start, + false); + } else if (ret >= 0 && (size_t)ret < count) + btrfs_delalloc_release_space(inode, data_reserved, + offset, count - (size_t)ret, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), count); + } +out: + if (wakeup) + inode_dio_end(inode); + if (relock) + inode_lock(inode); + + extent_changeset_free(data_reserved); + return ret; +} static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) @@ -10122,7 +10225,7 @@ static const struct address_space_operations btrfs_aops = { .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readahead = btrfs_readahead, - .direct_IO = noop_direct_IO, + .direct_IO = btrfs_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION |