Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 839
1 file changed, 446 insertions, 393 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 05e0c4a5affd..1372210869b1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -114,21 +114,17 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); -static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock); + unsigned long *nr_written, int unlock, + u64 *done_offset); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int compress_type, int type); -static void __endio_write_update_ordered(struct btrfs_inode *inode, - const u64 offset, const u64 bytes, - const bool uptodate); - /* * btrfs_inode_lock - lock inode i_rwsem based on arguments passed * @@ -195,11 +191,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = page_offset(locked_page); - u64 page_end = page_start + PAGE_SIZE - 1; - + u64 page_start, page_end; struct page *page; + if (locked_page) { + page_start = page_offset(locked_page); + page_end = page_start + PAGE_SIZE - 1; + } + while (index <= end_index) { /* * For locked page, we will call end_extent_writepage() on it @@ -212,7 +211,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * btrfs_mark_ordered_io_finished() would skip the accounting * for the page range, and the ordered extent will never finish. */ - if (index == (page_offset(locked_page) >> PAGE_SHIFT)) { + if (locked_page && index == (page_start >> PAGE_SHIFT)) { index++; continue; } @@ -223,7 +222,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, /* * Here we just clear all Ordered bits for every page in the - * range, then __endio_write_update_ordered() will handle + * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. 
*/ btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, @@ -231,20 +230,23 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, put_page(page); } - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE) - return; - /* - * In case this page belongs to the delalloc range being instantiated - * then skip it, since the first page of a range is going to be - * properly cleaned up by the caller of run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; - offset = page_offset(locked_page) + PAGE_SIZE; + if (locked_page) { + /* The locked page covers the full range, nothing needs to be done */ + if (bytes + offset <= page_start + PAGE_SIZE) + return; + /* + * In case this page belongs to the delalloc range being + * instantiated then skip it, since the first page of a range is + * going to be properly cleaned up by the caller of + * run_delalloc_range + */ + if (page_start >= offset && page_end <= (offset + bytes - 1)) { + bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; + offset = page_offset(locked_page) + PAGE_SIZE; + } } - return __endio_write_update_ordered(inode, offset, bytes, false); + return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } static int btrfs_dirty_inode(struct inode *inode); @@ -332,9 +334,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); - kaddr = kmap_atomic(cpage); + kaddr = kmap_local_page(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); i++; ptr += cur_size; @@ -345,9 +347,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, } else { page = find_get_page(inode->vfs_inode.i_mapping, 0); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); write_extent_buffer(leaf, kaddr, ptr, size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); put_page(page); } btrfs_mark_buffer_dirty(leaf); @@ -485,7 +487,7 @@ struct async_chunk { struct page *locked_page; u64 start; u64 end; - unsigned int write_flags; + blk_opf_t write_flags; struct list_head extents; struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; @@ -560,8 +562,8 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, * will unlock the full page. */ if (fs_info->sectorsize < PAGE_SIZE) { - if (!IS_ALIGNED(start, PAGE_SIZE) || - !IS_ALIGNED(end + 1, PAGE_SIZE)) + if (!PAGE_ALIGNED(start) || + !PAGE_ALIGNED(end + 1)) return 0; } @@ -678,8 +680,8 @@ again: * Thus we must also check against @actual_end, not just @end. */ if (blocksize < PAGE_SIZE) { - if (!IS_ALIGNED(start, PAGE_SIZE) || - !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE)) + if (!PAGE_ALIGNED(start) || + !PAGE_ALIGNED(round_up(actual_end, blocksize))) goto cleanup_and_bail_uncompressed; } @@ -920,15 +922,25 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, * can directly submit them without interruption. 
*/ ret = cow_file_range(inode, locked_page, start, end, &page_started, - &nr_written, 0); + &nr_written, 0, NULL); /* Inline extent inserted, page gets unlocked and everything is done */ if (page_started) { ret = 0; goto out; } if (ret < 0) { - if (locked_page) + btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); + if (locked_page) { + const u64 page_start = page_offset(locked_page); + const u64 page_end = page_start + PAGE_SIZE - 1; + + btrfs_page_set_error(inode->root->fs_info, locked_page, + page_start, PAGE_SIZE); + set_page_writeback(locked_page); + end_page_writeback(locked_page); + end_extent_writepage(locked_page, ret, page_start, page_end); unlock_page(locked_page); + } goto out; } @@ -1133,15 +1145,39 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * *page_started is set to one if we unlock locked_page and do everything * required to start IO on it. It may be clean and already done with * IO when we return. + * + * When unlock == 1, we unlock the pages in successfully allocated regions. + * When unlock == 0, we leave them locked for writing them out. + * + * However, we unlock all the pages except @locked_page in case of failure. + * + * In summary, page locking state will be as follow: + * + * - page_started == 1 (return value) + * - All the pages are unlocked. IO is started. + * - Note that this can happen only on success + * - unlock == 1 + * - All the pages except @locked_page are unlocked in any case + * - unlock == 0 + * - On success, all the pages are locked for writing out them + * - On failure, all the pages except @locked_page are unlocked + * + * When a failure happens in the second or later iteration of the + * while-loop, the ordered extents created in previous iterations are kept + * intact. So, the caller must clean them up by calling + * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for + * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock) + unsigned long *nr_written, int unlock, + u64 *done_offset) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 alloc_hint = 0; + u64 orig_start = start; u64 num_bytes; unsigned long ram_size; u64 cur_alloc_size = 0; @@ -1329,18 +1365,62 @@ out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: + /* + * If done_offset is non-NULL and ret == -EAGAIN, we expect the + * caller to write out the successfully allocated region and retry. + */ + if (done_offset && ret == -EAGAIN) { + if (orig_start < start) + *done_offset = start - 1; + else + *done_offset = start; + return ret; + } else if (ret == -EAGAIN) { + /* Convert to -ENOSPC since the caller cannot retry. */ + ret = -ENOSPC; + } + + /* + * Now, we have three regions to clean up: + * + * |-------(1)----|---(2)---|-------------(3)----------| + * `- orig_start `- start `- start + cur_alloc_size `- end + * + * We process each region below. 
+ */ + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + /* - * If we reserved an extent for our delalloc range (or a subrange) and - * failed to create the respective ordered extent, then it means that - * when we reserved the extent we decremented the extent's size from - * the data space_info's bytes_may_use counter and incremented the - * space_info's bytes_reserved counter by the same amount. We must make - * sure extent_clear_unlock_delalloc() does not try to decrement again - * the data space_info's bytes_may_use counter, therefore we do not pass - * it the flag EXTENT_CLEAR_DATA_RESV. + * For the range (1). We have already instantiated the ordered extents + * for this region. They are cleaned up by + * btrfs_cleanup_ordered_extents() in e.g, + * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are + * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | + * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup + * function. + * + * However, in case of unlock == 0, we still need to unlock the pages + * (except @locked_page) to ensure all the pages are unlocked. + */ + if (!unlock && orig_start < start) { + if (!locked_page) + mapping_set_error(inode->vfs_inode.i_mapping, ret); + extent_clear_unlock_delalloc(inode, orig_start, start - 1, + locked_page, 0, page_ops); + } + + /* + * For the range (2). If we reserved an extent for our delalloc range + * (or a subrange) and failed to create the respective ordered extent, + * then it means that when we reserved the extent we decremented the + * extent's size from the data space_info's bytes_may_use counter and + * incremented the space_info's bytes_reserved counter by the same + * amount. We must make sure extent_clear_unlock_delalloc() does not try + * to decrement again the data space_info's bytes_may_use counter, + * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, @@ -1350,12 +1430,19 @@ out_unlock: page_ops); start += cur_alloc_size; if (start >= end) - goto out; + return ret; } + + /* + * For the range (3). We never touched the region. In addition to the + * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data + * space_info's bytes_may_use counter, reserved in + * btrfs_check_data_free_space(). 
+ */ extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits | EXTENT_CLEAR_DATA_RESV, page_ops); - goto out; + return ret; } /* @@ -1435,7 +1522,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, int i; bool should_compress; unsigned nofs_flag; - const unsigned int write_flags = wbc_to_write_flags(wbc); + const blk_opf_t write_flags = wbc_to_write_flags(wbc); unlock_extent(&inode->io_tree, start, end); @@ -1538,19 +1625,41 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, u64 end, int *page_started, unsigned long *nr_written) { + u64 done_offset = end; int ret; + bool locked_page_done = false; - ret = cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 0); - if (ret) - return ret; + while (start <= end) { + ret = cow_file_range(inode, locked_page, start, end, page_started, + nr_written, 0, &done_offset); + if (ret && ret != -EAGAIN) + return ret; - if (*page_started) - return 0; + if (*page_started) { + ASSERT(ret == 0); + return 0; + } + + if (ret == 0) + done_offset = end; + + if (done_offset == start) { + wait_on_bit_io(&inode->root->fs_info->flags, + BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + continue; + } + + if (!locked_page_done) { + __set_page_dirty_nobuffers(locked_page); + account_page_redirty(locked_page); + } + locked_page_done = true; + extent_write_locked_range(&inode->vfs_inode, start, done_offset); + + start = done_offset + 1; + } - __set_page_dirty_nobuffers(locked_page); - account_page_redirty(locked_page); - extent_write_locked_range(&inode->vfs_inode, start, end); *page_started = 1; return 0; @@ -1642,7 +1751,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, } return cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 1); + nr_written, 1, NULL); } struct can_nocow_file_extent_args { @@ -2115,7 +2224,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page page_started, nr_written); else ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + page_started, nr_written, 1, NULL); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); ret = cow_file_range_async(inode, wbc, locked_page, start, end, @@ -2131,6 +2240,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 size; /* not delalloc, ignore it */ @@ -2138,7 +2248,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, return; size = orig->end - orig->start + 1; - if (size > BTRFS_MAX_EXTENT_SIZE) { + if (size > fs_info->max_extent_size) { u32 num_extents; u64 new_size; @@ -2147,10 +2257,10 @@ void btrfs_split_delalloc_extent(struct inode *inode, * applies here, just in reverse. 
*/ new_size = orig->end - split + 1; - num_extents = count_max_extents(new_size); + num_extents = count_max_extents(fs_info, new_size); new_size = split - orig->start; - num_extents += count_max_extents(new_size); - if (count_max_extents(size) >= num_extents) + num_extents += count_max_extents(fs_info, new_size); + if (count_max_extents(fs_info, size) >= num_extents) return; } @@ -2167,6 +2277,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 new_size, old_size; u32 num_extents; @@ -2180,7 +2291,7 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, new_size = other->end - new->start + 1; /* we're not bigger than the max, unreserve the space and go */ - if (new_size <= BTRFS_MAX_EXTENT_SIZE) { + if (new_size <= fs_info->max_extent_size) { spin_lock(&BTRFS_I(inode)->lock); btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); @@ -2206,10 +2317,10 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, * this case. */ old_size = other->end - other->start + 1; - num_extents = count_max_extents(old_size); + num_extents = count_max_extents(fs_info, old_size); old_size = new->end - new->start + 1; - num_extents += count_max_extents(old_size); - if (count_max_extents(new_size) >= num_extents) + num_extents += count_max_extents(fs_info, old_size); + if (count_max_extents(fs_info, new_size) >= num_extents) return; spin_lock(&BTRFS_I(inode)->lock); @@ -2274,21 +2385,21 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * list of inodes that have pending delalloc work to be done. */ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - unsigned *bits) + u32 bits) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) + if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - u32 num_extents = count_max_extents(len); + u32 num_extents = count_max_extents(fs_info, len); bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); spin_lock(&BTRFS_I(inode)->lock); @@ -2303,7 +2414,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; - if (*bits & EXTENT_DEFRAG) + if (bits & EXTENT_DEFRAG) BTRFS_I(inode)->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags)) @@ -2312,7 +2423,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, } if (!(state->state & EXTENT_DELALLOC_NEW) && - (*bits & EXTENT_DELALLOC_NEW)) { + (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - state->start; @@ -2325,14 +2436,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, * accounting happens. 
*/ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, - struct extent_state *state, unsigned *bits) + struct extent_state *state, u32 bits) { struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); u64 len = state->end + 1 - state->start; - u32 num_extents = count_max_extents(len); + u32 num_extents = count_max_extents(fs_info, len); - if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) { + if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; spin_unlock(&inode->lock); @@ -2343,7 +2454,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; bool do_list = !btrfs_is_free_space_inode(inode); @@ -2356,7 +2467,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * don't need to call delalloc_release_metadata if there is an * error. */ - if (*bits & EXTENT_CLEAR_META_RESV && + if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len, false); @@ -2366,7 +2477,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, if (!btrfs_is_data_reloc_root(root) && do_list && !(state->state & EXTENT_NORESERVE) && - (*bits & EXTENT_CLEAR_DATA_RESV)) + (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, @@ -2381,11 +2492,11 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, } if ((state->state & EXTENT_DELALLOC_NEW) && - (*bits & EXTENT_DELALLOC_NEW)) { + (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&inode->lock); ASSERT(inode->new_delalloc_bytes >= len); inode->new_delalloc_bytes -= len; - if (*bits & EXTENT_ADD_INODE_BYTES) + if (bits & EXTENT_ADD_INODE_BYTES) inode_add_bytes(&inode->vfs_inode, len); spin_unlock(&inode->lock); } @@ -2580,95 +2691,78 @@ out: return errno_to_blk_status(ret); } -/* - * extent_io.c submission hook. This does the right thing for csum calculation - * on write, or reading the csums from the tree before a read. 
- * - * Rules about async/sync submit, - * a) read: sync submit - * - * b) write without checksum: sync submit - * - * c) write with checksum: - * c-1) if bio is issued by fsync: sync submit - * (sync_writers != 0) - * - * c-2) if root is reloc root: sync submit - * (only in case of buffered IO) - * - * c-3) otherwise: async submit - */ -void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type) +void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; - blk_status_t ret = 0; - int skip_sum; - int async = !atomic_read(&BTRFS_I(inode)->sync_writers); - - skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); - - if (btrfs_is_free_space_inode(BTRFS_I(inode))) - metadata = BTRFS_WQ_ENDIO_FREE_SPACE; + struct btrfs_inode *bi = BTRFS_I(inode); + blk_status_t ret; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct page *page = bio_first_bvec_all(bio)->bv_page; - loff_t file_offset = page_offset(page); - - ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); + ret = extract_ordered_extent(bi, bio, + page_offset(bio_first_bvec_all(bio)->bv_page)); if (ret) goto out; } - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); - if (ret) - goto out; - - if (compress_type != BTRFS_COMPRESS_NONE) { - /* - * btrfs_submit_compressed_read will handle completing - * the bio if there were any errors, so just return - * here. - */ - btrfs_submit_compressed_read(inode, bio, mirror_num); + /* + * If we need to checksum, and the I/O is not issued by fsync and + * friends, that is ->sync_writers != 0, defer the submission to a + * workqueue to parallelize it. + * + * Csum items for reloc roots have already been cloned at this point, + * so they are handled as part of the no-checksum case. + */ + if (!(bi->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(bi->root)) { + if (!atomic_read(&bi->sync_writers) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, + btrfs_submit_bio_start)) return; - } else { - /* - * Lookup bio sums does extra checks around whether we - * need to csum or not, which is why we ignore skip_sum - * here. 
- */ - ret = btrfs_lookup_bio_sums(inode, bio, NULL); - if (ret) - goto out; - } - goto mapit; - } else if (async && !skip_sum) { - /* csum items have already been cloned */ - if (btrfs_is_data_reloc_root(root)) - goto mapit; - /* we're doing a write, do the async checksumming */ - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, - 0, btrfs_submit_bio_start); - goto out; - } else if (!skip_sum) { - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); + + ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); if (ret) goto out; } + btrfs_submit_bio(fs_info, bio, mirror_num); + return; +out: + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + } +} -mapit: - ret = btrfs_map_bio(fs_info, bio, mirror_num); +void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + blk_status_t ret; -out: + if (compress_type != BTRFS_COMPRESS_NONE) { + /* + * btrfs_submit_compressed_read will handle completing the bio + * if there were any errors, so just return here. + */ + btrfs_submit_compressed_read(inode, bio, mirror_num); + return; + } + + /* Save the original iter for read repair */ + btrfs_bio(bio)->iter = bio->bi_iter; + + /* + * Lookup bio sums does extra checks around whether we need to csum or + * not, which is why we ignore skip_sum here. + */ + ret = btrfs_lookup_bio_sums(inode, bio, NULL); if (ret) { bio->bi_status = ret; bio_endio(bio); + return; } + + btrfs_submit_bio(fs_info, bio, mirror_num); } /* @@ -3075,8 +3169,10 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) - num_bytes = ram_bytes = oe->truncated_len; + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) { + num_bytes = oe->truncated_len; + ram_bytes = num_bytes; + } btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); @@ -3102,7 +3198,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, * an ordered extent if the range of bytes in the file it covers are * fully written. */ -static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) { struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); struct btrfs_root *root = inode->root; @@ -3311,65 +3407,71 @@ out: return ret; } -static void finish_ordered_fn(struct btrfs_work *work) -{ - struct btrfs_ordered_extent *ordered_extent; - ordered_extent = container_of(work, struct btrfs_ordered_extent, work); - btrfs_finish_ordered_io(ordered_extent); -} - void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate) { trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, - finish_ordered_fn, uptodate); + btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate); +} + +/* + * Verify the checksum for a single sector without any extra action that depend + * on the type of I/O. 
+ */ +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected) +{ + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + char *kaddr; + + ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); + + shash->tfm = fs_info->csum_shash; + + kaddr = kmap_local_page(page) + pgoff; + crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); + kunmap_local(kaddr); + + if (memcmp(csum, csum_expected, fs_info->csum_size)) + return -EIO; + return 0; } /* * check_data_csum - verify checksum of one sector of uncompressed data * @inode: inode - * @io_bio: btrfs_io_bio which contains the csum + * @bbio: btrfs_bio which contains the csum * @bio_offset: offset to the beginning of the bio (in bytes) * @page: page where is the data to be verified * @pgoff: offset inside the page - * @start: logical offset in the file * * The length of such check is always one sector size. + * + * When csum mismatch is detected, we will also report the error and fill the + * corrupted range with zero. (Thus it needs the extra parameters) */ -static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff, - u64 start) +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - char *kaddr; u32 len = fs_info->sectorsize; - const u32 csum_size = fs_info->csum_size; - unsigned int offset_sectors; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; ASSERT(pgoff + len <= PAGE_SIZE); - offset_sectors = bio_offset >> fs_info->sectorsize_bits; - csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size; + csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); - kaddr = kmap_atomic(page); - shash->tfm = fs_info->csum_shash; - - crypto_shash_digest(shash, kaddr + pgoff, len, csum); - kunmap_atomic(kaddr); - - if (memcmp(csum, csum_expected, csum_size)) + if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) goto zeroit; - return 0; + zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, - bbio->mirror_num); + btrfs_print_data_csum_error(BTRFS_I(inode), + bbio->file_offset + bio_offset, + csum, csum_expected, bbio->mirror_num); if (bbio->device) btrfs_dev_stat_inc_and_print(bbio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); @@ -3401,11 +3503,6 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 pg_off; unsigned int result = 0; - if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) { - btrfs_page_clear_checked(fs_info, page, start, end + 1 - start); - return 0; - } - /* * This only happens for NODATASUM or compressed read. * Normally this should be covered by above check for compressed read @@ -3438,8 +3535,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, EXTENT_NODATASUM); continue; } - ret = check_data_csum(inode, bbio, bio_offset, page, pg_off, - page_offset(page) + pg_off); + ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> root->fs_info->sectorsize_bits; @@ -3578,7 +3674,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) u64 last_objectid = 0; int ret = 0, nr_unlink = 0; - /* Bail out if the cleanup is already running. 
*/ if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0; @@ -3661,17 +3756,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * * btrfs_find_orphan_roots() ran before us, which has * found all deleted roots and loaded them into - * fs_info->fs_roots. So here we can find if an + * fs_info->fs_roots_radix. So here we can find if an * orphan item corresponds to a deleted root by looking - * up the root from that xarray. + * up the root from that radix tree. */ - spin_lock(&fs_info->fs_roots_lock); - dead_root = xa_load(&fs_info->fs_roots, - (unsigned long)found_key.objectid); + spin_lock(&fs_info->fs_roots_radix_lock); + dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)found_key.objectid); if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) is_dead_root = 1; - spin_unlock(&fs_info->fs_roots_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); if (is_dead_root) { /* prevent this orphan from being found again */ @@ -3911,7 +4006,7 @@ cache_index: * cache. * * This is required for both inode re-read from disk and delayed inode - * in the delayed_nodes xarray. + * in delayed_nodes_tree. */ if (BTRFS_I(inode)->last_trans == fs_info->generation) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, @@ -4229,7 +4324,7 @@ skip_backref: /* * If we are in a rename context, we don't need to update anything in the * log. That will be done later during the rename by btrfs_log_new_name(). - * Besides that, doing it here would only cause extra unncessary btree + * Besides that, doing it here would only cause extra unnecessary btree * operations on the log tree, increasing latency for applications. */ if (!rename_ctx) { @@ -4257,8 +4352,9 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = - dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime; + dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime; ret = btrfs_update_inode(trans, root, dir); out: return ret; @@ -4420,7 +4516,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); inode_inc_iversion(dir); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = current_time(dir); + dir->i_ctime = dir->i_mtime; ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); if (ret) btrfs_abort_transaction(trans, ret); @@ -4859,7 +4956,6 @@ again: else memzero_page(page, (block_start - page_offset(page)) + offset, len); - flush_dcache_page(page); } btrfs_page_clear_checked(fs_info, page, block_start, block_end + 1 - block_start); @@ -5062,9 +5158,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) */ if (newsize != oldsize) { inode_inc_iversion(inode); - if (!(mask & (ATTR_CTIME | ATTR_MTIME))) - inode->i_ctime = inode->i_mtime = - current_time(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; + } } if (newsize > oldsize) { @@ -5372,7 +5469,7 @@ void btrfs_evict_inode(struct inode *inode) if (!rsv) goto no_delete; rsv->size = btrfs_calc_metadata_size(fs_info, 1); - rsv->failfast = 1; + rsv->failfast = true; btrfs_i_size_write(BTRFS_I(inode), 0); @@ -5764,14 +5861,14 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret != -ENOENT) inode = 
ERR_PTR(ret); else - inode = new_simple_dir(dir->i_sb, &location, sub_root); + inode = new_simple_dir(dir->i_sb, &location, root); } else { inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); - } - if (root != sub_root) btrfs_put_root(sub_root); - if (!IS_ERR(inode) && root != sub_root) { + if (IS_ERR(inode)) + return inode; + down_read(&fs_info->cleanup_work_sem); if (!sb_rdonly(inode->i_sb)) ret = btrfs_orphan_cleanup(sub_root); @@ -6367,7 +6464,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(path); + /* + * We don't need the path anymore, plus inheriting properties, adding + * ACLs, security xattrs, orphan item or adding the link, will result in + * allocating yet another path. So just free our path. + */ + btrfs_free_path(path); + path = NULL; if (args->subvol) { struct inode *parent; @@ -6424,8 +6527,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, goto discard; } - ret = 0; - goto out; + return 0; discard: /* @@ -7507,7 +7609,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); - *map = em = em2; + *map = em2; + em = em2; } if (IS_ERR(em2)) { @@ -7589,8 +7692,26 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, const u64 data_alloc_len = length; bool unlock_extents = false; + /* + * We could potentially fault if we have a buffer > PAGE_SIZE, and if + * we're NOWAIT we may submit a bio for a partial range and return + * EIOCBQUEUED, which would result in an errant short read. + * + * The best way to handle this would be to allow for partial completions + * of iocb's, so we could submit the partial bio, return and fault in + * the rest of the pages, and then submit the io for the rest of the + * range. However we don't have that currently, so simply return + * -EAGAIN at this point so that the normal path is used. + */ + if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) + return -EAGAIN; + + /* + * Cap the size of reads to that usually seen in buffered I/O as we need + * to allocate a contiguous array for the checksums. + */ if (!write) - len = min_t(u64, len, fs_info->sectorsize); + len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); lockstart = start; lockend = start + len - 1; @@ -7681,7 +7802,19 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || em->block_start == EXTENT_MAP_INLINE) { free_extent_map(em); - ret = -ENOTBLK; + /* + * If we are in a NOWAIT context, return -EAGAIN in order to + * fallback to buffered IO. This is not only because we can + * block with buffered IO (no support for NOWAIT semantics at + * the moment) but also to avoid returning short reads to user + * space - this happens if we were able to read some data from + * previous non-compressed extents and then when we fallback to + * buffered IO, at btrfs_file_read_iter() by calling + * filemap_read(), we fail to fault in pages for the read buffer, + * in which case filemap_read() returns a short read (the number + * of bytes previously read is > 0, so it does not return -EFAULT). + */ + ret = (flags & IOMAP_NOWAIT) ? 
-EAGAIN : -ENOTBLK; goto unlock_err; } @@ -7813,8 +7946,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, pos += submitted; length -= submitted; if (write) - __endio_write_update_ordered(BTRFS_I(inode), pos, - length, false); + btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, + pos, length, false); else unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); @@ -7836,10 +7969,9 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) return; if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { - __endio_write_update_ordered(BTRFS_I(dip->inode), - dip->file_offset, - dip->bytes, - !dip->bio.bi_status); + btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL, + dip->file_offset, dip->bytes, + !dip->bio.bi_status); } else { unlock_extent(&BTRFS_I(dip->inode)->io_tree, dip->file_offset, @@ -7859,12 +7991,8 @@ static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, BUG_ON(bio_op(bio) == REQ_OP_WRITE); - if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA)) - return; - refcount_inc(&dip->refs); - if (btrfs_map_bio(fs_info, bio, mirror_num)) - refcount_dec(&dip->refs); + btrfs_submit_bio(fs_info, bio, mirror_num); } static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, @@ -7873,56 +8001,35 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - const u32 sectorsize = fs_info->sectorsize; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - struct bio_vec bvec; - struct bvec_iter iter; - u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; + struct bvec_iter iter; + struct bio_vec bv; + u32 offset; + + btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { + u64 start = bbio->file_offset + offset; + + if (uptodate && + (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page, + bv.bv_offset))) { + clean_io_failure(fs_info, failure_tree, io_tree, start, + bv.bv_page, btrfs_ino(BTRFS_I(inode)), + bv.bv_offset); + } else { + int ret; - __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) { - unsigned int i, nr_sectors, pgoff; - - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - pgoff = bvec.bv_offset; - for (i = 0; i < nr_sectors; i++) { - u64 start = bbio->file_offset + bio_offset; - - ASSERT(pgoff < PAGE_SIZE); - if (uptodate && - (!csum || !check_data_csum(inode, bbio, - bio_offset, bvec.bv_page, - pgoff, start))) { - clean_io_failure(fs_info, failure_tree, io_tree, - start, bvec.bv_page, - btrfs_ino(BTRFS_I(inode)), - pgoff); - } else { - int ret; - - ret = btrfs_repair_one_sector(inode, &bbio->bio, - bio_offset, bvec.bv_page, pgoff, - start, bbio->mirror_num, - submit_dio_repair_bio); - if (ret) - err = errno_to_blk_status(ret); - } - ASSERT(bio_offset + sectorsize > bio_offset); - bio_offset += sectorsize; - pgoff += sectorsize; + ret = btrfs_repair_one_sector(inode, bbio, offset, + bv.bv_page, bv.bv_offset, + submit_dio_repair_bio); + if (ret) + err = errno_to_blk_status(ret); } } - return err; -} -static void __endio_write_update_ordered(struct btrfs_inode *inode, - const u64 offset, const u64 bytes, - const bool uptodate) -{ - btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, - finish_ordered_fn, uptodate); + return err; } static blk_status_t btrfs_submit_bio_start_direct_io(struct inode 
*inode, @@ -7957,51 +8064,43 @@ static void btrfs_end_dio_bio(struct bio *bio) btrfs_dio_private_put(dip); } -static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, - struct inode *inode, u64 file_offset, int async_submit) +static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, + u64 file_offset, int async_submit) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; - bool write = btrfs_op(bio) == BTRFS_MAP_WRITE; blk_status_t ret; - /* Check btrfs_submit_bio_hook() for rules about async submit. */ - if (async_submit) - async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); - - if (!write) { - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) - goto err; - } + /* Save the original iter for read repair */ + if (btrfs_op(bio) == BTRFS_MAP_READ) + btrfs_bio(bio)->iter = bio->bi_iter; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) goto map; - if (write && async_submit) { - ret = btrfs_wq_submit_bio(inode, bio, 0, file_offset, - btrfs_submit_bio_start_direct_io); - goto err; - } else if (write) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + /* Check btrfs_submit_data_write_bio() for async submit rules */ + if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && + btrfs_wq_submit_bio(inode, bio, 0, file_offset, + btrfs_submit_bio_start_direct_io)) + return; + /* * If we aren't doing async submit, calculate the csum of the * bio now. */ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); - if (ret) - goto err; + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + return; + } } else { - u64 csum_offset; - - csum_offset = file_offset - dip->file_offset; - csum_offset >>= fs_info->sectorsize_bits; - csum_offset *= fs_info->csum_size; - btrfs_bio(bio)->csum = dip->csums + csum_offset; + btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, + file_offset - dip->file_offset); } map: - ret = btrfs_map_bio(fs_info, bio, 0); -err: - return ret; + btrfs_submit_bio(fs_info, bio, 0); } static void btrfs_submit_direct(const struct iomap_iter *iter, @@ -8114,14 +8213,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, async_submit = 1; } - status = btrfs_submit_dio_bio(bio, inode, file_offset, - async_submit); - if (status) { - bio_put(bio); - if (submit_len > 0) - refcount_dec(&dip->refs); - goto out_err_em; - } + btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); dio_data->submitted += clone_len; clone_offset += clone_len; @@ -8154,7 +8246,8 @@ ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_befo struct btrfs_dio_data data; return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, &data, done_before); + IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC, + &data, done_before); } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -8169,31 +8262,6 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } -static int btrfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - int ret; - - if (current->flags & PF_MEMALLOC) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - - /* - * If we are under memory pressure we will call this directly from the - * VM, we need to make sure we have the inode referenced for the ordered - * extent. If not just return like we didn't do anything. 
- */ - if (!igrab(inode)) { - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; - } - ret = extent_write_full_page(page, wbc); - btrfs_add_delayed_iput(inode); - return ret; -} - static int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -8257,30 +8325,24 @@ static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) } #ifdef CONFIG_MIGRATION -static int btrfs_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, +static int btrfs_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { - int ret; + int ret = filemap_migrate_folio(mapping, dst, src, mode); - ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) - attach_page_private(newpage, detach_page_private(page)); - - if (PageOrdered(page)) { - ClearPageOrdered(page); - SetPageOrdered(newpage); + if (folio_test_ordered(src)) { + folio_clear_ordered(src); + folio_set_ordered(dst); } - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } +#else +#define btrfs_migrate_folio NULL #endif static void btrfs_invalidate_folio(struct folio *folio, size_t offset, @@ -8497,7 +8559,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * Reserving delalloc space after obtaining the page lock can lead to * deadlock. For example, if a dirty page is locked by this function * and the call to btrfs_delalloc_reserve_space() ends up triggering - * dirty page write out, then the btrfs_writepage() function could + * dirty page write out, then the btrfs_writepages() function could * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. 
*/ @@ -8588,10 +8650,9 @@ again: else zero_start = PAGE_SIZE; - if (zero_start != PAGE_SIZE) { + if (zero_start != PAGE_SIZE) memzero_page(page, zero_start, PAGE_SIZE - zero_start); - flush_dcache_page(page); - } + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); @@ -8674,7 +8735,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) if (!rsv) return -ENOMEM; rsv->size = min_size; - rsv->failfast = 1; + rsv->failfast = true; /* * 1 for the truncate slack space @@ -9195,8 +9256,10 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); inode_inc_iversion(new_inode); - old_dir->i_ctime = old_dir->i_mtime = ctime; - new_dir->i_ctime = new_dir->i_mtime = ctime; + old_dir->i_mtime = ctime; + old_dir->i_ctime = ctime; + new_dir->i_mtime = ctime; + new_dir->i_ctime = ctime; old_inode->i_ctime = ctime; new_inode->i_ctime = ctime; @@ -9459,9 +9522,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns, inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); - old_dir->i_ctime = old_dir->i_mtime = - new_dir->i_ctime = new_dir->i_mtime = - old_inode->i_ctime = current_time(old_dir); + old_dir->i_mtime = current_time(old_dir); + old_dir->i_ctime = old_dir->i_mtime; + new_dir->i_mtime = old_dir->i_mtime; + new_dir->i_ctime = old_dir->i_mtime; + old_inode->i_ctime = old_dir->i_mtime; if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -9549,15 +9614,21 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + int ret; + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) - return btrfs_rename_exchange(old_dir, old_dentry, new_dir, - new_dentry); + ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir, + new_dentry); + else + ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, + new_dentry, flags); + + btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); - return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, - new_dentry, flags); + return ret; } struct btrfs_delalloc_work { @@ -10177,9 +10248,8 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) } } -static int btrfs_encoded_io_compression_from_extent( - struct btrfs_fs_info *fs_info, - int compress_type) +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type) { switch (compress_type) { case BTRFS_COMPRESS_NONE: @@ -10302,7 +10372,6 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { struct btrfs_encoded_read_private *priv = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; @@ -10312,19 +10381,9 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, return ret; } - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) { - btrfs_bio_free_csum(bbio); - return ret; - } - atomic_inc(&priv->pending); - ret = btrfs_map_bio(fs_info, bio, mirror_num); - if (ret) { - atomic_dec(&priv->pending); - btrfs_bio_free_csum(bbio); - } - return ret; + btrfs_submit_bio(fs_info, bio, 
mirror_num); + return BLK_STS_OK; } static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) @@ -10336,7 +10395,6 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) u32 sectorsize = fs_info->sectorsize; struct bio_vec *bvec; struct bvec_iter_all iter_all; - u64 start = priv->file_offset; u32 bio_offset = 0; if (priv->skip_csum || !uptodate) @@ -10349,10 +10407,9 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) pgoff = bvec->bv_offset; for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); - if (check_data_csum(&inode->vfs_inode, bbio, bio_offset, - bvec->bv_page, pgoff, start)) + if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset, + bvec->bv_page, pgoff)) return BLK_STS_IOERR; - start += sectorsize; bio_offset += sectorsize; pgoff += sectorsize; } @@ -10384,11 +10441,9 @@ static void btrfs_encoded_read_endio(struct bio *bio) bio_put(bio); } -static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, - u64 disk_bytenr, - u64 disk_io_size, - struct page **pages) +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, struct page **pages) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { @@ -10619,7 +10674,8 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ret = -ENOBUFS; goto out_em; } - disk_io_size = count = em->block_len; + disk_io_size = em->block_len; + count = em->block_len; encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - em->orig_start; ret = btrfs_encoded_io_compression_from_extent(fs_info, @@ -10782,15 +10838,15 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = -ENOMEM; goto out_pages; } - kaddr = kmap(pages[i]); + kaddr = kmap_local_page(pages[i]); if (copy_from_iter(kaddr, bytes, from) != bytes) { - kunmap(pages[i]); + kunmap_local(kaddr); ret = -EFAULT; goto out_pages; } if (bytes < PAGE_SIZE) memset(kaddr + bytes, 0, PAGE_SIZE - bytes); - kunmap(pages[i]); + kunmap_local(kaddr); } for (;;) { @@ -11419,15 +11475,12 @@ static const struct file_operations btrfs_dir_file_operations = { */ static const struct address_space_operations btrfs_aops = { .read_folio = btrfs_read_folio, - .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readahead = btrfs_readahead, .direct_IO = noop_direct_IO, .invalidate_folio = btrfs_invalidate_folio, .release_folio = btrfs_release_folio, -#ifdef CONFIG_MIGRATION - .migratepage = btrfs_migratepage, -#endif + .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, .error_remove_page = generic_error_remove_page, .swap_activate = btrfs_swap_activate, |
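
Two recurring changes in the hunks above are the switch from open-coded checksum-pointer arithmetic to a btrfs_csum_ptr() helper and the new fs_info argument to count_max_extents(), which bounds delalloc accounting by the per-filesystem max_extent_size instead of the fixed BTRFS_MAX_EXTENT_SIZE. Below is a minimal userspace sketch of the arithmetic both helpers are assumed to perform, based only on the call sites visible in this diff; the struct, field values and helper bodies are illustrative stand-ins, not the kernel definitions.

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the btrfs_fs_info fields the diff relies on. */
struct fs_info {
	uint32_t sectorsize_bits;	/* e.g. 12 for 4 KiB sectors */
	uint16_t csum_size;		/* e.g. 4 bytes for crc32c */
	uint64_t max_extent_size;	/* 128 MiB, or smaller on zoned devices */
};

/*
 * Assumed shape of btrfs_csum_ptr(): step into the per-bio csum array by the
 * number of sectors that @offset is into the I/O, times the csum size. This
 * mirrors the open-coded "csum_offset >>= sectorsize_bits; csum_offset *=
 * csum_size" sequence the diff removes from btrfs_submit_dio_bio().
 */
static uint8_t *csum_ptr(const struct fs_info *fi, uint8_t *csums, uint64_t offset)
{
	return csums + (offset >> fi->sectorsize_bits) * fi->csum_size;
}

/*
 * Assumed shape of count_max_extents() after the patch: how many extent
 * reservations a delalloc range of @size bytes needs when each extent is
 * capped at fs_info->max_extent_size.
 */
static uint32_t count_max_extents(const struct fs_info *fi, uint64_t size)
{
	return (uint32_t)((size + fi->max_extent_size - 1) / fi->max_extent_size);
}

int main(void)
{
	struct fs_info fi = {
		.sectorsize_bits = 12,
		.csum_size = 4,
		.max_extent_size = 128ULL << 20,
	};
	uint8_t csums[64] = { 0 };

	/* Sector 5 of the bio (byte offset 20480) -> csum bytes 20..23. */
	printf("csum offset: %td\n", csum_ptr(&fi, csums, 20480) - csums);

	/* A 300 MiB delalloc range needs 3 reservations with a 128 MiB cap. */
	printf("extents for 300 MiB: %u\n", count_max_extents(&fi, 300ULL << 20));
	return 0;
}

The reason the second helper now takes fs_info is that max_extent_size is expected to be smaller than the 128 MiB constant on zoned filesystems, so the delalloc split/merge hooks touched in this diff consult the per-filesystem limit rather than BTRFS_MAX_EXTENT_SIZE.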