diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/compression.c | 17 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 26 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 2 | ||||
-rw-r--r-- | fs/btrfs/file-item.c | 108 | ||||
-rw-r--r-- | fs/btrfs/file.c | 4 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 19 | ||||
-rw-r--r-- | fs/btrfs/reflink.c | 38 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 37 | ||||
-rw-r--r-- | fs/btrfs/zoned.c | 23 |
9 files changed, 201 insertions, 73 deletions
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d17ac301032e..1346d698463a 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -457,7 +457,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, bytes_left = compressed_len; for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { int submit = 0; - int len; + int len = 0; page = compressed_pages[pg_index]; page->mapping = inode->vfs_inode.i_mapping; @@ -465,10 +465,17 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, 0); - if (pg_index == 0 && use_append) - len = bio_add_zone_append_page(bio, page, PAGE_SIZE, 0); - else - len = bio_add_page(bio, page, PAGE_SIZE, 0); + /* + * Page can only be added to bio if the current bio fits in + * stripe. + */ + if (!submit) { + if (pg_index == 0 && use_append) + len = bio_add_zone_append_page(bio, page, + PAGE_SIZE, 0); + else + len = bio_add_page(bio, page, PAGE_SIZE, 0); + } page->mapping = NULL; if (submit || len < PAGE_SIZE) { diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c9a3036c23bf..8d386a5587ee 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2648,6 +2648,24 @@ static int validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, + BTRFS_FSID_SIZE)) { + btrfs_err(fs_info, + "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", + fs_info->super_copy->fsid, fs_info->fs_devices->fsid); + ret = -EINVAL; + } + + if (btrfs_fs_incompat(fs_info, METADATA_UUID) && + memcmp(fs_info->fs_devices->metadata_uuid, + fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) { + btrfs_err(fs_info, +"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU", + fs_info->super_copy->metadata_uuid, + fs_info->fs_devices->metadata_uuid); + ret = -EINVAL; + } + if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, @@ -3279,14 +3297,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device disk_super = fs_info->super_copy; - ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, - BTRFS_FSID_SIZE)); - - if (btrfs_fs_incompat(fs_info, METADATA_UUID)) { - ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid, - fs_info->super_copy->metadata_uuid, - BTRFS_FSID_SIZE)); - } features = btrfs_super_flags(disk_super); if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f1d15b68994a..3d5c35e4cb76 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1868,7 +1868,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); btrfs_put_delayed_ref_head(head); - return 0; + return ret; } static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 294602f139ef..441cee7fbb62 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -788,7 +788,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 end_byte = bytenr + len; u64 csum_end; struct extent_buffer *leaf; - int ret; + int ret = 0; const u32 csum_size = fs_info->csum_size; u32 blocksize_bits = fs_info->sectorsize_bits; @@ -806,6 +806,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { + ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; @@ -862,7 +863,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, root, path, path->slots[0], del_nr); if (ret) - goto out; + break; if (key.offset == bytenr) break; } else if (key.offset < bytenr && csum_end > end_byte) { @@ -906,8 +907,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, ret = btrfs_split_item(trans, root, path, &key, offset); if (ret && ret != -EAGAIN) { btrfs_abort_transaction(trans, ret); - goto out; + break; } + ret = 0; key.offset = end_byte - 1; } else { @@ -917,12 +919,41 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, } btrfs_release_path(path); } - ret = 0; -out: btrfs_free_path(path); return ret; } +static int find_next_csum_offset(struct btrfs_root *root, + struct btrfs_path *path, + u64 *next_offset) +{ + const u32 nritems = btrfs_header_nritems(path->nodes[0]); + struct btrfs_key found_key; + int slot = path->slots[0] + 1; + int ret; + + if (nritems == 0 || slot >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + return ret; + } else if (ret > 0) { + *next_offset = (u64)-1; + return 0; + } + slot = path->slots[0]; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); + + if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + found_key.type != BTRFS_EXTENT_CSUM_KEY) + *next_offset = (u64)-1; + else + *next_offset = found_key.offset; + + return 0; +} + int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums) @@ -938,7 +969,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, u64 total_bytes = 0; u64 csum_offset; u64 bytenr; - u32 nritems; u32 ins_size; int index = 0; int found_next; @@ -981,26 +1011,10 @@ again: goto insert; } } else { - int slot = path->slots[0] + 1; - /* we didn't find a csum item, insert one */ - nritems = btrfs_header_nritems(path->nodes[0]); - if (!nritems || (path->slots[0] >= nritems - 1)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - found_next = 1; - goto insert; - } - slot = path->slots[0]; - } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); - if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || - found_key.type != BTRFS_EXTENT_CSUM_KEY) { - found_next = 1; - goto insert; - } - next_offset = found_key.offset; + /* We didn't find a csum item, insert one. */ + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; found_next = 1; goto insert; } @@ -1056,8 +1070,48 @@ extend_csum: tmp = sums->len - total_bytes; tmp >>= fs_info->sectorsize_bits; WARN_ON(tmp < 1); + extend_nr = max_t(int, 1, tmp); + + /* + * A log tree can already have checksum items with a subset of + * the checksums we are trying to log. This can happen after + * doing a sequence of partial writes into prealloc extents and + * fsyncs in between, with a full fsync logging a larger subrange + * of an extent for which a previous fast fsync logged a smaller + * subrange. And this happens in particular due to merging file + * extent items when we complete an ordered extent for a range + * covered by a prealloc extent - this is done at + * btrfs_mark_extent_written(). + * + * So if we try to extend the previous checksum item, which has + * a range that ends at the start of the range we want to insert, + * make sure we don't extend beyond the start offset of the next + * checksum item. If we are at the last item in the leaf, then + * forget the optimization of extending and add a new checksum + * item - it is not worth the complexity of releasing the path, + * getting the first key for the next leaf, repeat the btree + * search, etc, because log trees are temporary anyway and it + * would only save a few bytes of leaf space. + */ + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if (path->slots[0] + 1 >= + btrfs_header_nritems(path->nodes[0])) { + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; + found_next = 1; + goto insert; + } + + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; + + tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits; + if (tmp <= INT_MAX) + extend_nr = min_t(int, extend_nr, tmp); + } - extend_nr = max_t(int, 1, (int)tmp); diff = (csum_offset + extend_nr) * csum_size; diff = min(diff, MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3b10d98b4ebb..55f68422061d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1094,7 +1094,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, int del_nr = 0; int del_slot = 0; int recow; - int ret; + int ret = 0; u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); @@ -1315,7 +1315,7 @@ again: } out: btrfs_free_path(path); - return 0; + return ret; } /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 33f14573f2ec..46f392943f4d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3000,6 +3000,18 @@ out: if (ret || truncated) { u64 unwritten_start = start; + /* + * If we failed to finish this ordered extent for any reason we + * need to make sure BTRFS_ORDERED_IOERR is set on the ordered + * extent, and mark the inode with the error if it wasn't + * already set. Any error during writeback would have already + * set the mapping error, so we need to set it if we're the ones + * marking this ordered extent as failed. + */ + if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, + &ordered_extent->flags)) + mapping_set_error(ordered_extent->inode->i_mapping, -EIO); + if (truncated) unwritten_start += logical_len; clear_extent_uptodate(io_tree, unwritten_start, end, NULL); @@ -9076,6 +9088,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; + bool need_abort = false; /* we only allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) @@ -9135,6 +9148,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, old_idx); if (ret) goto out_fail; + need_abort = true; } /* And now for the dest. */ @@ -9150,8 +9164,11 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); - if (ret) + if (ret) { + if (need_abort) + btrfs_abort_transaction(trans, ret); goto out_fail; + } } /* Update inode version and ctime/mtime. */ diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index d434dc78dadf..9178da07cc9c 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -203,10 +203,7 @@ static int clone_copy_inline_extent(struct inode *dst, * inline extent's data to the page. */ ASSERT(key.offset > 0); - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, - inline_data, size, datal, - comp_type); - goto out; + goto copy_to_page; } } else if (i_size_read(dst) <= datal) { struct btrfs_file_extent_item *ei; @@ -222,13 +219,10 @@ static int clone_copy_inline_extent(struct inode *dst, BTRFS_FILE_EXTENT_INLINE) goto copy_inline_extent; - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, - inline_data, size, datal, comp_type); - goto out; + goto copy_to_page; } copy_inline_extent: - ret = 0; /* * We have no extent items, or we have an extent at offset 0 which may * or may not be inlined. All these cases are dealt the same way. @@ -240,11 +234,13 @@ copy_inline_extent: * clone. Deal with all these cases by copying the inline extent * data into the respective page at the destination inode. */ - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, - inline_data, size, datal, comp_type); - goto out; + goto copy_to_page; } + /* + * Release path before starting a new transaction so we don't hold locks + * that would confuse lockdep. + */ btrfs_release_path(path); /* * If we end up here it means were copy the inline extent into a leaf @@ -282,11 +278,6 @@ copy_inline_extent: out: if (!ret && !trans) { /* - * Release path before starting a new transaction so we don't - * hold locks that would confuse lockdep. - */ - btrfs_release_path(path); - /* * No transaction here means we copied the inline extent into a * page of the destination inode. * @@ -306,6 +297,21 @@ out: *trans_out = trans; return ret; + +copy_to_page: + /* + * Release our path because we don't need it anymore and also because + * copy_inline_to_page() needs to reserve data and metadata, which may + * need to flush delalloc when we are low on available space and + * therefore cause a deadlock if writeback of an inline extent needs to + * write to the same leaf or an ordered extent completion needs to write + * to the same leaf. + */ + btrfs_release_path(path); + + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); + goto out; } /** diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 326be57f2828..dbcf8bb2f3b9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1574,7 +1574,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (ret) + goto out; } ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; @@ -1749,7 +1751,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (nlink != inode->i_nlink) { set_nlink(inode, nlink); - btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (ret) + goto out; } BTRFS_I(inode)->index_cnt = (u64)-1; @@ -1787,6 +1791,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; if (ret == 1) { + ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; @@ -1799,17 +1804,19 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret) - goto out; + break; btrfs_release_path(path); inode = read_one_inode(root, key.offset); - if (!inode) - return -EIO; + if (!inode) { + ret = -EIO; + break; + } ret = fixup_inode_link_count(trans, root, inode); iput(inode); if (ret) - goto out; + break; /* * fixup on a directory may create new entries, @@ -1818,8 +1825,6 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, */ key.offset = (u64)-1; } - ret = 0; -out: btrfs_release_path(path); return ret; } @@ -3297,6 +3302,22 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * begins and releases it only after writing its superblock. */ mutex_lock(&fs_info->tree_log_mutex); + + /* + * The previous transaction writeout phase could have failed, and thus + * marked the fs in an error state. We must not commit here, as we + * could have updated our generation in the super_for_commit and + * writing the super here would result in transid mismatches. If there + * is an error here just bail. + */ + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + ret = -EIO; + btrfs_set_log_full_commit(trans); + btrfs_abort_transaction(trans, ret); + mutex_unlock(&fs_info->tree_log_mutex); + goto out_wake_log_root; + } + btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); ret = write_all_supers(fs_info, 1); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1bb8ee97aae0..f1f3b10d1dbb 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -150,6 +150,18 @@ static inline u32 sb_zone_number(int shift, int mirror) return (u32)zone; } +static inline sector_t zone_start_sector(u32 zone_number, + struct block_device *bdev) +{ + return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev)); +} + +static inline u64 zone_start_physical(u32 zone_number, + struct btrfs_zoned_device_info *zone_info) +{ + return (u64)zone_number << zone_info->zone_size_shift; +} + /* * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block * device into static sized chunks and fake a conventional zone on each of @@ -405,8 +417,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (sb_zone + 1 >= zone_info->nr_zones) continue; - sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT); - ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, + ret = btrfs_get_dev_zones(device, + zone_start_physical(sb_zone, zone_info), &zone_info->sb_zones[sb_pos], &nr_zones); if (ret) @@ -721,7 +733,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, if (sb_zone + 1 >= nr_zones) return -ENOENT; - ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift, + ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev), BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb, zones); if (ret < 0) @@ -826,7 +838,7 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) return -ENOENT; return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - sb_zone << zone_sectors_shift, + zone_start_sector(sb_zone, bdev), zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); } @@ -878,7 +890,8 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, if (!(end <= sb_zone || sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) { have_sb = true; - pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift; + pos = zone_start_physical( + sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo); break; } |