summaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/block-group.c40
-rw-r--r--fs/btrfs/block-group.h4
-rw-r--r--fs/btrfs/compression.c8
-rw-r--r--fs/btrfs/ctree.h1
-rw-r--r--fs/btrfs/dev-replace.c7
-rw-r--r--fs/btrfs/disk-io.c6
-rw-r--r--fs/btrfs/extent-tree.c2
-rw-r--r--fs/btrfs/extent_io.c44
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/file.c13
-rw-r--r--fs/btrfs/inode.c50
-rw-r--r--fs/btrfs/ioctl.c22
-rw-r--r--fs/btrfs/scrub.c26
-rw-r--r--fs/btrfs/tree-log.c1
-rw-r--r--fs/btrfs/volumes.c67
-rw-r--r--fs/btrfs/volumes.h3
-rw-r--r--fs/btrfs/zoned.c13
-rw-r--r--fs/btrfs/zoned.h4
18 files changed, 206 insertions, 107 deletions
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index c22d287e020b..0dd6de994199 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2503,12 +2503,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
return ERR_PTR(ret);
}
- /*
- * New block group is likely to be used soon. Try to activate it now.
- * Failure is OK for now.
- */
- btrfs_zone_activate(cache);
-
ret = exclude_super_stripes(cache);
if (ret) {
/* We may have excluded something, so call this just in case */
@@ -2946,7 +2940,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_path *path = NULL;
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
int loops = 0;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -3012,7 +3005,6 @@ again:
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
/*
@@ -3113,7 +3105,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
int should_put;
struct btrfs_path *path;
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
path = btrfs_alloc_path();
if (!path)
@@ -3171,7 +3162,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
list_add_tail(&cache->io_list, io);
} else {
@@ -3455,7 +3445,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
}
-static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
+static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
{
struct btrfs_block_group *bg;
int ret;
@@ -3542,7 +3532,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
out:
btrfs_trans_release_chunk_metadata(trans);
- return ret;
+ if (ret)
+ return ERR_PTR(ret);
+
+ btrfs_get_block_group(bg);
+ return bg;
}
/*
@@ -3657,10 +3651,17 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *space_info;
+ struct btrfs_block_group *ret_bg;
bool wait_for_alloc = false;
bool should_alloc = false;
+ bool from_extent_allocation = false;
int ret = 0;
+ if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
+ from_extent_allocation = true;
+ force = CHUNK_ALLOC_FORCE;
+ }
+
/* Don't re-enter if we're already allocating a chunk */
if (trans->allocating_chunk)
return -ENOSPC;
@@ -3750,9 +3751,22 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
force_metadata_allocation(fs_info);
}
- ret = do_chunk_alloc(trans, flags);
+ ret_bg = do_chunk_alloc(trans, flags);
trans->allocating_chunk = false;
+ if (IS_ERR(ret_bg)) {
+ ret = PTR_ERR(ret_bg);
+ } else if (from_extent_allocation) {
+ /*
+ * New block group is likely to be used soon. Try to activate
+ * it now. Failure is OK for now.
+ */
+ btrfs_zone_activate(ret_bg);
+ }
+
+ if (!ret)
+ btrfs_put_block_group(ret_bg);
+
spin_lock(&space_info->lock);
if (ret < 0) {
if (ret == -ENOSPC)
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 93aabc68bb6a..e8308f2ad07d 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -35,11 +35,15 @@ enum btrfs_discard_state {
* the FS with empty chunks
*
* CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
+ * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from
+ * find_free_extent() that also activaes the zone
*/
enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_NO_FORCE,
CHUNK_ALLOC_LIMITED,
CHUNK_ALLOC_FORCE,
+ CHUNK_ALLOC_FORCE_FOR_EXTENT,
};
struct btrfs_caching_control {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index be476f094300..19bf36d8ffea 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -537,6 +537,9 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
cb->orig_bio = NULL;
cb->nr_pages = nr_pages;
+ if (blkcg_css)
+ kthread_associate_blkcg(blkcg_css);
+
while (cur_disk_bytenr < disk_start + compressed_len) {
u64 offset = cur_disk_bytenr - disk_start;
unsigned int index = offset >> PAGE_SHIFT;
@@ -555,6 +558,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
bio = NULL;
goto finish_cb;
}
+ if (blkcg_css)
+ bio->bi_opf |= REQ_CGROUP_PUNT;
}
/*
* We should never reach next_stripe_start start as we will
@@ -612,6 +617,9 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
return 0;
finish_cb:
+ if (blkcg_css)
+ kthread_associate_blkcg(NULL);
+
if (bio) {
bio->bi_status = ret;
bio_endio(bio);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b7631b88426e..077c95e9baa5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1060,6 +1060,7 @@ struct btrfs_fs_info {
*/
spinlock_t relocation_bg_lock;
u64 data_reloc_bg;
+ struct mutex zoned_data_reloc_io_lock;
u64 nr_global_roots;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 71fd99b48283..f26202621989 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -734,7 +734,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- /* Commit dev_replace state and reserve 1 item for it. */
+ /*
+ * Commit dev_replace state and reserve 1 item for it.
+ * This is crucial to ensure we won't miss copying extents for new block
+ * groups that are allocated after we started the device replace, and
+ * must be done after setting up the device replace state.
+ */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b30309f187cf..ed8e288cc369 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1850,9 +1850,10 @@ again:
ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
- btrfs_put_root(root);
- if (ret == -EEXIST)
+ if (ret == -EEXIST) {
+ btrfs_put_root(root);
goto again;
+ }
goto fail;
}
return root;
@@ -3156,6 +3157,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->zoned_meta_io_lock);
+ mutex_init(&fs_info->zoned_data_reloc_io_lock);
seqlock_init(&fs_info->profiles_lock);
INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f477035a2ac2..6aa92f84f465 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4082,7 +4082,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
}
ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
- CHUNK_ALLOC_FORCE);
+ CHUNK_ALLOC_FORCE_FOR_EXTENT);
/* Do not bail out on ENOSPC since we can do more. */
if (ret == -ENOSPC)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 724e8fe06aa0..33c19f51d79b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2658,6 +2658,7 @@ int btrfs_repair_one_sector(struct inode *inode,
repair_bio = btrfs_bio_alloc(1);
repair_bbio = btrfs_bio(repair_bio);
+ repair_bbio->file_offset = start;
repair_bio->bi_opf = REQ_OP_READ;
repair_bio->bi_end_io = failed_bio->bi_end_io;
repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
@@ -3333,24 +3334,37 @@ static int alloc_new_bio(struct btrfs_inode *inode,
ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
if (ret < 0)
goto error;
- if (wbc) {
- struct block_device *bdev;
- bdev = fs_info->fs_devices->latest_dev->bdev;
- bio_set_dev(bio, bdev);
- wbc_init_bio(wbc, bio);
- }
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- struct btrfs_device *device;
+ if (wbc) {
+ /*
+ * For Zone append we need the correct block_device that we are
+ * going to write to set in the bio to be able to respect the
+ * hardware limitation. Look it up here:
+ */
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct btrfs_device *dev;
+
+ dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
+ fs_info->sectorsize);
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ goto error;
+ }
- device = btrfs_zoned_get_device(fs_info, disk_bytenr,
- fs_info->sectorsize);
- if (IS_ERR(device)) {
- ret = PTR_ERR(device);
- goto error;
+ bio_set_dev(bio, dev->bdev);
+ } else {
+ /*
+ * Otherwise pick the last added device to support
+ * cgroup writeback. For multi-device file systems this
+ * means blk-cgroup policies have to always be set on the
+ * last added/replaced device. This is a bit odd but has
+ * been like that for a long time.
+ */
+ bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
}
-
- btrfs_bio(bio)->device = device;
+ wbc_init_bio(wbc, bio);
+ } else {
+ ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
}
return 0;
error:
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 0399cf8e3c32..151e9da5da2d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -118,7 +118,7 @@ struct btrfs_bio_ctrl {
*/
struct extent_changeset {
/* How many bytes are set/cleared in this operation */
- unsigned int bytes_changed;
+ u64 bytes_changed;
/* Changed ranges */
struct ulist range_changed;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9f455c96c974..380054c94e4b 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2957,8 +2957,9 @@ out:
return ret;
}
-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
@@ -2990,6 +2991,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
+ ret = file_modified(file);
+ if (ret)
+ goto out_only_mutex;
+
lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
lockend = round_down(offset + len,
btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
@@ -3430,7 +3435,7 @@ static long btrfs_fallocate(struct file *file, int mode,
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
- return btrfs_punch_hole(inode, offset, len);
+ return btrfs_punch_hole(file, offset, len);
/*
* Only trigger disk allocation, don't trigger qgroup reserve
@@ -3452,6 +3457,10 @@ static long btrfs_fallocate(struct file *file, int mode,
goto out;
}
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
/*
* TODO: Move these two operations after we have checked
* accurate reserved space, or fallocate can still fail but
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6bfc4343c98d..1c8a43ecfb9f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1128,7 +1128,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
int ret = 0;
if (btrfs_is_free_space_inode(inode)) {
- WARN_ON_ONCE(1);
ret = -EINVAL;
goto out_unlock;
}
@@ -2017,8 +2016,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
* to use run_delalloc_nocow() here, like for regular
* preallocated inodes.
*/
- ASSERT(!zoned ||
- (zoned && btrfs_is_data_reloc_root(inode->root)));
+ ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root));
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, nr_written);
} else if (!inode_can_compress(inode) ||
@@ -4488,6 +4486,13 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
dest->root_key.objectid);
return -EPERM;
}
+ if (atomic_read(&dest->nr_swapfiles)) {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(fs_info,
+ "attempt to delete subvolume %llu with active swapfile",
+ root->root_key.objectid);
+ return -EPERM;
+ }
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
@@ -7438,6 +7443,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
u64 block_start, orig_start, orig_block_len, ram_bytes;
bool can_nocow = false;
bool space_reserved = false;
+ u64 prev_len;
int ret = 0;
/*
@@ -7465,6 +7471,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
can_nocow = true;
}
+ prev_len = len;
if (can_nocow) {
struct extent_map *em2;
@@ -7494,8 +7501,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
goto out;
}
} else {
- const u64 prev_len = len;
-
/* Our caller expects us to free the input extent map. */
free_extent_map(em);
*map = NULL;
@@ -7526,7 +7531,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
* We have created our ordered extent, so we can now release our reservation
* for an outstanding extent.
*/
- btrfs_delalloc_release_extents(BTRFS_I(inode), len);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
/*
* Need to update the i_size under the extent lock so buffered
@@ -7805,8 +7810,6 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
struct bio_vec bvec;
struct bvec_iter iter;
- const u64 orig_file_offset = dip->file_offset;
- u64 start = orig_file_offset;
u32 bio_offset = 0;
blk_status_t err = BLK_STS_OK;
@@ -7816,6 +7819,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
pgoff = bvec.bv_offset;
for (i = 0; i < nr_sectors; i++) {
+ u64 start = bbio->file_offset + bio_offset;
+
ASSERT(pgoff < PAGE_SIZE);
if (uptodate &&
(!csum || !check_data_csum(inode, bbio,
@@ -7828,17 +7833,13 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
} else {
int ret;
- ASSERT((start - orig_file_offset) < UINT_MAX);
- ret = btrfs_repair_one_sector(inode,
- &bbio->bio,
- start - orig_file_offset,
- bvec.bv_page, pgoff,
+ ret = btrfs_repair_one_sector(inode, &bbio->bio,
+ bio_offset, bvec.bv_page, pgoff,
start, bbio->mirror_num,
submit_dio_repair_bio);
if (ret)
err = errno_to_blk_status(ret);
}
- start += sectorsize;
ASSERT(bio_offset + sectorsize > bio_offset);
bio_offset += sectorsize;
pgoff += sectorsize;
@@ -7865,6 +7866,7 @@ static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
static void btrfs_end_dio_bio(struct bio *bio)
{
struct btrfs_dio_private *dip = bio->bi_private;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
blk_status_t err = bio->bi_status;
if (err)
@@ -7875,12 +7877,12 @@ static void btrfs_end_dio_bio(struct bio *bio)
bio->bi_iter.bi_size, err);
if (bio_op(bio) == REQ_OP_READ)
- err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err);
+ err = btrfs_check_read_dio_bio(dip, bbio, !err);
if (err)
dip->dio_bio->bi_status = err;
- btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio);
+ btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio);
bio_put(bio);
btrfs_dio_private_put(dip);
@@ -8041,6 +8043,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
+ btrfs_bio(bio)->file_offset = file_offset;
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
status = extract_ordered_extent(BTRFS_I(inode), bio,
@@ -11107,8 +11110,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
* set. We use this counter to prevent snapshots. We must increment it
* before walking the extents because we don't want a concurrent
* snapshot to run after we've already checked the extents.
+ *
+ * It is possible that subvolume is marked for deletion but still not
+ * removed yet. To prevent this race, we check the root status before
+ * activating the swapfile.
*/
+ spin_lock(&root->root_item_lock);
+ if (btrfs_root_dead(root)) {
+ spin_unlock(&root->root_item_lock);
+
+ btrfs_exclop_finish(fs_info);
+ btrfs_warn(fs_info,
+ "cannot activate swapfile because subvolume %llu is being deleted",
+ root->root_key.objectid);
+ return -EPERM;
+ }
atomic_inc(&root->nr_swapfiles);
+ spin_unlock(&root->root_item_lock);
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 238cee5b5254..be6c24577dbe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1239,7 +1239,7 @@ static u32 get_extent_max_capacity(const struct extent_map *em)
}
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
- bool locked)
+ u32 extent_thresh, u64 newer_than, bool locked)
{
struct extent_map *next;
bool ret = false;
@@ -1249,11 +1249,12 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
return false;
/*
- * We want to check if the next extent can be merged with the current
- * one, which can be an extent created in a past generation, so we pass
- * a minimum generation of 0 to defrag_lookup_extent().
+ * Here we need to pass @newer_then when checking the next extent, or
+ * we will hit a case we mark current extent for defrag, but the next
+ * one will not be a target.
+ * This will just cause extra IO without really reducing the fragments.
*/
- next = defrag_lookup_extent(inode, em->start + em->len, 0, locked);
+ next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
/* No more em or hole */
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
goto out;
@@ -1265,6 +1266,13 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
*/
if (next->len >= get_extent_max_capacity(em))
goto out;
+ /* Skip older extent */
+ if (next->generation < newer_than)
+ goto out;
+ /* Also check extent size */
+ if (next->len >= extent_thresh)
+ goto out;
+
ret = true;
out:
free_extent_map(next);
@@ -1470,7 +1478,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
goto next;
next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
- locked);
+ extent_thresh, newer_than, locked);
if (!next_mergeable) {
struct defrag_target_range *last;
@@ -5448,8 +5456,6 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_fs_info(fs_info, argp);
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(fs_info, argp);
- case BTRFS_IOC_BALANCE:
- return btrfs_ioctl_balance(file, NULL);
case BTRFS_IOC_TREE_SEARCH:
return btrfs_ioctl_tree_search(inode, argp);
case BTRFS_IOC_TREE_SEARCH_V2:
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 11089568b287..8cd713d37ad2 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3699,6 +3699,31 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
if (!cache)
goto skip;
+ ASSERT(cache->start <= chunk_offset);
+ /*
+ * We are using the commit root to search for device extents, so
+ * that means we could have found a device extent item from a
+ * block group that was deleted in the current transaction. The
+ * logical start offset of the deleted block group, stored at
+ * @chunk_offset, might be part of the logical address range of
+ * a new block group (which uses different physical extents).
+ * In this case btrfs_lookup_block_group() has returned the new
+ * block group, and its start address is less than @chunk_offset.
+ *
+ * We skip such new block groups, because it's pointless to
+ * process them, as we won't find their extents because we search
+ * for them using the commit root of the extent tree. For a device
+ * replace it's also fine to skip it, we won't miss copying them
+ * to the target device because we have the write duplication
+ * setup through the regular write path (by btrfs_map_block()),
+ * and we have committed a transaction when we started the device
+ * replace, right after setting up the device replace state.
+ */
+ if (cache->start < chunk_offset) {
+ btrfs_put_block_group(cache);
+ goto skip;
+ }
+
if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
spin_lock(&cache->lock);
if (!cache->to_copy) {
@@ -3822,7 +3847,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->item_needs_writeback = 1;
up_write(&dev_replace->rwsem);
- ASSERT(cache->start == chunk_offset);
ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
dev_extent_len);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 571dae8ad65e..09e4f1a04e6f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3188,6 +3188,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
mutex_unlock(&fs_info->tree_root->log_mutex);
+ blk_finish_plug(&plug);
goto out;
}
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1be7cb2f955f..a8cc736731fd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1896,23 +1896,18 @@ static void update_dev_time(const char *device_path)
path_put(&path);
}
-static int btrfs_rm_dev_item(struct btrfs_device *device)
+static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
{
struct btrfs_root *root = device->fs_info->chunk_root;
int ret;
struct btrfs_path *path;
struct btrfs_key key;
- struct btrfs_trans_handle *trans;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- btrfs_free_path(path);
- return PTR_ERR(trans);
- }
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.type = BTRFS_DEV_ITEM_KEY;
key.offset = device->devid;
@@ -1923,21 +1918,12 @@ static int btrfs_rm_dev_item(struct btrfs_device *device)
if (ret) {
if (ret > 0)
ret = -ENOENT;
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
goto out;
}
ret = btrfs_del_item(trans, root, path);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- btrfs_end_transaction(trans);
- }
-
out:
btrfs_free_path(path);
- if (!ret)
- ret = btrfs_commit_transaction(trans);
return ret;
}
@@ -2078,6 +2064,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
struct btrfs_dev_lookup_args *args,
struct block_device **bdev, fmode_t *mode)
{
+ struct btrfs_trans_handle *trans;
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
@@ -2098,7 +2085,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
- goto out;
+ return ret;
device = btrfs_find_device(fs_info->fs_devices, args);
if (!device) {
@@ -2106,27 +2093,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
ret = -ENOENT;
- goto out;
+ return ret;
}
if (btrfs_pinned_by_swapfile(fs_info, device)) {
btrfs_warn_in_rcu(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
rcu_str_deref(device->name), device->devid);
- ret = -ETXTBSY;
- goto out;
+ return -ETXTBSY;
}
- if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
- ret = BTRFS_ERROR_DEV_TGT_REPLACE;
- goto out;
- }
+ if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
+ return BTRFS_ERROR_DEV_TGT_REPLACE;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
- fs_info->fs_devices->rw_devices == 1) {
- ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
- goto out;
- }
+ fs_info->fs_devices->rw_devices == 1)
+ return BTRFS_ERROR_DEV_ONLY_WRITABLE;
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
mutex_lock(&fs_info->chunk_mutex);
@@ -2139,14 +2121,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
if (ret)
goto error_undo;
- /*
- * TODO: the superblock still includes this device in its num_devices
- * counter although write_all_supers() is not locked out. This
- * could give a filesystem state which requires a degraded mount.
- */
- ret = btrfs_rm_dev_item(device);
- if (ret)
+ trans = btrfs_start_transaction(fs_info->chunk_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
goto error_undo;
+ }
+
+ ret = btrfs_rm_dev_item(trans, device);
+ if (ret) {
+ /* Any error in dev item removal is critical */
+ btrfs_crit(fs_info,
+ "failed to remove device item for devid %llu: %d",
+ device->devid, ret);
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
btrfs_scrub_cancel_dev(device);
@@ -2229,7 +2219,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
free_fs_devices(cur_devices);
}
-out:
+ ret = btrfs_commit_transaction(trans);
+
return ret;
error_undo:
@@ -2240,7 +2231,7 @@ error_undo:
device->fs_devices->rw_devices++;
mutex_unlock(&fs_info->chunk_mutex);
}
- goto out;
+ return ret;
}
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
@@ -4439,10 +4430,12 @@ static int balance_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
+ sb_start_write(fs_info->sb);
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
+ sb_end_write(fs_info->sb);
return ret;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bd297f23d19e..f3e28f11cfb6 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -328,6 +328,9 @@ struct btrfs_fs_devices {
struct btrfs_bio {
unsigned int mirror_num;
+ /* for direct I/O */
+ u64 file_offset;
+
/* @device is for stripe IO submission. */
struct btrfs_device *device;
u8 *csum;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index b7b5fac1c779..1b1b310c3c51 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1801,7 +1801,6 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
map = em->map_lookup;
/* We only support single profile for now */
- ASSERT(map->num_stripes == 1);
device = map->stripes[0].dev;
free_extent_map(em);
@@ -1976,18 +1975,16 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
{
+ struct btrfs_fs_info *fs_info = fs_devices->fs_info;
struct btrfs_device *device;
bool ret = false;
- if (!btrfs_is_zoned(fs_devices->fs_info))
+ if (!btrfs_is_zoned(fs_info))
return true;
- /* Non-single profiles are not supported yet */
- ASSERT((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0);
-
/* Check if there is a device with active zones left */
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ mutex_lock(&fs_info->chunk_mutex);
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
struct btrfs_zoned_device_info *zinfo = device->zone_info;
if (!device->bdev)
@@ -1999,7 +1996,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
break;
}
}
- mutex_unlock(&fs_devices->device_list_mutex);
+ mutex_unlock(&fs_info->chunk_mutex);
return ret;
}
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index cbf016a7bb5d..6dee76248cb4 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -359,7 +359,7 @@ static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
struct btrfs_root *root = inode->root;
if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
- btrfs_inode_lock(&inode->vfs_inode, 0);
+ mutex_lock(&root->fs_info->zoned_data_reloc_io_lock);
}
static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
@@ -367,7 +367,7 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
struct btrfs_root *root = inode->root;
if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
- btrfs_inode_unlock(&inode->vfs_inode, 0);
+ mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
}
#endif