summary | refs | log | tree | commit | diff | stats
path: root/fs/btrfs/ioctl.c
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2022-12-12 20:47:51 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2022-12-12 20:47:51 -0800
commit: 149c51f876322d9bfbd5e2d6ffae7aff3d794384 (patch)
tree: a61c7dd828356e307fca06fc66dbdbf9b109c18f /fs/btrfs/ioctl.c
parent: 97971df811b8854882c0f6c6631e23ab8cdcc44f (diff)
parent: b7af0635c87ff78d6bd523298ab7471f9ffd3ce5 (diff)
download: linux-149c51f876322d9bfbd5e2d6ffae7aff3d794384.tar.bz2
Merge tag 'for-6.2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
 "This round there are a lot of cleanups and moved code so the diffstat
  looks huge, otherwise there are some nice performance improvements and
  an update to raid56 reliability.

  User visible features:

   - raid56 reliability vs performance trade off:
      - fix destructive RMW for raid5 data (raid6 still needs work): do
        full checksum verification for all data during RMW cycle, this
        should prevent rewriting potentially corrupted data without
        notice
      - stripes are cached in memory which should reduce the performance
        impact but still can hurt some workloads
      - checksums are verified after repair again
      - this is the last option without introducing additional features
        (write intent bitmap, journal, another tree), the extra checksum
        read/verification was supposed to be avoided by the original
        implementation exactly for performance reasons but that caused
        all the reliability problems

   - discard=async by default for devices that support it

   - implement emergency flush reserve to avoid almost all unnecessary
     transaction aborts due to ENOSPC in cases where there are too many
     delayed refs or delayed allocation

   - skip block group synchronization if there's no change in used
     bytes, can reduce transaction commit count for some workloads

  Performance improvements:

   - fiemap and lseek:
      - overall speedup due to skipping unnecessary or duplicate
        searches (-40% run time)
      - cache some data structures and sharedness of extents (-30% run
        time)

   - send:
      - faster backref resolution when finding clones
      - cached leaf to root mapping for faster backref walking
      - improved clone/sharing detection
      - overall run time improvements (-70%)

  Core:

   - module initialization converted to a table of function pointers run
     in a sequence

   - preparation for fscrypt, extend passing file names across calls,
     dir item can store encryption status

   - raid56 updates:
      - more accurate error tracking of sectors within stripe
      - simplify recovery path and remove dedicated endio worker kthread
      - simplify scrub call paths
      - refactoring to support the extra data checksum verification
        during RMW cycle

   - tree block parentness checks consolidated and done at metadata read
     time

   - improved error handling

   - cleanups:
      - move a lot of code for better synchronization between kernel and
        user space sources, split big files
      - enum cleanups
      - GFP flag cleanups
      - header file cleanups, prototypes, dependencies
      - redundant parameter cleanups
      - inline extent handling simplifications
      - inode parameter conversion
      - data structure cleanups, reductions, renames, merges"

* tag 'for-6.2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (249 commits)
  btrfs: print transaction aborted messages with an error level
  btrfs: sync some cleanups from progs into uapi/btrfs.h
  btrfs: do not BUG_ON() on ENOMEM when dropping extent items for a range
  btrfs: fix extent map use-after-free when handling missing device in read_one_chunk
  btrfs: remove outdated logic from overwrite_item() and add assertion
  btrfs: unify overwrite_item() and do_overwrite_item()
  btrfs: replace strncpy() with strscpy()
  btrfs: fix uninitialized variable in find_first_clear_extent_bit
  btrfs: fix uninitialized parent in insert_state
  btrfs: add might_sleep() annotations
  btrfs: add stack helpers for a few btrfs items
  btrfs: add nr_global_roots to the super block definition
  btrfs: remove BTRFS_LEAF_DATA_OFFSET
  btrfs: add helpers for manipulating leaf items and data
  btrfs: add eb to btrfs_node_key_ptr_offset
  btrfs: pass the extent buffer for the btrfs_item_nr helpers
  btrfs: move the csum helpers into ctree.h
  btrfs: move eb offset helpers into extent_io.h
  btrfs: move file_extent_item helpers into file-item.h
  btrfs: move leaf_data_end into ctree.c
  ...
Diffstat (limited to 'fs/btrfs/ioctl.c')
-rw-r--r-- fs/btrfs/ioctl.c 945
1 files changed, 26 insertions, 919 deletions
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f897be9ec1e9..7e348bd2ccde 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -50,6 +50,17 @@
#include "delalloc-space.h"
#include "block-group.h"
#include "subpage.h"
+#include "fs.h"
+#include "accessors.h"
+#include "extent-tree.h"
+#include "root-tree.h"
+#include "defrag.h"
+#include "dir-item.h"
+#include "uuid-tree.h"
+#include "ioctl.h"
+#include "file.h"
+#include "scrub.h"
+#include "super.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
@@ -949,6 +960,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
struct inode *dir = d_inode(parent->dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct dentry *dentry;
+ struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen);
int error;
error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
@@ -969,8 +981,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
* check for them now when we can safely fail
*/
error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
- dir->i_ino, name,
- namelen);
+ dir->i_ino, &name_str);
if (error)
goto out_dput;
@@ -991,7 +1002,7 @@ out_up_read:
out_dput:
dput(dentry);
out_unlock:
- btrfs_inode_unlock(dir, 0);
+ btrfs_inode_unlock(BTRFS_I(dir), 0);
return error;
}
@@ -1036,908 +1047,6 @@ out:
}
/*
- * Defrag specific helper to get an extent map.
- *
- * Differences between this and btrfs_get_extent() are:
- *
- * - No extent_map will be added to inode->extent_tree
- * To reduce memory usage in the long run.
- *
- * - Extra optimization to skip file extents older than @newer_than
- * By using btrfs_search_forward() we can skip entire file ranges that
- * have extents created in past transactions, because btrfs_search_forward()
- * will not visit leaves and nodes with a generation smaller than given
- * minimal generation threshold (@newer_than).
- *
- * Return valid em if we find a file extent matching the requirement.
- * Return NULL if we can not find a file extent matching the requirement.
- *
- * Return ERR_PTR() for error.
- */
-static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
- u64 start, u64 newer_than)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_file_extent_item *fi;
- struct btrfs_path path = { 0 };
- struct extent_map *em;
- struct btrfs_key key;
- u64 ino = btrfs_ino(inode);
- int ret;
-
- em = alloc_extent_map();
- if (!em) {
- ret = -ENOMEM;
- goto err;
- }
-
- key.objectid = ino;
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = start;
-
- if (newer_than) {
- ret = btrfs_search_forward(root, &key, &path, newer_than);
- if (ret < 0)
- goto err;
- /* Can't find anything newer */
- if (ret > 0)
- goto not_found;
- } else {
- ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
- if (ret < 0)
- goto err;
- }
- if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
- /*
- * If btrfs_search_slot() makes path to point beyond nritems,
- * we should not have an empty leaf, as this inode must at
- * least have its INODE_ITEM.
- */
- ASSERT(btrfs_header_nritems(path.nodes[0]));
- path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
- }
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
- /* Perfect match, no need to go one slot back */
- if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
- key.offset == start)
- goto iterate;
-
- /* We didn't find a perfect match, needs to go one slot back */
- if (path.slots[0] > 0) {
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
- if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
- path.slots[0]--;
- }
-
-iterate:
- /* Iterate through the path to find a file extent covering @start */
- while (true) {
- u64 extent_end;
-
- if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
- goto next;
-
- btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
-
- /*
- * We may go one slot back to INODE_REF/XATTR item, then
- * need to go forward until we reach an EXTENT_DATA.
- * But we should still has the correct ino as key.objectid.
- */
- if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
- goto next;
-
- /* It's beyond our target range, definitely not extent found */
- if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
- goto not_found;
-
- /*
- * | |<- File extent ->|
- * \- start
- *
- * This means there is a hole between start and key.offset.
- */
- if (key.offset > start) {
- em->start = start;
- em->orig_start = start;
- em->block_start = EXTENT_MAP_HOLE;
- em->len = key.offset - start;
- break;
- }
-
- fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
- struct btrfs_file_extent_item);
- extent_end = btrfs_file_extent_end(&path);
-
- /*
- * |<- file extent ->| |
- * \- start
- *
- * We haven't reached start, search next slot.
- */
- if (extent_end <= start)
- goto next;
-
- /* Now this extent covers @start, convert it to em */
- btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
- break;
-next:
- ret = btrfs_next_item(root, &path);
- if (ret < 0)
- goto err;
- if (ret > 0)
- goto not_found;
- }
- btrfs_release_path(&path);
- return em;
-
-not_found:
- btrfs_release_path(&path);
- free_extent_map(em);
- return NULL;
-
-err:
- btrfs_release_path(&path);
- free_extent_map(em);
- return ERR_PTR(ret);
-}
-
-static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
- u64 newer_than, bool locked)
-{
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- struct extent_map *em;
- const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
-
- /*
- * hopefully we have this extent in the tree already, try without
- * the full extent lock
- */
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, sectorsize);
- read_unlock(&em_tree->lock);
-
- /*
- * We can get a merged extent, in that case, we need to re-search
- * tree to get the original em for defrag.
- *
- * If @newer_than is 0 or em::generation < newer_than, we can trust
- * this em, as either we don't care about the generation, or the
- * merged extent map will be rejected anyway.
- */
- if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
- newer_than && em->generation >= newer_than) {
- free_extent_map(em);
- em = NULL;
- }
-
- if (!em) {
- struct extent_state *cached = NULL;
- u64 end = start + sectorsize - 1;
-
- /* get the big lock and read metadata off disk */
- if (!locked)
- lock_extent(io_tree, start, end, &cached);
- em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
- if (!locked)
- unlock_extent(io_tree, start, end, &cached);
-
- if (IS_ERR(em))
- return NULL;
- }
-
- return em;
-}
-
-static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
- const struct extent_map *em)
-{
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- return BTRFS_MAX_COMPRESSED;
- return fs_info->max_extent_size;
-}
-
-static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
- u32 extent_thresh, u64 newer_than, bool locked)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_map *next;
- bool ret = false;
-
- /* this is the last extent */
- if (em->start + em->len >= i_size_read(inode))
- return false;
-
- /*
- * Here we need to pass @newer_then when checking the next extent, or
- * we will hit a case we mark current extent for defrag, but the next
- * one will not be a target.
- * This will just cause extra IO without really reducing the fragments.
- */
- next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
- /* No more em or hole */
- if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
- goto out;
- if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
- goto out;
- /*
- * If the next extent is at its max capacity, defragging current extent
- * makes no sense, as the total number of extents won't change.
- */
- if (next->len >= get_extent_max_capacity(fs_info, em))
- goto out;
- /* Skip older extent */
- if (next->generation < newer_than)
- goto out;
- /* Also check extent size */
- if (next->len >= extent_thresh)
- goto out;
-
- ret = true;
-out:
- free_extent_map(next);
- return ret;
-}
-
-/*
- * Prepare one page to be defragged.
- *
- * This will ensure:
- *
- * - Returned page is locked and has been set up properly.
- * - No ordered extent exists in the page.
- * - The page is uptodate.
- *
- * NOTE: Caller should also wait for page writeback after the cluster is
- * prepared, here we don't do writeback wait for each page.
- */
-static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
- pgoff_t index)
-{
- struct address_space *mapping = inode->vfs_inode.i_mapping;
- gfp_t mask = btrfs_alloc_write_mask(mapping);
- u64 page_start = (u64)index << PAGE_SHIFT;
- u64 page_end = page_start + PAGE_SIZE - 1;
- struct extent_state *cached_state = NULL;
- struct page *page;
- int ret;
-
-again:
- page = find_or_create_page(mapping, index, mask);
- if (!page)
- return ERR_PTR(-ENOMEM);
-
- /*
- * Since we can defragment files opened read-only, we can encounter
- * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
- * can't do I/O using huge pages yet, so return an error for now.
- * Filesystem transparent huge pages are typically only used for
- * executables that explicitly enable them, so this isn't very
- * restrictive.
- */
- if (PageCompound(page)) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-ETXTBSY);
- }
-
- ret = set_page_extent_mapped(page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(ret);
- }
-
- /* Wait for any existing ordered extent in the range */
- while (1) {
- struct btrfs_ordered_extent *ordered;
-
- lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
- unlock_extent(&inode->io_tree, page_start, page_end,
- &cached_state);
- if (!ordered)
- break;
-
- unlock_page(page);
- btrfs_start_ordered_extent(ordered, 1);
- btrfs_put_ordered_extent(ordered);
- lock_page(page);
- /*
- * We unlocked the page above, so we need check if it was
- * released or not.
- */
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
- goto again;
- }
- }
-
- /*
- * Now the page range has no ordered extent any more. Read the page to
- * make it uptodate.
- */
- if (!PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != mapping || !PagePrivate(page)) {
- unlock_page(page);
- put_page(page);
- goto again;
- }
- if (!PageUptodate(page)) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-EIO);
- }
- }
- return page;
-}
-
-struct defrag_target_range {
- struct list_head list;
- u64 start;
- u64 len;
-};
-
-/*
- * Collect all valid target extents.
- *
- * @start: file offset to lookup
- * @len: length to lookup
- * @extent_thresh: file extent size threshold, any extent size >= this value
- * will be ignored
- * @newer_than: only defrag extents newer than this value
- * @do_compress: whether the defrag is doing compression
- * if true, @extent_thresh will be ignored and all regular
- * file extents meeting @newer_than will be targets.
- * @locked: if the range has already held extent lock
- * @target_list: list of targets file extents
- */
-static int defrag_collect_targets(struct btrfs_inode *inode,
- u64 start, u64 len, u32 extent_thresh,
- u64 newer_than, bool do_compress,
- bool locked, struct list_head *target_list,
- u64 *last_scanned_ret)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- bool last_is_target = false;
- u64 cur = start;
- int ret = 0;
-
- while (cur < start + len) {
- struct extent_map *em;
- struct defrag_target_range *new;
- bool next_mergeable = true;
- u64 range_len;
-
- last_is_target = false;
- em = defrag_lookup_extent(&inode->vfs_inode, cur,
- newer_than, locked);
- if (!em)
- break;
-
- /*
- * If the file extent is an inlined one, we may still want to
- * defrag it (fallthrough) if it will cause a regular extent.
- * This is for users who want to convert inline extents to
- * regular ones through max_inline= mount option.
- */
- if (em->block_start == EXTENT_MAP_INLINE &&
- em->len <= inode->root->fs_info->max_inline)
- goto next;
-
- /* Skip hole/delalloc/preallocated extents */
- if (em->block_start == EXTENT_MAP_HOLE ||
- em->block_start == EXTENT_MAP_DELALLOC ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- goto next;
-
- /* Skip older extent */
- if (em->generation < newer_than)
- goto next;
-
- /* This em is under writeback, no need to defrag */
- if (em->generation == (u64)-1)
- goto next;
-
- /*
- * Our start offset might be in the middle of an existing extent
- * map, so take that into account.
- */
- range_len = em->len - (cur - em->start);
- /*
- * If this range of the extent map is already flagged for delalloc,
- * skip it, because:
- *
- * 1) We could deadlock later, when trying to reserve space for
- * delalloc, because in case we can't immediately reserve space
- * the flusher can start delalloc and wait for the respective
- * ordered extents to complete. The deadlock would happen
- * because we do the space reservation while holding the range
- * locked, and starting writeback, or finishing an ordered
- * extent, requires locking the range;
- *
- * 2) If there's delalloc there, it means there's dirty pages for
- * which writeback has not started yet (we clean the delalloc
- * flag when starting writeback and after creating an ordered
- * extent). If we mark pages in an adjacent range for defrag,
- * then we will have a larger contiguous range for delalloc,
- * very likely resulting in a larger extent after writeback is
- * triggered (except in a case of free space fragmentation).
- */
- if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
- EXTENT_DELALLOC, 0, NULL))
- goto next;
-
- /*
- * For do_compress case, we want to compress all valid file
- * extents, thus no @extent_thresh or mergeable check.
- */
- if (do_compress)
- goto add;
-
- /* Skip too large extent */
- if (range_len >= extent_thresh)
- goto next;
-
- /*
- * Skip extents already at its max capacity, this is mostly for
- * compressed extents, which max cap is only 128K.
- */
- if (em->len >= get_extent_max_capacity(fs_info, em))
- goto next;
-
- /*
- * Normally there are no more extents after an inline one, thus
- * @next_mergeable will normally be false and not defragged.
- * So if an inline extent passed all above checks, just add it
- * for defrag, and be converted to regular extents.
- */
- if (em->block_start == EXTENT_MAP_INLINE)
- goto add;
-
- next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
- extent_thresh, newer_than, locked);
- if (!next_mergeable) {
- struct defrag_target_range *last;
-
- /* Empty target list, no way to merge with last entry */
- if (list_empty(target_list))
- goto next;
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
- /* Not mergeable with last entry */
- if (last->start + last->len != cur)
- goto next;
-
- /* Mergeable, fall through to add it to @target_list. */
- }
-
-add:
- last_is_target = true;
- range_len = min(extent_map_end(em), start + len) - cur;
- /*
- * This one is a good target, check if it can be merged into
- * last range of the target list.
- */
- if (!list_empty(target_list)) {
- struct defrag_target_range *last;
-
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
- ASSERT(last->start + last->len <= cur);
- if (last->start + last->len == cur) {
- /* Mergeable, enlarge the last entry */
- last->len += range_len;
- goto next;
- }
- /* Fall through to allocate a new entry */
- }
-
- /* Allocate new defrag_target_range */
- new = kmalloc(sizeof(*new), GFP_NOFS);
- if (!new) {
- free_extent_map(em);
- ret = -ENOMEM;
- break;
- }
- new->start = cur;
- new->len = range_len;
- list_add_tail(&new->list, target_list);
-
-next:
- cur = extent_map_end(em);
- free_extent_map(em);
- }
- if (ret < 0) {
- struct defrag_target_range *entry;
- struct defrag_target_range *tmp;
-
- list_for_each_entry_safe(entry, tmp, target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
- }
- if (!ret && last_scanned_ret) {
- /*
- * If the last extent is not a target, the caller can skip to
- * the end of that extent.
- * Otherwise, we can only go the end of the specified range.
- */
- if (!last_is_target)
- *last_scanned_ret = max(cur, *last_scanned_ret);
- else
- *last_scanned_ret = max(start + len, *last_scanned_ret);
- }
- return ret;
-}
-
-#define CLUSTER_SIZE (SZ_256K)
-static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
-
-/*
- * Defrag one contiguous target range.
- *
- * @inode: target inode
- * @target: target range to defrag
- * @pages: locked pages covering the defrag range
- * @nr_pages: number of locked pages
- *
- * Caller should ensure:
- *
- * - Pages are prepared
- * Pages should be locked, no ordered extent in the pages range,
- * no writeback.
- *
- * - Extent bits are locked
- */
-static int defrag_one_locked_target(struct btrfs_inode *inode,
- struct defrag_target_range *target,
- struct page **pages, int nr_pages,
- struct extent_state **cached_state)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_changeset *data_reserved = NULL;
- const u64 start = target->start;
- const u64 len = target->len;
- unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
- unsigned long start_index = start >> PAGE_SHIFT;
- unsigned long first_index = page_index(pages[0]);
- int ret = 0;
- int i;
-
- ASSERT(last_index - first_index + 1 <= nr_pages);
-
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
- if (ret < 0)
- return ret;
- clear_extent_bit(&inode->io_tree, start, start + len - 1,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, cached_state);
- set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
-
- /* Update the page status */
- for (i = start_index - first_index; i <= last_index - first_index; i++) {
- ClearPageChecked(pages[i]);
- btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
- }
- btrfs_delalloc_release_extents(inode, len);
- extent_changeset_free(data_reserved);
-
- return ret;
-}
-
-static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
- u32 extent_thresh, u64 newer_than, bool do_compress,
- u64 *last_scanned_ret)
-{
- struct extent_state *cached_state = NULL;
- struct defrag_target_range *entry;
- struct defrag_target_range *tmp;
- LIST_HEAD(target_list);
- struct page **pages;
- const u32 sectorsize = inode->root->fs_info->sectorsize;
- u64 last_index = (start + len - 1) >> PAGE_SHIFT;
- u64 start_index = start >> PAGE_SHIFT;
- unsigned int nr_pages = last_index - start_index + 1;
- int ret = 0;
- int i;
-
- ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
- ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
-
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
- if (!pages)
- return -ENOMEM;
-
- /* Prepare all pages */
- for (i = 0; i < nr_pages; i++) {
- pages[i] = defrag_prepare_one_page(inode, start_index + i);
- if (IS_ERR(pages[i])) {
- ret = PTR_ERR(pages[i]);
- pages[i] = NULL;
- goto free_pages;
- }
- }
- for (i = 0; i < nr_pages; i++)
- wait_on_page_writeback(pages[i]);
-
- /* Lock the pages range */
- lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
- /*
- * Now we have a consistent view about the extent map, re-check
- * which range really needs to be defragged.
- *
- * And this time we have extent locked already, pass @locked = true
- * so that we won't relock the extent range and cause deadlock.
- */
- ret = defrag_collect_targets(inode, start, len, extent_thresh,
- newer_than, do_compress, true,
- &target_list, last_scanned_ret);
- if (ret < 0)
- goto unlock_extent;
-
- list_for_each_entry(entry, &target_list, list) {
- ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
- &cached_state);
- if (ret < 0)
- break;
- }
-
- list_for_each_entry_safe(entry, tmp, &target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
-unlock_extent:
- unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
-free_pages:
- for (i = 0; i < nr_pages; i++) {
- if (pages[i]) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
- }
- kfree(pages);
- return ret;
-}
-
-static int defrag_one_cluster(struct btrfs_inode *inode,
- struct file_ra_state *ra,
- u64 start, u32 len, u32 extent_thresh,
- u64 newer_than, bool do_compress,
- unsigned long *sectors_defragged,
- unsigned long max_sectors,
- u64 *last_scanned_ret)
-{
- const u32 sectorsize = inode->root->fs_info->sectorsize;
- struct defrag_target_range *entry;
- struct defrag_target_range *tmp;
- LIST_HEAD(target_list);
- int ret;
-
- ret = defrag_collect_targets(inode, start, len, extent_thresh,
- newer_than, do_compress, false,
- &target_list, NULL);
- if (ret < 0)
- goto out;
-
- list_for_each_entry(entry, &target_list, list) {
- u32 range_len = entry->len;
-
- /* Reached or beyond the limit */
- if (max_sectors && *sectors_defragged >= max_sectors) {
- ret = 1;
- break;
- }
-
- if (max_sectors)
- range_len = min_t(u32, range_len,
- (max_sectors - *sectors_defragged) * sectorsize);
-
- /*
- * If defrag_one_range() has updated last_scanned_ret,
- * our range may already be invalid (e.g. hole punched).
- * Skip if our range is before last_scanned_ret, as there is
- * no need to defrag the range anymore.
- */
- if (entry->start + range_len <= *last_scanned_ret)
- continue;
-
- if (ra)
- page_cache_sync_readahead(inode->vfs_inode.i_mapping,
- ra, NULL, entry->start >> PAGE_SHIFT,
- ((entry->start + range_len - 1) >> PAGE_SHIFT) -
- (entry->start >> PAGE_SHIFT) + 1);
- /*
- * Here we may not defrag any range if holes are punched before
- * we locked the pages.
- * But that's fine, it only affects the @sectors_defragged
- * accounting.
- */
- ret = defrag_one_range(inode, entry->start, range_len,
- extent_thresh, newer_than, do_compress,
- last_scanned_ret);
- if (ret < 0)
- break;
- *sectors_defragged += range_len >>
- inode->root->fs_info->sectorsize_bits;
- }
-out:
- list_for_each_entry_safe(entry, tmp, &target_list, list) {
- list_del_init(&entry->list);
- kfree(entry);
- }
- if (ret >= 0)
- *last_scanned_ret = max(*last_scanned_ret, start + len);
- return ret;
-}
-
-/*
- * Entry point to file defragmentation.
- *
- * @inode: inode to be defragged
- * @ra: readahead state (can be NUL)
- * @range: defrag options including range and flags
- * @newer_than: minimum transid to defrag
- * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
- * will be defragged.
- *
- * Return <0 for error.
- * Return >=0 for the number of sectors defragged, and range->start will be updated
- * to indicate the file offset where next defrag should be started at.
- * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
- * defragging all the range).
- */
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
- struct btrfs_ioctl_defrag_range_args *range,
- u64 newer_than, unsigned long max_to_defrag)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- unsigned long sectors_defragged = 0;
- u64 isize = i_size_read(inode);
- u64 cur;
- u64 last_byte;
- bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
- bool ra_allocated = false;
- int compress_type = BTRFS_COMPRESS_ZLIB;
- int ret = 0;
- u32 extent_thresh = range->extent_thresh;
- pgoff_t start_index;
-
- if (isize == 0)
- return 0;
-
- if (range->start >= isize)
- return -EINVAL;
-
- if (do_compress) {
- if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
- return -EINVAL;
- if (range->compress_type)
- compress_type = range->compress_type;
- }
-
- if (extent_thresh == 0)
- extent_thresh = SZ_256K;
-
- if (range->start + range->len > range->start) {
- /* Got a specific range */
- last_byte = min(isize, range->start + range->len);
- } else {
- /* Defrag until file end */
- last_byte = isize;
- }
-
- /* Align the range */
- cur = round_down(range->start, fs_info->sectorsize);
- last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
-
- /*
- * If we were not given a ra, allocate a readahead context. As
- * readahead is just an optimization, defrag will work without it so
- * we don't error out.
- */
- if (!ra) {
- ra_allocated = true;
- ra = kzalloc(sizeof(*ra), GFP_KERNEL);
- if (ra)
- file_ra_state_init(ra, inode->i_mapping);
- }
-
- /*
- * Make writeback start from the beginning of the range, so that the
- * defrag range can be written sequentially.
- */
- start_index = cur >> PAGE_SHIFT;
- if (start_index < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = start_index;
-
- while (cur < last_byte) {
- const unsigned long prev_sectors_defragged = sectors_defragged;
- u64 last_scanned = cur;
- u64 cluster_end;
-
- if (btrfs_defrag_cancelled(fs_info)) {
- ret = -EAGAIN;
- break;
- }
-
- /* We want the cluster end at page boundary when possible */
- cluster_end = (((cur >> PAGE_SHIFT) +
- (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
- cluster_end = min(cluster_end, last_byte);
-
- btrfs_inode_lock(inode, 0);
- if (IS_SWAPFILE(inode)) {
- ret = -ETXTBSY;
- btrfs_inode_unlock(inode, 0);
- break;
- }
- if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
- btrfs_inode_unlock(inode, 0);
- break;
- }
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
- cluster_end + 1 - cur, extent_thresh,
- newer_than, do_compress, &sectors_defragged,
- max_to_defrag, &last_scanned);
-
- if (sectors_defragged > prev_sectors_defragged)
- balance_dirty_pages_ratelimited(inode->i_mapping);
-
- btrfs_inode_unlock(inode, 0);
- if (ret < 0)
- break;
- cur = max(cluster_end + 1, last_scanned);
- if (ret > 0) {
- ret = 0;
- break;
- }
- cond_resched();
- }
-
- if (ra_allocated)
- kfree(ra);
- /*
- * Update range.start for autodefrag, this will indicate where to start
- * in next run.
- */
- range->start = cur;
- if (sectors_defragged) {
- /*
- * We have defragged some sectors, for compression case they
- * need to be written back immediately.
- */
- if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
- filemap_flush(inode->i_mapping);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_flush(inode->i_mapping);
- }
- if (range->compress_type == BTRFS_COMPRESS_LZO)
- btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
- else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
- btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
- ret = sectors_defragged;
- }
- if (do_compress) {
- btrfs_inode_lock(inode, 0);
- BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
- btrfs_inode_unlock(inode, 0);
- }
- return ret;
-}
-
-/*
* Try to start exclusive operation @type or cancel it if it's running.
*
* Return:
@@ -2119,7 +1228,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret == 0 && new_size != old_size)
btrfs_info_in_rcu(fs_info,
"resize device %s (devid %llu) from %llu to %llu",
- rcu_str_deref(device->name), device->devid,
+ btrfs_dev_name(device), device->devid,
old_size, new_size);
out_finish:
btrfs_exclop_finish(fs_info);
@@ -3274,7 +2383,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dentry = btrfs_get_dentry(fs_info->sb,
BTRFS_FIRST_FREE_OBJECTID,
- vol_args2->subvolid, 0, 0);
+ vol_args2->subvolid, 0);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out_drop_write;
@@ -3419,16 +2528,16 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_dput;
}
- btrfs_inode_lock(inode, 0);
- err = btrfs_delete_subvolume(dir, dentry);
- btrfs_inode_unlock(inode, 0);
+ btrfs_inode_lock(BTRFS_I(inode), 0);
+ err = btrfs_delete_subvolume(BTRFS_I(dir), dentry);
+ btrfs_inode_unlock(BTRFS_I(inode), 0);
if (!err)
d_delete_notify(dir, dentry);
out_dput:
dput(dentry);
out_unlock_dir:
- btrfs_inode_unlock(dir, 0);
+ btrfs_inode_unlock(BTRFS_I(dir), 0);
free_subvol_name:
kfree(subvol_name_ptr);
free_parent:
@@ -3750,13 +2859,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
di_args->bytes_used = btrfs_device_get_bytes_used(dev);
di_args->total_bytes = btrfs_device_get_total_bytes(dev);
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
- if (dev->name) {
- strncpy(di_args->path, rcu_str_deref(dev->name),
- sizeof(di_args->path) - 1);
- di_args->path[sizeof(di_args->path) - 1] = 0;
- } else {
+ if (dev->name)
+ strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path));
+ else
di_args->path[0] = '\0';
- }
out:
rcu_read_unlock();
@@ -3777,6 +2883,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
struct btrfs_trans_handle *trans;
struct btrfs_path *path = NULL;
struct btrfs_disk_key disk_key;
+ struct fscrypt_str name = FSTR_INIT("default", 7);
u64 objectid = 0;
u64 dir_id;
int ret;
@@ -3820,7 +2927,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
- dir_id, "default", 7, 1);
+ dir_id, &name, 1);
if (IS_ERR_OR_NULL(di)) {
btrfs_release_path(path);
btrfs_end_transaction(trans);
@@ -4341,7 +3448,7 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
spin_unlock(&fs_info->balance_lock);
}
-/**
+/*
* Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as
* required.
*