diff options
Diffstat (limited to 'fs')
89 files changed, 3856 insertions, 2092 deletions
diff --git a/fs/Makefile b/fs/Makefile index f79cf4043e60..79f522575cba 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -63,10 +63,11 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_FSCACHE) += fscache/ obj-$(CONFIG_REISERFS_FS) += reiserfs/ -obj-$(CONFIG_EXT2_FS) += ext2/ -# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2 -# unless explicitly requested by rootfstype obj-$(CONFIG_EXT4_FS) += ext4/ +# We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the +# ext2 driver, which doesn't know about journalling! Explicitly request ext2 +# by giving the rootfstype= parameter. +obj-$(CONFIG_EXT2_FS) += ext2/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_CRAMFS) += cramfs/ obj-$(CONFIG_SQUASHFS) += squashfs/ diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9a2ec79e8cfb..6dcdb2ec9211 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -362,6 +362,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } + if (btrfs_test_is_dummy_root(root)) { + srcu_read_unlock(&fs_info->subvol_srcu, index); + ret = -ENOENT; + goto out; + } + if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); else if (time_seq == (u64)-1) diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 541fbfaed276..0340c57bf377 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -667,7 +667,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); if (NULL == selected_super) { printk(KERN_INFO "btrfsic: error, kmalloc failed!\n"); - return -1; + return -ENOMEM; } list_for_each_entry(device, dev_head, dev_list) { @@ -845,8 +845,8 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp->never_written = 0; superblock_tmp->mirror_num = 1 + superblock_mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)" - " @%llu (%s/%llu/%d)\n", + btrfs_info_in_rcu(device->dev_root->fs_info, + "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)", superblock_bdev, rcu_str_deref(device->name), dev_bytenr, dev_state->name, dev_bytenr, @@ -1660,7 +1660,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, sizeof(*block_ctx->pagev)) * num_pages, GFP_NOFS); if (!block_ctx->mem_to_free) - return -1; + return -ENOMEM; block_ctx->datav = block_ctx->mem_to_free; block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); for (i = 0; i < num_pages; i++) { diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 36dfeff2c1f4..c473c42d7d6c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -744,11 +744,13 @@ out: return ret; } -static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; -static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES]; -static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; -static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; -static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; +static struct { + struct list_head idle_ws; + spinlock_t ws_lock; + int num_ws; + atomic_t alloc_ws; + wait_queue_head_t ws_wait; +} btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, @@ -760,10 +762,10 @@ void __init btrfs_init_compress(void) int i; for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - INIT_LIST_HEAD(&comp_idle_workspace[i]); - spin_lock_init(&comp_workspace_lock[i]); - atomic_set(&comp_alloc_workspace[i], 0); - init_waitqueue_head(&comp_workspace_wait[i]); + INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); + spin_lock_init(&btrfs_comp_ws[i].ws_lock); + atomic_set(&btrfs_comp_ws[i].alloc_ws, 0); + init_waitqueue_head(&btrfs_comp_ws[i].ws_wait); } } @@ -777,38 +779,38 @@ static struct list_head *find_workspace(int type) int cpus = num_online_cpus(); int idx = type - 1; - struct list_head *idle_workspace = &comp_idle_workspace[idx]; - spinlock_t *workspace_lock = &comp_workspace_lock[idx]; - atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; - wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; - int *num_workspace = &comp_num_workspace[idx]; + struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; + spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; + atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; + int *num_ws = &btrfs_comp_ws[idx].num_ws; again: - spin_lock(workspace_lock); - if (!list_empty(idle_workspace)) { - workspace = idle_workspace->next; + spin_lock(ws_lock); + if (!list_empty(idle_ws)) { + workspace = idle_ws->next; list_del(workspace); - (*num_workspace)--; - spin_unlock(workspace_lock); + (*num_ws)--; + spin_unlock(ws_lock); return workspace; } - if (atomic_read(alloc_workspace) > cpus) { + if (atomic_read(alloc_ws) > cpus) { DEFINE_WAIT(wait); - spin_unlock(workspace_lock); - prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE); - if (atomic_read(alloc_workspace) > cpus && !*num_workspace) + spin_unlock(ws_lock); + prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(alloc_ws) > cpus && !*num_ws) schedule(); - finish_wait(workspace_wait, &wait); + finish_wait(ws_wait, &wait); goto again; } - atomic_inc(alloc_workspace); - spin_unlock(workspace_lock); + atomic_inc(alloc_ws); + spin_unlock(ws_lock); workspace = btrfs_compress_op[idx]->alloc_workspace(); if (IS_ERR(workspace)) { - atomic_dec(alloc_workspace); - wake_up(workspace_wait); + atomic_dec(alloc_ws); + wake_up(ws_wait); } return workspace; } @@ -820,27 +822,30 @@ again: static void free_workspace(int type, struct list_head *workspace) { int idx = type - 1; - struct list_head *idle_workspace = &comp_idle_workspace[idx]; - spinlock_t *workspace_lock = &comp_workspace_lock[idx]; - atomic_t *alloc_workspace = &comp_alloc_workspace[idx]; - wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx]; - int *num_workspace = &comp_num_workspace[idx]; - - spin_lock(workspace_lock); - if (*num_workspace < num_online_cpus()) { - list_add(workspace, idle_workspace); - (*num_workspace)++; - spin_unlock(workspace_lock); + struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; + spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; + atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; + wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; + int *num_ws = &btrfs_comp_ws[idx].num_ws; + + spin_lock(ws_lock); + if (*num_ws < num_online_cpus()) { + list_add(workspace, idle_ws); + (*num_ws)++; + spin_unlock(ws_lock); goto wake; } - spin_unlock(workspace_lock); + spin_unlock(ws_lock); btrfs_compress_op[idx]->free_workspace(workspace); - atomic_dec(alloc_workspace); + atomic_dec(alloc_ws); wake: + /* + * Make sure counter is updated before we wake up waiters. + */ smp_mb(); - if (waitqueue_active(workspace_wait)) - wake_up(workspace_wait); + if (waitqueue_active(ws_wait)) + wake_up(ws_wait); } /* @@ -852,11 +857,11 @@ static void free_workspaces(void) int i; for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { - while (!list_empty(&comp_idle_workspace[i])) { - workspace = comp_idle_workspace[i].next; + while (!list_empty(&btrfs_comp_ws[i].idle_ws)) { + workspace = btrfs_comp_ws[i].idle_ws.next; list_del(workspace); btrfs_compress_op[i]->free_workspace(workspace); - atomic_dec(&comp_alloc_workspace[i]); + atomic_dec(&btrfs_comp_ws[i].alloc_ws); } } } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5f745eadf77d..5b8e235c4b6d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, return ret; if (refs == 0) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); return ret; } } else { @@ -1927,7 +1927,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, child = read_node_slot(root, mid, 0); if (!child) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); goto enospc; } @@ -2030,7 +2030,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, */ if (!left) { ret = -EROFS; - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); goto enospc; } wret = balance_node_right(trans, root, mid, left); @@ -4940,8 +4940,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct extent_buffer *leaf; struct btrfs_item *item; - int last_off; - int dsize = 0; + u32 last_off; + u32 dsize = 0; int ret = 0; int wret; int i; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index eb90f0f1a124..8c58191249cc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -823,8 +823,18 @@ struct btrfs_disk_balance_args { */ __le64 profiles; - /* usage filter */ - __le64 usage; + /* + * usage filter + * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' + * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max + */ + union { + __le64 usage; + struct { + __le32 usage_min; + __le32 usage_max; + }; + }; /* devid filter */ __le64 devid; @@ -846,10 +856,27 @@ struct btrfs_disk_balance_args { /* BTRFS_BALANCE_ARGS_* */ __le64 flags; - /* BTRFS_BALANCE_ARGS_LIMIT value */ - __le64 limit; + /* + * BTRFS_BALANCE_ARGS_LIMIT with value 'limit' + * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum + * and maximum + */ + union { + __le64 limit; + struct { + __le32 limit_min; + __le32 limit_max; + }; + }; - __le64 unused[7]; + /* + * Process chunks that cross stripes_min..stripes_max devices, + * BTRFS_BALANCE_ARGS_STRIPES_RANGE + */ + __le32 stripes_min; + __le32 stripes_max; + + __le64 unused[6]; } __attribute__ ((__packed__)); /* @@ -1154,6 +1181,10 @@ struct btrfs_space_info { delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ + u64 max_extent_size; /* This will hold the maximum extent size of + the space info if we had an ENOSPC in the + allocator. */ + unsigned int full:1; /* indicates that we cannot allocate any more chunks for this space */ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */ @@ -1228,6 +1259,9 @@ struct btrfs_free_cluster { /* first extent starting offset */ u64 window_start; + /* We did a full search and couldn't create a cluster */ + bool fragmented; + struct btrfs_block_group_cache *block_group; /* * when a cluster is allocated from a block group, we put the @@ -1943,6 +1977,9 @@ struct btrfs_root { int send_in_progress; struct btrfs_subvolume_writers *subv_writers; atomic_t will_be_snapshoted; + + /* For qgroup metadata space reserve */ + atomic_t qgroup_meta_rsv; }; struct btrfs_ioctl_defrag_range_args { @@ -2145,6 +2182,8 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21) #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22) #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) +#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) +#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (8192) @@ -2169,6 +2208,18 @@ struct btrfs_ioctl_defrag_range_args { btrfs_clear_opt(root->fs_info->mount_opt, opt); \ } +#ifdef CONFIG_BTRFS_DEBUG +static inline int +btrfs_should_fragment_free_space(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + return (btrfs_test_opt(root, FRAGMENT_METADATA) && + block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || + (btrfs_test_opt(root, FRAGMENT_DATA) && + block_group->flags & BTRFS_BLOCK_GROUP_DATA); +} +#endif + /* * Requests for changes that need to be done during transaction commit. * @@ -3379,7 +3430,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins); + u64 offset, u64 ram_bytes, + struct btrfs_key *ins); int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, @@ -3398,7 +3450,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int no_quota); + u64 owner, u64 offset); int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len, int delalloc); @@ -3411,7 +3463,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, int no_quota); + u64 root_objectid, u64 owner, u64 offset); int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -3449,8 +3501,11 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_ALL, }; -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes); -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len); +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes); +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); @@ -3466,8 +3521,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, u64 qgroup_reserved); int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, unsigned short type); @@ -4004,8 +4059,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, /* sysfs.c */ int btrfs_init_sysfs(void); void btrfs_exit_sysfs(void); -int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info); -void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info); +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info); +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); /* xattr.c */ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); @@ -4039,14 +4094,102 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #define btrfs_info(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_INFO fmt, ##args) +/* + * Wrappers that use printk_in_rcu + */ +#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk_in_rcu + */ +#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk + */ +#define btrfs_emerg_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) #ifdef DEBUG #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) #else #define btrfs_debug(fs_info, fmt, args...) \ no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + no_printk(KERN_DEBUG fmt, ##args) #endif +#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk_ratelimited(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + #ifdef CONFIG_BTRFS_ASSERT __cold @@ -4127,14 +4270,7 @@ do { \ __LINE__, (errno)); \ } while (0) -#define btrfs_std_error(fs_info, errno) \ -do { \ - if ((errno)) \ - __btrfs_std_error((fs_info), __func__, \ - __LINE__, (errno), NULL); \ -} while (0) - -#define btrfs_error(fs_info, errno, fmt, args...) \ +#define btrfs_std_error(fs_info, errno, fmt, args...) \ do { \ __btrfs_std_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args); \ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index a2ae42720a6a..e0941fbb913c 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -463,6 +463,10 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, static void finish_one_item(struct btrfs_delayed_root *delayed_root) { int seq = atomic_inc_return(&delayed_root->items_seq); + + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if ((atomic_dec_return(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) && waitqueue_active(&delayed_root->wait)) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ac3e81da6d4e..e06dd75ad13f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -197,6 +197,119 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates--; } +static bool merge_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head, + struct btrfs_delayed_ref_node *ref, + u64 seq) +{ + struct btrfs_delayed_ref_node *next; + bool done = false; + + next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, + list); + while (!done && &next->list != &head->ref_list) { + int mod; + struct btrfs_delayed_ref_node *next2; + + next2 = list_next_entry(next, list); + + if (next == ref) + goto next; + + if (seq && next->seq >= seq) + goto next; + + if (next->type != ref->type) + goto next; + + if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY) && + comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref), + btrfs_delayed_node_to_tree_ref(next), + ref->type)) + goto next; + if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY || + ref->type == BTRFS_SHARED_DATA_REF_KEY) && + comp_data_refs(btrfs_delayed_node_to_data_ref(ref), + btrfs_delayed_node_to_data_ref(next))) + goto next; + + if (ref->action == next->action) { + mod = next->ref_mod; + } else { + if (ref->ref_mod < next->ref_mod) { + swap(ref, next); + done = true; + } + mod = -next->ref_mod; + } + + drop_delayed_ref(trans, delayed_refs, head, next); + ref->ref_mod += mod; + if (ref->ref_mod == 0) { + drop_delayed_ref(trans, delayed_refs, head, ref); + done = true; + } else { + /* + * Can't have multiples of the same ref on a tree block. + */ + WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || + ref->type == BTRFS_SHARED_BLOCK_REF_KEY); + } +next: + next = next2; + } + + return done; +} + +void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head) +{ + struct btrfs_delayed_ref_node *ref; + u64 seq = 0; + + assert_spin_locked(&head->lock); + + if (list_empty(&head->ref_list)) + return; + + /* We don't have too many refs to merge for data. */ + if (head->is_data) + return; + + spin_lock(&fs_info->tree_mod_seq_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct seq_list *elem; + + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct seq_list, list); + seq = elem->seq; + } + spin_unlock(&fs_info->tree_mod_seq_lock); + + ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, + list); + while (&ref->list != &head->ref_list) { + if (seq && ref->seq >= seq) + goto next; + + if (merge_ref(trans, delayed_refs, head, ref, seq)) { + if (list_empty(&head->ref_list)) + break; + ref = list_first_entry(&head->ref_list, + struct btrfs_delayed_ref_node, + list); + continue; + } +next: + ref = list_next_entry(ref, list); + } +} + int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq) @@ -292,8 +405,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, list); /* No need to compare bytenr nor is_head */ - if (exist->type != ref->type || exist->no_quota != ref->no_quota || - exist->seq != ref->seq) + if (exist->type != ref->type || exist->seq != ref->seq) goto add_tail; if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || @@ -423,7 +535,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref, struct btrfs_qgroup_extent_record *qrecord, - u64 bytenr, u64 num_bytes, int action, int is_data) + u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, + int action, int is_data) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; @@ -432,6 +545,9 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, int count_mod = 1; int must_insert_reserved = 0; + /* If reserved is provided, it must be a data extent. */ + BUG_ON(!is_data && reserved); + /* * the head node stores the sum of all the mods, so dropping a ref * should drop the sum in the head node by one. @@ -476,9 +592,16 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&head_ref->ref_list); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; + head_ref->qgroup_reserved = 0; + head_ref->qgroup_ref_root = 0; /* Record qgroup extent info if provided */ if (qrecord) { + if (ref_root && reserved) { + head_ref->qgroup_ref_root = ref_root; + head_ref->qgroup_reserved = reserved; + } + qrecord->bytenr = bytenr; qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; @@ -497,6 +620,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, existing = htree_insert(&delayed_refs->href_root, &head_ref->href_node); if (existing) { + WARN_ON(ref_root && reserved && existing->qgroup_ref_root + && existing->qgroup_reserved); update_existing_head_ref(delayed_refs, &existing->node, ref); /* * we've updated the existing ref, free the newly @@ -524,7 +649,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, - int action, int no_quota) + int action) { struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -546,7 +671,6 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_tree_ref(ref); @@ -579,7 +703,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_head *head_ref, struct btrfs_delayed_ref_node *ref, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, - u64 offset, int action, int no_quota) + u64 offset, int action) { struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; @@ -602,7 +726,6 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->action = action; ref->is_head = 0; ref->in_tree = 1; - ref->no_quota = no_quota; ref->seq = seq; full_ref = btrfs_delayed_node_to_data_ref(ref); @@ -633,17 +756,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - no_quota = 0; - BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) @@ -669,11 +788,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * the spin lock */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, - bytenr, num_bytes, action, 0); + bytenr, num_bytes, 0, 0, action, 0); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, - num_bytes, parent, ref_root, level, action, - no_quota); + num_bytes, parent, ref_root, level, action); spin_unlock(&delayed_refs->lock); return 0; @@ -693,18 +811,14 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + u64 owner, u64 offset, u64 reserved, int action, + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_qgroup_extent_record *record = NULL; - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - no_quota = 0; - BUG_ON(extent_op && !extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); if (!ref) @@ -736,16 +850,44 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * the spin lock */ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, - bytenr, num_bytes, action, 1); + bytenr, num_bytes, ref_root, reserved, + action, 1); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, num_bytes, parent, ref_root, owner, offset, - action, no_quota); + action); spin_unlock(&delayed_refs->lock); return 0; } +int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 ref_root, u64 bytenr, u64 num_bytes) +{ + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *ref_head; + int ret = 0; + + if (!fs_info->quota_enabled || !is_fstree(ref_root)) + return 0; + + delayed_refs = &trans->transaction->delayed_refs; + + spin_lock(&delayed_refs->lock); + ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0); + if (!ref_head) { + ret = -ENOENT; + goto out; + } + WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root); + ref_head->qgroup_ref_root = ref_root; + ref_head->qgroup_reserved = num_bytes; +out: + spin_unlock(&delayed_refs->lock); + return ret; +} + int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, @@ -764,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, spin_lock(&delayed_refs->lock); add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, - num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data); spin_unlock(&delayed_refs->lock); diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 13fb5e6090fe..00ed02cbf3e9 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -68,7 +68,6 @@ struct btrfs_delayed_ref_node { unsigned int action:8; unsigned int type:8; - unsigned int no_quota:1; /* is this node still in the rbtree? */ unsigned int is_head:1; unsigned int in_tree:1; @@ -113,6 +112,17 @@ struct btrfs_delayed_ref_head { int total_ref_mod; /* + * For qgroup reserved space freeing. + * + * ref_root and reserved will be recorded after + * BTRFS_ADD_DELAYED_EXTENT is called. + * And will be used to free reserved qgroup space at + * run_delayed_refs() time. + */ + u64 qgroup_ref_root; + u64 qgroup_reserved; + + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree * until the delayed ref is processed. must_insert_reserved is @@ -233,15 +243,16 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota); + struct btrfs_delayed_extent_op *extent_op); int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 owner, u64 offset, int action, - struct btrfs_delayed_extent_op *extent_op, - int no_quota); + u64 owner, u64 offset, u64 reserved, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info, + struct btrfs_trans_handle *trans, + u64 ref_root, u64 bytenr, u64 num_bytes); int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e54dd5905cee..1e668fb7dd4c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -327,19 +327,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->start.tgtdev_name[0] == '\0') return -EINVAL; - /* - * Here we commit the transaction to make sure commit_total_bytes - * of all the devices are updated. - */ - trans = btrfs_attach_transaction(root); - if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - } else if (PTR_ERR(trans) != -ENOENT) { - return PTR_ERR(trans); - } - /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, @@ -356,6 +343,19 @@ int btrfs_dev_replace_start(struct btrfs_root *root, if (ret) return ret; + /* + * Here we commit the transaction to make sure commit_total_bytes + * of all the devices are updated. + */ + trans = btrfs_attach_transaction(root); + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + } else if (PTR_ERR(trans) != -ENOENT) { + return PTR_ERR(trans); + } + btrfs_dev_replace_lock(dev_replace); switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: @@ -375,12 +375,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root, WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; - ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); - if (ret) - btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); - - printk_in_rcu(KERN_INFO - "BTRFS: dev_replace from %s (devid %llu) to %s started\n", + btrfs_info_in_rcu(root->fs_info, + "dev_replace from %s (devid %llu) to %s started", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -401,6 +397,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace); + ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); + if (ret) + btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + btrfs_wait_ordered_roots(root->fs_info, -1); /* force writing the updated state information to disk */ @@ -454,8 +454,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info) { clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); - if (waitqueue_active(&fs_info->replace_wait)) - wake_up(&fs_info->replace_wait); + wake_up(&fs_info->replace_wait); } static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, @@ -523,8 +522,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device, tgt_device); } else { - printk_in_rcu(KERN_ERR - "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n", + btrfs_err_in_rcu(root->fs_info, + "btrfs_scrub_dev(%s, %llu, %s) failed %d", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -540,8 +539,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, return scrub_ret; } - printk_in_rcu(KERN_INFO - "BTRFS: dev_replace from %s (devid %llu) to %s finished\n", + btrfs_info_in_rcu(root->fs_info, + "dev_replace from %s (devid %llu) to %s finished", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -586,7 +585,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&uuid_mutex); /* replace the sysfs entry */ - btrfs_kobj_rm_device(fs_info->fs_devices, src_device); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); /* write back the superblocks */ @@ -809,8 +808,8 @@ static int btrfs_dev_replace_kthread(void *data) progress = status_args->status.progress_1000; kfree(status_args); progress = div_u64(progress, 10); - printk_in_rcu(KERN_INFO - "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", + btrfs_info_in_rcu(fs_info, + "continuing dev_replace from %s (devid %llu) to %s @%u%%", dev_replace->srcdev->missing ? "<missing disk>" : rcu_str_deref(dev_replace->srcdev->name), dev_replace->srcdev->devid, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index c339d561e596..640598c0d0e7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -319,9 +319,9 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk_ratelimited(KERN_WARNING - "BTRFS: %s checksum verify failed on %llu wanted %X found %X " - "level %d\n", + btrfs_warn_rl(fs_info, + "%s checksum verify failed on %llu wanted %X found %X " + "level %d", fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) @@ -368,9 +368,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk_ratelimited(KERN_ERR - "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", - eb->fs_info->sb->s_id, eb->start, + btrfs_err_rl(eb->fs_info, + "parent transid verify failed on %llu wanted %llu found %llu", + eb->start, parent_transid, btrfs_header_generation(eb)); ret = 1; @@ -629,15 +629,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start " - "%llu %llu\n", - eb->fs_info->sb->s_id, found_start, eb->start); + btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu", + found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root->fs_info, eb)) { - printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n", - eb->fs_info->sb->s_id, eb->start); + btrfs_err_rl(eb->fs_info, "bad fsid on block %llu", + eb->start); ret = -EIO; goto err; } @@ -802,6 +801,9 @@ static void run_one_async_done(struct btrfs_work *work) limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if (atomic_dec_return(&fs_info->nr_async_submits) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); @@ -1265,6 +1267,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, atomic_set(&root->orphan_inodes, 0); atomic_set(&root->refs, 1); atomic_set(&root->will_be_snapshoted, 0); + atomic_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; root->last_log_commit = 0; @@ -1759,6 +1762,7 @@ static int cleaner_kthread(void *arg) int again; struct btrfs_trans_handle *trans; + set_freezable(); do { again = 0; @@ -2348,8 +2352,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, u64 bytenr = btrfs_super_log_root(disk_super); if (fs_devices->rw_devices == 0) { - printk(KERN_WARNING "BTRFS: log replay required " - "on RO media\n"); + btrfs_warn(fs_info, "log replay required on RO media"); return -EIO; } @@ -2364,12 +2367,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, log_tree_root->node = read_tree_block(tree_root, bytenr, fs_info->generation + 1); if (IS_ERR(log_tree_root->node)) { - printk(KERN_ERR "BTRFS: failed to read log tree\n"); + btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); kfree(log_tree_root); return ret; } else if (!extent_buffer_uptodate(log_tree_root->node)) { - printk(KERN_ERR "BTRFS: failed to read log tree\n"); + btrfs_err(fs_info, "failed to read log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); return -EIO; @@ -2377,7 +2380,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { - btrfs_error(tree_root->fs_info, ret, + btrfs_std_error(tree_root->fs_info, ret, "Failed to recover log tree"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); @@ -2653,8 +2656,8 @@ int open_ctree(struct super_block *sb, * Read super block and check the signature bytes only */ bh = btrfs_read_dev_super(fs_devices->latest_bdev); - if (!bh) { - err = -EINVAL; + if (IS_ERR(bh)) { + err = PTR_ERR(bh); goto fail_alloc; } @@ -2937,7 +2940,7 @@ retry_root_backup: goto fail_fsdev_sysfs; } - ret = btrfs_sysfs_add_one(fs_info); + ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); goto fail_fsdev_sysfs; @@ -3117,7 +3120,7 @@ fail_cleaner: filemap_write_and_wait(fs_info->btree_inode->i_mapping); fail_sysfs: - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); fail_fsdev_sysfs: btrfs_sysfs_remove_fsid(fs_info->fs_devices); @@ -3179,8 +3182,8 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) struct btrfs_device *device = (struct btrfs_device *) bh->b_private; - printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to " - "I/O error on %s\n", + btrfs_warn_rl_in_rcu(device->dev_root->fs_info, + "lost page write due to IO error on %s", rcu_str_deref(device->name)); /* note, we dont' set_buffer_write_io_error because we have * our own ways of dealing with the IO errors @@ -3192,6 +3195,37 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) put_bh(bh); } +int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, + struct buffer_head **bh_ret) +{ + struct buffer_head *bh; + struct btrfs_super_block *super; + u64 bytenr; + + bytenr = btrfs_sb_offset(copy_num); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) + return -EINVAL; + + bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); + /* + * If we fail to read from the underlying devices, as of now + * the best option we have is to mark it EIO. + */ + if (!bh) + return -EIO; + + super = (struct btrfs_super_block *)bh->b_data; + if (btrfs_super_bytenr(super) != bytenr || + btrfs_super_magic(super) != BTRFS_MAGIC) { + brelse(bh); + return -EINVAL; + } + + *bh_ret = bh; + return 0; +} + + struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) { struct buffer_head *bh; @@ -3199,7 +3233,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) struct btrfs_super_block *super; int i; u64 transid = 0; - u64 bytenr; + int ret = -EINVAL; /* we would like to check all the supers, but that would make * a btrfs mount succeed after a mkfs from a different FS. @@ -3207,21 +3241,11 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) * later supers, using BTRFS_SUPER_MIRROR_MAX instead */ for (i = 0; i < 1; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - i_size_read(bdev->bd_inode)) - break; - bh = __bread(bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - if (!bh) + ret = btrfs_read_dev_one_super(bdev, i, &bh); + if (ret) continue; super = (struct btrfs_super_block *)bh->b_data; - if (btrfs_super_bytenr(super) != bytenr || - btrfs_super_magic(super) != BTRFS_MAGIC) { - brelse(bh); - continue; - } if (!latest || btrfs_super_generation(super) > transid) { brelse(latest); @@ -3231,6 +3255,10 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) brelse(bh); } } + + if (!latest) + return ERR_PTR(ret); + return latest; } @@ -3299,8 +3327,9 @@ static int write_dev_supers(struct btrfs_device *device, bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); if (!bh) { - printk(KERN_ERR "BTRFS: couldn't get super " - "buffer head for bytenr %Lu\n", bytenr); + btrfs_err(device->dev_root->fs_info, + "couldn't get super buffer head for bytenr %llu", + bytenr); errors++; continue; } @@ -3449,22 +3478,31 @@ static int barrier_all_devices(struct btrfs_fs_info *info) int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) { - if ((flags & (BTRFS_BLOCK_GROUP_DUP | - BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_AVAIL_ALLOC_BIT_SINGLE)) || - ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)) - return 0; + int raid_type; + int min_tolerated = INT_MAX; - if (flags & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID10)) - return 1; + if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 || + (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE)) + min_tolerated = min(min_tolerated, + btrfs_raid_array[BTRFS_RAID_SINGLE]. + tolerated_failures); - if (flags & BTRFS_BLOCK_GROUP_RAID6) - return 2; + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (raid_type == BTRFS_RAID_SINGLE) + continue; + if (!(flags & btrfs_raid_group[raid_type])) + continue; + min_tolerated = min(min_tolerated, + btrfs_raid_array[raid_type]. + tolerated_failures); + } - pr_warn("BTRFS: unknown raid type: %llu\n", flags); - return 0; + if (min_tolerated == INT_MAX) { + pr_warn("BTRFS: unknown raid flag: %llu\n", flags); + min_tolerated = 0; + } + + return min_tolerated; } int btrfs_calc_num_tolerated_disk_barrier_failures( @@ -3548,7 +3586,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) if (ret) { mutex_unlock( &root->fs_info->fs_devices->device_list_mutex); - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "errors while submitting device barriers."); return ret; } @@ -3588,7 +3626,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); /* FUA is masked off if unsupported and can't be the reason */ - btrfs_error(root->fs_info, -EIO, + btrfs_std_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3606,7 +3644,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) } mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { - btrfs_error(root->fs_info, -EIO, + btrfs_std_error(root->fs_info, -EIO, "%d errors while writing supers", total_errors); return -EIO; } @@ -3792,7 +3830,7 @@ void close_ctree(struct btrfs_root *root) percpu_counter_sum(&fs_info->delalloc_bytes)); } - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); btrfs_sysfs_remove_fsid(fs_info->fs_devices); btrfs_free_fs_roots(fs_info); @@ -4290,25 +4328,6 @@ again: return 0; } -static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_ordered_extent *ordered; - - spin_lock(&fs_info->trans_lock); - while (!list_empty(&cur_trans->pending_ordered)) { - ordered = list_first_entry(&cur_trans->pending_ordered, - struct btrfs_ordered_extent, - trans_list); - list_del_init(&ordered->trans_list); - spin_unlock(&fs_info->trans_lock); - - btrfs_put_ordered_extent(ordered); - spin_lock(&fs_info->trans_lock); - } - spin_unlock(&fs_info->trans_lock); -} - void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_root *root) { @@ -4320,7 +4339,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, cur_trans->state = TRANS_STATE_UNBLOCKED; wake_up(&root->fs_info->transaction_wait); - btrfs_free_pending_ordered(cur_trans, root->fs_info); btrfs_destroy_delayed_inodes(root); btrfs_assert_delayed_root_empty(root); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bdfb479ea859..adeb31830b9c 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -60,6 +60,8 @@ void close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); +int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, + struct buffer_head **bh_ret); int btrfs_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 601d7d45d164..99a8e57da8a1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -95,8 +95,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins, - int no_quota); + int level, struct btrfs_key *ins); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 flags, int force); @@ -332,6 +331,27 @@ static void put_caching_control(struct btrfs_caching_control *ctl) kfree(ctl); } +#ifdef CONFIG_BTRFS_DEBUG +static void fragment_free_space(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + u64 start = block_group->key.objectid; + u64 len = block_group->key.offset; + u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ? + root->nodesize : root->sectorsize; + u64 step = chunk << 1; + + while (len > chunk) { + btrfs_remove_free_space(block_group, start, chunk); + start += step; + if (len < step) + len = 0; + else + len -= step; + } +} +#endif + /* * this is only called by cache_block_group, since we could have freed extents * we need to check the pinned_extents for any extents that can't be used yet @@ -388,6 +408,7 @@ static noinline void caching_thread(struct btrfs_work *work) u64 last = 0; u32 nritems; int ret = -ENOMEM; + bool wakeup = true; caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; @@ -400,6 +421,15 @@ static noinline void caching_thread(struct btrfs_work *work) last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); +#ifdef CONFIG_BTRFS_DEBUG + /* + * If we're fragmenting we don't want to make anybody think we can + * allocate from this block group until we've had a chance to fragment + * the free space. + */ + if (btrfs_should_fragment_free_space(extent_root, block_group)) + wakeup = false; +#endif /* * We don't want to deadlock with somebody trying to allocate a new * extent for the extent root while also trying to search the extent @@ -441,7 +471,8 @@ next: if (need_resched() || rwsem_is_contended(&fs_info->commit_root_sem)) { - caching_ctl->progress = last; + if (wakeup) + caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); @@ -464,7 +495,8 @@ next: key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; - caching_ctl->progress = last; + if (wakeup) + caching_ctl->progress = last; btrfs_release_path(path); goto next; } @@ -491,7 +523,8 @@ next: if (total_found > (1024 * 1024 * 2)) { total_found = 0; - wake_up(&caching_ctl->wait); + if (wakeup) + wake_up(&caching_ctl->wait); } } path->slots[0]++; @@ -501,13 +534,27 @@ next: total_found += add_new_free_space(block_group, fs_info, last, block_group->key.objectid + block_group->key.offset); - caching_ctl->progress = (u64)-1; - spin_lock(&block_group->lock); block_group->caching_ctl = NULL; block_group->cached = BTRFS_CACHE_FINISHED; spin_unlock(&block_group->lock); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(extent_root, block_group)) { + u64 bytes_used; + + spin_lock(&block_group->space_info->lock); + spin_lock(&block_group->lock); + bytes_used = block_group->key.offset - + btrfs_block_group_used(&block_group->item); + block_group->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&block_group->lock); + spin_unlock(&block_group->space_info->lock); + fragment_free_space(extent_root, block_group); + } +#endif + + caching_ctl->progress = (u64)-1; err: btrfs_free_path(path); up_read(&fs_info->commit_root_sem); @@ -607,6 +654,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } } spin_unlock(&cache->lock); +#ifdef CONFIG_BTRFS_DEBUG + if (ret == 1 && + btrfs_should_fragment_free_space(fs_info->extent_root, + cache)) { + u64 bytes_used; + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + bytes_used = cache->key.offset - + btrfs_block_group_used(&cache->item); + cache->space_info->bytes_used += bytes_used >> 1; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fragment_free_space(fs_info->extent_root, cache); + } +#endif mutex_unlock(&caching_ctl->mutex); wake_up(&caching_ctl->wait); @@ -2009,8 +2072,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 owner, u64 offset, - int no_quota) + u64 root_objectid, u64 owner, u64 offset) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -2022,12 +2084,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_ADD_DELAYED_REF, NULL, no_quota); + BTRFS_ADD_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, - num_bytes, - parent, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_REF, NULL, no_quota); + num_bytes, parent, root_objectid, + owner, offset, 0, + BTRFS_ADD_DELAYED_REF, NULL); } return ret; } @@ -2048,15 +2110,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 num_bytes = node->num_bytes; u64 refs; int ret; - int no_quota = node->no_quota; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled) - no_quota = 1; - path->reada = 1; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ @@ -2291,8 +2349,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent, ref_root, extent_op->flags_to_set, &extent_op->key, - ref->level, &ins, - node->no_quota); + ref->level, &ins); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, root, node, parent, ref_root, @@ -2345,6 +2402,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, node->num_bytes); } } + + /* Also free its reserved qgroup space */ + btrfs_qgroup_free_delayed_ref(root->fs_info, + head->qgroup_ref_root, + head->qgroup_reserved); return ret; } @@ -2433,7 +2495,21 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } } + /* + * We need to try and merge add/drops of the same ref since we + * can run into issues with relocate dropping the implicit ref + * and then it being added back again before the drop can + * finish. If we merged anything we need to re-loop so we can + * get a good ref. + * Or we can get node references of the same type that weren't + * merged when created due to bumps in the tree mod seq, and + * we need to merge them to prevent adding an inline extent + * backref before dropping it (triggering a BUG_ON at + * insert_inline_extent_backref()). + */ spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, + locked_ref); /* * locked_ref is the head node, so we have to go one @@ -3109,7 +3185,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int level; int ret = 0; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, int); + u64, u64, u64, u64, u64, u64); if (btrfs_test_is_dummy_root(root)) @@ -3150,15 +3226,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, key.objectid, - key.offset, 1); + key.offset); if (ret) goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = root->nodesize; ret = process_func(trans, root, bytenr, num_bytes, - parent, ref_root, level - 1, 0, - 1); + parent, ref_root, level - 1, 0); if (ret) goto fail; } @@ -3339,6 +3414,15 @@ again: spin_unlock(&block_group->lock); /* + * We hit an ENOSPC when setting up the cache in this transaction, just + * skip doing the setup, we've already cleared the cache so we're safe. + */ + if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { + ret = -ENOSPC; + goto out_put; + } + + /* * Try to preallocate enough space based on how big the block group is. * Keep in mind this has to include any pinned space which could end up * taking up quite a bit since it's not folded into the other space @@ -3351,16 +3435,26 @@ again: num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages, num_pages); + ret = btrfs_check_data_free_space(inode, 0, num_pages); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); + /* + * Our cache requires contiguous chunks so that we don't modify a bunch + * of metadata or split extents when writing the cache out, which means + * we can enospc if we are heavily fragmented in addition to just normal + * out of space conditions. So if we hit this just skip setting up any + * other block groups for this transaction, maybe we'll unpin enough + * space the next time around. + */ if (!ret) dcs = BTRFS_DC_SETUP; - btrfs_free_reserved_data_space(inode, num_pages); + else if (ret == -ENOSPC) + set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags); + btrfs_free_reserved_data_space(inode, 0, num_pages); out_put: iput(inode); @@ -3746,6 +3840,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_readonly = 0; found->bytes_may_use = 0; found->full = 0; + found->max_extent_size = 0; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; found->flush = 0; @@ -3822,7 +3917,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { u64 num_devices = root->fs_info->fs_devices->rw_devices; u64 target; - u64 tmp; + u64 raid_type; + u64 allowed = 0; /* * see if restripe for this chunk_type is in progress, if so @@ -3840,31 +3936,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) spin_unlock(&root->fs_info->balance_lock); /* First, mask out the RAID levels which aren't possible */ - if (num_devices == 1) - flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID5); - if (num_devices < 3) - flags &= ~BTRFS_BLOCK_GROUP_RAID6; - if (num_devices < 4) - flags &= ~BTRFS_BLOCK_GROUP_RAID10; - - tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); - flags &= ~tmp; - - if (tmp & BTRFS_BLOCK_GROUP_RAID6) - tmp = BTRFS_BLOCK_GROUP_RAID6; - else if (tmp & BTRFS_BLOCK_GROUP_RAID5) - tmp = BTRFS_BLOCK_GROUP_RAID5; - else if (tmp & BTRFS_BLOCK_GROUP_RAID10) - tmp = BTRFS_BLOCK_GROUP_RAID10; - else if (tmp & BTRFS_BLOCK_GROUP_RAID1) - tmp = BTRFS_BLOCK_GROUP_RAID1; - else if (tmp & BTRFS_BLOCK_GROUP_RAID0) - tmp = BTRFS_BLOCK_GROUP_RAID0; - - return extended_to_chunk(flags | tmp); + for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) { + if (num_devices >= btrfs_raid_array[raid_type].devs_min) + allowed |= btrfs_raid_group[raid_type]; + } + allowed &= flags; + + if (allowed & BTRFS_BLOCK_GROUP_RAID6) + allowed = BTRFS_BLOCK_GROUP_RAID6; + else if (allowed & BTRFS_BLOCK_GROUP_RAID5) + allowed = BTRFS_BLOCK_GROUP_RAID5; + else if (allowed & BTRFS_BLOCK_GROUP_RAID10) + allowed = BTRFS_BLOCK_GROUP_RAID10; + else if (allowed & BTRFS_BLOCK_GROUP_RAID1) + allowed = BTRFS_BLOCK_GROUP_RAID1; + else if (allowed & BTRFS_BLOCK_GROUP_RAID0) + allowed = BTRFS_BLOCK_GROUP_RAID0; + + flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK; + + return extended_to_chunk(flags | allowed); } static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) @@ -3903,11 +3994,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) return ret; } -/* - * This will check the space that the inode allocates from to make sure we have - * enough space for bytes. - */ -int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) +int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4006,7 +4093,8 @@ commit_trans: if (IS_ERR(trans)) return PTR_ERR(trans); if (have_pinned_space >= 0 || - trans->transaction->have_free_bgs || + test_bit(BTRFS_TRANS_HAVE_FREE_BGS, + &trans->transaction->flags) || need_commit > 0) { ret = btrfs_commit_transaction(trans, root); if (ret) @@ -4028,38 +4116,86 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } - ret = btrfs_qgroup_reserve(root, write_bytes); - if (ret) - goto out; data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", data_sinfo->flags, bytes, 1); -out: spin_unlock(&data_sinfo->lock); return ret; } /* - * Called if we need to clear a data reservation for this inode. + * New check_data_free_space() with ability for precious data reservation + * Will replace old btrfs_check_data_free_space(), but for patch split, + * add a new function first and then replace it. + */ +int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + /* align the range */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); + + ret = btrfs_alloc_data_chunk_ondemand(inode, len); + if (ret < 0) + return ret; + + /* + * Use new btrfs_qgroup_reserve_data to reserve precious data space + * + * TODO: Find a good method to avoid reserve data space for NOCOW + * range, but don't impact performance on quota disable case. + */ + ret = btrfs_qgroup_reserve_data(inode, start, len); + return ret; +} + +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will *NOT* use accurate qgroup reserved space API, just for case + * which we can't sleep and is sure it won't affect qgroup reserved space. + * Like clear_bit_hook(). */ -void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) +void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, + u64 len) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_space_info *data_sinfo; - /* make sure bytes are sectorsize aligned */ - bytes = ALIGN(bytes, root->sectorsize); + /* Make sure the range is aligned to sectorsize */ + len = round_up(start + len, root->sectorsize) - + round_down(start, root->sectorsize); + start = round_down(start, root->sectorsize); data_sinfo = root->fs_info->data_sinfo; spin_lock(&data_sinfo->lock); - WARN_ON(data_sinfo->bytes_may_use < bytes); - data_sinfo->bytes_may_use -= bytes; + if (WARN_ON(data_sinfo->bytes_may_use < len)) + data_sinfo->bytes_may_use = 0; + else + data_sinfo->bytes_may_use -= len; trace_btrfs_space_reservation(root->fs_info, "space_info", - data_sinfo->flags, bytes, 0); + data_sinfo->flags, len, 0); spin_unlock(&data_sinfo->lock); } +/* + * Called if we need to clear a data reservation for this inode + * Normally in a error case. + * + * This one will handle the per-indoe data rsv map for accurate reserved + * space framework. + */ +void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len) +{ + btrfs_free_reserved_data_space_noquota(inode, start, len); + btrfs_qgroup_free_data(inode, start, len); +} + static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; @@ -4891,13 +5027,9 @@ static struct btrfs_block_rsv *get_block_rsv( { struct btrfs_block_rsv *block_rsv = NULL; - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - block_rsv = trans->block_rsv; - - if (root == root->fs_info->csum_root && trans->adding_csums) - block_rsv = trans->block_rsv; - - if (root == root->fs_info->uuid_root) + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || + (root == root->fs_info->csum_root && trans->adding_csums) || + (root == root->fs_info->uuid_root)) block_rsv = trans->block_rsv; if (!block_rsv) @@ -5340,7 +5472,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (root->fs_info->quota_enabled) { /* One for parent inode, two for dir entries */ num_bytes = 3 * root->nodesize; - ret = btrfs_qgroup_reserve(root, num_bytes); + ret = btrfs_qgroup_reserve_meta(root, num_bytes); if (ret) return ret; } else { @@ -5358,10 +5490,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret == -ENOSPC && use_global_rsv) ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); - if (ret) { - if (*qgroup_reserved) - btrfs_qgroup_free(root, *qgroup_reserved); - } + if (ret && *qgroup_reserved) + btrfs_qgroup_free_meta(root, *qgroup_reserved); return ret; } @@ -5522,15 +5652,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_unlock(&BTRFS_I(inode)->lock); if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize); + ret = btrfs_qgroup_reserve_meta(root, + nr_extents * root->nodesize); if (ret) goto out_fail; } ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { - if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, nr_extents * root->nodesize); + btrfs_qgroup_free_meta(root, nr_extents * root->nodesize); goto out_fail; } @@ -5653,41 +5783,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) } /** - * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc + * btrfs_delalloc_reserve_space - reserve data and metadata space for + * delalloc * @inode: inode we're writing to - * @num_bytes: the number of bytes we want to allocate + * @start: start range we are writing to + * @len: how long the range we are writing to + * + * TODO: This function will finally replace old btrfs_delalloc_reserve_space() * * This will do the following things * - * o reserve space in the data space info for num_bytes - * o reserve space in the metadata space info based on number of outstanding + * o reserve space in data space info for num bytes + * and reserve precious corresponding qgroup space + * (Done in check_data_free_space) + * + * o reserve space for metadata space, based on the number of outstanding * extents and how much csums will be needed - * o add to the inodes ->delalloc_bytes + * also reserve metadata space in a per root over-reserve method. + * o add to the inodes->delalloc_bytes * o add it to the fs_info's delalloc inodes list. + * (Above 3 all done in delalloc_reserve_metadata) * - * This will return 0 for success and -ENOSPC if there is no space left. + * Return 0 for success + * Return <0 for error(-ENOSPC or -EQUOT) */ -int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) +int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len) { int ret; - ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes); - if (ret) - return ret; - - ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); - if (ret) { - btrfs_free_reserved_data_space(inode, num_bytes); + ret = btrfs_check_data_free_space(inode, start, len); + if (ret < 0) return ret; - } - - return 0; + ret = btrfs_delalloc_reserve_metadata(inode, len); + if (ret < 0) + btrfs_free_reserved_data_space(inode, start, len); + return ret; } /** * btrfs_delalloc_release_space - release data and metadata space for delalloc * @inode: inode we're releasing space for - * @num_bytes: the number of bytes we want to free up + * @start: start position of the space already reserved + * @len: the len of the space already reserved * * This must be matched with a call to btrfs_delalloc_reserve_space. This is * called in the case that we don't need the metadata AND data reservations @@ -5696,11 +5833,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) * This function will release the metadata space that was not used and will * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes * list if there are no delalloc bytes left. + * Also it will handle the qgroup reserved space. */ -void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) +void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) { - btrfs_delalloc_release_metadata(inode, num_bytes); - btrfs_free_reserved_data_space(inode, num_bytes); + btrfs_delalloc_release_metadata(inode, len); + btrfs_free_reserved_data_space(inode, start, len); } static int update_block_group(struct btrfs_trans_handle *trans, @@ -6065,6 +6203,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, update_global_block_rsv(fs_info); } +/* + * Returns the free cluster for the given space info and sets empty_cluster to + * what it should be based on the mount options. + */ +static struct btrfs_free_cluster * +fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, + u64 *empty_cluster) +{ + struct btrfs_free_cluster *ret = NULL; + bool ssd = btrfs_test_opt(root, SSD); + + *empty_cluster = 0; + if (btrfs_mixed_space_info(space_info)) + return ret; + + if (ssd) + *empty_cluster = 2 * 1024 * 1024; + if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { + ret = &root->fs_info->meta_alloc_cluster; + if (!ssd) + *empty_cluster = 64 * 1024; + } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { + ret = &root->fs_info->data_alloc_cluster; + } + + return ret; +} + static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, const bool return_free_space) { @@ -6072,7 +6238,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_block_group_cache *cache = NULL; struct btrfs_space_info *space_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; + struct btrfs_free_cluster *cluster = NULL; u64 len; + u64 total_unpinned = 0; + u64 empty_cluster = 0; bool readonly; while (start <= end) { @@ -6081,8 +6250,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, start >= cache->key.objectid + cache->key.offset) { if (cache) btrfs_put_block_group(cache); + total_unpinned = 0; cache = btrfs_lookup_block_group(fs_info, start); BUG_ON(!cache); /* Logic error */ + + cluster = fetch_cluster_info(root, + cache->space_info, + &empty_cluster); + empty_cluster <<= 1; } len = cache->key.objectid + cache->key.offset - start; @@ -6095,12 +6270,27 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, } start += len; + total_unpinned += len; space_info = cache->space_info; + /* + * If this space cluster has been marked as fragmented and we've + * unpinned enough in this block group to potentially allow a + * cluster to be created inside of it go ahead and clear the + * fragmented check. + */ + if (cluster && cluster->fragmented && + total_unpinned > empty_cluster) { + spin_lock(&cluster->lock); + cluster->fragmented = 0; + spin_unlock(&cluster->lock); + } + spin_lock(&space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; space_info->bytes_pinned -= len; + space_info->max_extent_size = 0; percpu_counter_add(&space_info->total_bytes_pinned, -len); if (cache->ro) { space_info->bytes_readonly += len; @@ -6233,7 +6423,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; - int no_quota = node->no_quota; u32 item_size; u64 refs; u64 bytenr = node->bytenr; @@ -6242,9 +6431,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); - if (!info->quota_enabled || !is_fstree(root_objectid)) - no_quota = 1; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -6570,7 +6756,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, buf->start, buf->len, parent, root->root_key.objectid, btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL, 0); + BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); /* -ENOMEM */ } @@ -6618,7 +6804,7 @@ out: /* Can return -ENOMEM */ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int no_quota) + u64 owner, u64 offset) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -6641,13 +6827,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, (int)owner, - BTRFS_DROP_DELAYED_REF, NULL, no_quota); + BTRFS_DROP_DELAYED_REF, NULL); } else { ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, num_bytes, parent, root_objectid, owner, - offset, BTRFS_DROP_DELAYED_REF, - NULL, no_quota); + offset, 0, + BTRFS_DROP_DELAYED_REF, NULL); } return ret; } @@ -6833,7 +7019,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_block_group_cache *block_group = NULL; u64 search_start = 0; u64 max_extent_size = 0; - int empty_cluster = 2 * 1024 * 1024; + u64 empty_cluster = 0; struct btrfs_space_info *space_info; int loop = 0; int index = __get_raid_index(flags); @@ -6843,6 +7029,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, bool failed_alloc = false; bool use_cluster = true; bool have_caching_bg = false; + bool orig_have_caching_bg = false; + bool full_search = false; WARN_ON(num_bytes < root->sectorsize); ins->type = BTRFS_EXTENT_ITEM_KEY; @@ -6858,36 +7046,47 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, } /* - * If the space info is for both data and metadata it means we have a - * small filesystem and we can't use the clustering stuff. + * If our free space is heavily fragmented we may not be able to make + * big contiguous allocations, so instead of doing the expensive search + * for free space, simply return ENOSPC with our max_extent_size so we + * can go ahead and search for a more manageable chunk. + * + * If our max_extent_size is large enough for our allocation simply + * disable clustering since we will likely not be able to find enough + * space to create a cluster and induce latency trying. */ - if (btrfs_mixed_space_info(space_info)) - use_cluster = false; - - if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { - last_ptr = &root->fs_info->meta_alloc_cluster; - if (!btrfs_test_opt(root, SSD)) - empty_cluster = 64 * 1024; - } - - if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && - btrfs_test_opt(root, SSD)) { - last_ptr = &root->fs_info->data_alloc_cluster; + if (unlikely(space_info->max_extent_size)) { + spin_lock(&space_info->lock); + if (space_info->max_extent_size && + num_bytes > space_info->max_extent_size) { + ins->offset = space_info->max_extent_size; + spin_unlock(&space_info->lock); + return -ENOSPC; + } else if (space_info->max_extent_size) { + use_cluster = false; + } + spin_unlock(&space_info->lock); } + last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster); if (last_ptr) { spin_lock(&last_ptr->lock); if (last_ptr->block_group) hint_byte = last_ptr->window_start; + if (last_ptr->fragmented) { + /* + * We still set window_start so we can keep track of the + * last place we found an allocation to try and save + * some time. + */ + hint_byte = last_ptr->window_start; + use_cluster = false; + } spin_unlock(&last_ptr->lock); } search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - - if (!last_ptr) - empty_cluster = 0; - if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); @@ -6922,6 +7121,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, } search: have_caching_bg = false; + if (index == 0 || index == __get_raid_index(flags)) + full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups[index], list) { @@ -6955,6 +7156,7 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { + have_caching_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; @@ -6969,7 +7171,7 @@ have_block_group: * Ok we want to try and use the cluster allocator, so * lets look there */ - if (last_ptr) { + if (last_ptr && use_cluster) { struct btrfs_block_group_cache *used_block_group; unsigned long aligned_cluster; /* @@ -7095,6 +7297,16 @@ refill_cluster: } unclustered_alloc: + /* + * We are doing an unclustered alloc, set the fragmented flag so + * we don't bother trying to setup a cluster again until we get + * more space. + */ + if (unlikely(last_ptr)) { + spin_lock(&last_ptr->lock); + last_ptr->fragmented = 1; + spin_unlock(&last_ptr->lock); + } spin_lock(&block_group->free_space_ctl->tree_lock); if (cached && block_group->free_space_ctl->free_space < @@ -7127,8 +7339,6 @@ unclustered_alloc: failed_alloc = true; goto have_block_group; } else if (!offset) { - if (!cached) - have_caching_bg = true; goto loop; } checks: @@ -7169,6 +7379,10 @@ loop: } up_read(&space_info->groups_sem); + if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg + && !orig_have_caching_bg) + orig_have_caching_bg = true; + if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) goto search; @@ -7185,7 +7399,20 @@ loop: */ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { index = 0; - loop++; + if (loop == LOOP_CACHING_NOWAIT) { + /* + * We want to skip the LOOP_CACHING_WAIT step if we + * don't have any unached bgs and we've alrelady done a + * full search through. + */ + if (orig_have_caching_bg || !full_search) + loop = LOOP_CACHING_WAIT; + else + loop = LOOP_ALLOC_CHUNK; + } else { + loop++; + } + if (loop == LOOP_ALLOC_CHUNK) { struct btrfs_trans_handle *trans; int exist = 0; @@ -7203,6 +7430,15 @@ loop: ret = do_chunk_alloc(trans, root, flags, CHUNK_ALLOC_FORCE); + + /* + * If we can't allocate a new chunk we've already looped + * through at least once, move on to the NO_EMPTY_SIZE + * case. + */ + if (ret == -ENOSPC) + loop = LOOP_NO_EMPTY_SIZE; + /* * Do not bail out on ENOSPC since we * can do more things. @@ -7219,6 +7455,15 @@ loop: } if (loop == LOOP_NO_EMPTY_SIZE) { + /* + * Don't loop again if we already have no empty_size and + * no empty_cluster. + */ + if (empty_size == 0 && + empty_cluster == 0) { + ret = -ENOSPC; + goto out; + } empty_size = 0; empty_cluster = 0; } @@ -7227,11 +7472,20 @@ loop: } else if (!ins->objectid) { ret = -ENOSPC; } else if (ins->objectid) { + if (!use_cluster && last_ptr) { + spin_lock(&last_ptr->lock); + last_ptr->window_start = ins->objectid; + spin_unlock(&last_ptr->lock); + } ret = 0; } out: - if (ret == -ENOSPC) + if (ret == -ENOSPC) { + spin_lock(&space_info->lock); + space_info->max_extent_size = max_extent_size; + spin_unlock(&space_info->lock); ins->offset = max_extent_size; + } return ret; } @@ -7280,7 +7534,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 empty_size, u64 hint_byte, struct btrfs_key *ins, int is_data, int delalloc) { - bool final_tried = false; + bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; @@ -7429,8 +7683,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, - int level, struct btrfs_key *ins, - int no_quota) + int level, struct btrfs_key *ins) { int ret; struct btrfs_fs_info *fs_info = root->fs_info; @@ -7511,7 +7764,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 root_objectid, u64 owner, - u64 offset, struct btrfs_key *ins) + u64 offset, u64 ram_bytes, + struct btrfs_key *ins) { int ret; @@ -7520,7 +7774,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, ins->offset, 0, root_objectid, owner, offset, - BTRFS_ADD_DELAYED_EXTENT, NULL, 0); + ram_bytes, BTRFS_ADD_DELAYED_EXTENT, + NULL); return ret; } @@ -7734,7 +7989,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, 0); + extent_op); if (ret) goto out_free_delayed; } @@ -8275,14 +8530,15 @@ skip: ret = account_shared_subtree(trans, root, next, generation, level - 1); if (ret) { - printk_ratelimited(KERN_ERR "BTRFS: %s Error " + btrfs_err_rl(root->fs_info, + "Error " "%d accounting shared subtree. Quota " - "is out of sync, rescan required.\n", - root->fs_info->sb->s_id, ret); + "is out of sync, rescan required.", + ret); } } ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0, 0); + root->root_key.objectid, level - 1, 0); BUG_ON(ret); /* -ENOMEM */ } btrfs_tree_unlock(next); @@ -8367,10 +8623,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = account_leaf_items(trans, root, eb); if (ret) { - printk_ratelimited(KERN_ERR "BTRFS: %s Error " + btrfs_err_rl(root->fs_info, + "error " "%d accounting leaf items. Quota " - "is out of sync, rescan required.\n", - root->fs_info->sb->s_id, ret); + "is out of sync, rescan required.", + ret); } } /* make block locked assertion in clean_tree_block happy */ @@ -8692,7 +8949,7 @@ out: if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); if (err && err != -EAGAIN) - btrfs_std_error(root->fs_info, err); + btrfs_std_error(root->fs_info, err, NULL); return err; } @@ -8880,7 +9137,7 @@ again: * back off and let this transaction commit */ mutex_lock(&root->fs_info->ro_block_group_mutex); - if (trans->transaction->dirty_bg_run) { + if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { u64 transid = trans->transid; mutex_unlock(&root->fs_info->ro_block_group_mutex); @@ -9630,6 +9887,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, cache); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(root, cache)) { + u64 new_bytes_used = size - bytes_used; + + bytes_used += new_bytes_used >> 1; + fragment_free_space(root, cache); + } +#endif /* * Call to ensure the corresponding space_info object is created and * assigned to our block group, but don't update its counters just yet. @@ -10370,8 +10635,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root) { percpu_counter_dec(&root->subv_writers->counter); /* - * Make sure counter is updated before we wake up - * waiters. + * Make sure counter is updated before we wake up waiters. */ smp_mb(); if (waitqueue_active(&root->subv_writers->wait)) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 032abfbebe76..9abe18763a7f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -96,8 +96,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, inode = tree->mapping->host; isize = i_size_read(inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - printk_ratelimited(KERN_DEBUG - "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n", + btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + "%s: ino %llu isize %llu odd range [%llu,%llu]", caller, btrfs_ino(inode), isize, start, end); } } @@ -131,6 +131,25 @@ struct extent_page_data { unsigned int sync_io:1; }; +static void add_extent_changeset(struct extent_state *state, unsigned bits, + struct extent_changeset *changeset, + int set) +{ + int ret; + + if (!changeset) + return; + if (set && (state->state & bits) == bits) + return; + if (!set && (state->state & bits) == 0) + return; + changeset->bytes_changed += state->end - state->start + 1; + ret = ulist_add(changeset->range_changed, state->start, state->end, + GFP_ATOMIC); + /* ENOMEM */ + BUG_ON(ret < 0); +} + static noinline void flush_write_bio(void *data); static inline struct btrfs_fs_info * tree_fs_info(struct extent_io_tree *tree) @@ -410,7 +429,8 @@ static void clear_state_cb(struct extent_io_tree *tree, } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, unsigned *bits); + struct extent_state *state, unsigned *bits, + struct extent_changeset *changeset); /* * insert an extent_state struct into the tree. 'bits' are set on the @@ -426,7 +446,7 @@ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, struct rb_node ***p, struct rb_node **parent, - unsigned *bits) + unsigned *bits, struct extent_changeset *changeset) { struct rb_node *node; @@ -436,7 +456,7 @@ static int insert_state(struct extent_io_tree *tree, state->start = start; state->end = end; - set_state_bits(tree, state, bits); + set_state_bits(tree, state, bits, changeset); node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); if (node) { @@ -511,7 +531,8 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits, int wake) + unsigned *bits, int wake, + struct extent_changeset *changeset) { struct extent_state *next; unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; @@ -522,6 +543,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, tree->dirty_bytes -= range; } clear_state_cb(tree, state, bits); + add_extent_changeset(state, bits_to_clear, changeset, 0); state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); @@ -569,10 +591,10 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) * * This takes the tree lock, and returns 0 on success and < 0 on error. */ -int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, int wake, int delete, - struct extent_state **cached_state, - gfp_t mask) +static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached_state, + gfp_t mask, struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; @@ -671,7 +693,8 @@ hit_next: if (err) goto out; if (state->end <= end) { - state = clear_state_bit(tree, state, &bits, wake); + state = clear_state_bit(tree, state, &bits, wake, + changeset); goto next; } goto search_again; @@ -692,13 +715,13 @@ hit_next: if (wake) wake_up(&state->wq); - clear_state_bit(tree, prealloc, &bits, wake); + clear_state_bit(tree, prealloc, &bits, wake, changeset); prealloc = NULL; goto out; } - state = clear_state_bit(tree, state, &bits, wake); + state = clear_state_bit(tree, state, &bits, wake, changeset); next: if (last_end == (u64)-1) goto out; @@ -789,7 +812,7 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - unsigned *bits) + unsigned *bits, struct extent_changeset *changeset) { unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; @@ -798,6 +821,7 @@ static void set_state_bits(struct extent_io_tree *tree, u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } + add_extent_changeset(state, bits_to_set, changeset, 1); state->state |= bits_to_set; } @@ -835,7 +859,7 @@ static int __must_check __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned exclusive_bits, u64 *failed_start, struct extent_state **cached_state, - gfp_t mask) + gfp_t mask, struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -873,7 +897,7 @@ again: prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits); + &p, &parent, &bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -899,7 +923,7 @@ hit_next: goto out; } - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -945,7 +969,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -980,7 +1004,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits); + NULL, NULL, &bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -1008,7 +1032,7 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits); + set_state_bits(tree, prealloc, &bits, changeset); cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; @@ -1038,7 +1062,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { return __set_extent_bit(tree, start, end, bits, 0, failed_start, - cached_state, mask); + cached_state, mask, NULL); } @@ -1111,7 +1135,7 @@ again: goto out; } err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits); + &p, &parent, &bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1130,9 +1154,9 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0, NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1171,9 +1195,10 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits); + set_state_bits(tree, state, &bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0); + state = clear_state_bit(tree, state, &clear_bits, 0, + NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1208,7 +1233,7 @@ hit_next: * the later extent. */ err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits); + NULL, NULL, &bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1233,9 +1258,9 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits); + set_state_bits(tree, prealloc, &bits, NULL); cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, &clear_bits, 0); + clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); prealloc = NULL; goto out; } @@ -1274,6 +1299,30 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, NULL, mask); } +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset) +{ + /* + * We don't support EXTENT_LOCKED yet, as current changeset will + * record any bits changed, so for EXTENT_LOCKED case, it will + * either fail with -EEXIST or changeset will record the whole + * range. + */ + BUG_ON(bits & EXTENT_LOCKED); + + return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask, + changeset); +} + +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, int wake, int delete, + struct extent_state **cached, gfp_t mask) +{ + return __clear_extent_bit(tree, start, end, bits, wake, delete, + cached, mask, NULL); +} + int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask) { @@ -1285,6 +1334,20 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); } +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset) +{ + /* + * Don't support EXTENT_LOCKED case, same reason as + * set_record_extent_bits(). + */ + BUG_ON(bits & EXTENT_LOCKED); + + return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask, + changeset); +} + int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state, gfp_t mask) { @@ -1343,7 +1406,7 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, while (1) { err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, EXTENT_LOCKED, &failed_start, - cached_state, GFP_NOFS); + cached_state, GFP_NOFS, NULL); if (err == -EEXIST) { wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); start = failed_start; @@ -1365,7 +1428,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, - &failed_start, NULL, GFP_NOFS); + &failed_start, NULL, GFP_NOFS, NULL); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, @@ -2078,8 +2141,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } - printk_ratelimited_in_rcu(KERN_INFO - "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n", + btrfs_info_rl_in_rcu(fs_info, + "read error corrected: ino %llu off %llu (dev %s sector %llu)", btrfs_ino(inode), start, rcu_str_deref(dev->name), sector); bio_put(bio); @@ -3070,8 +3133,12 @@ static int __do_readpage(struct extent_io_tree *tree, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - unlock_extent_cached(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); + if (parent_locked) + free_extent_state(cached); + else + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); cur = cur + iosize; pg_offset += iosize; continue; @@ -5566,13 +5633,15 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " - "len %lu dst len %lu\n", src_offset, len, dst->len); + btrfs_err(dst->fs_info, + "memmove bogus src_offset %lu move " + "len %lu dst len %lu", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " - "len %lu dst len %lu\n", dst_offset, len, dst->len); + btrfs_err(dst->fs_info, + "memmove bogus dst_offset %lu move " + "len %lu dst len %lu", dst_offset, len, dst->len); BUG_ON(1); } @@ -5612,13 +5681,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " - "len %lu len %lu\n", src_offset, len, dst->len); + btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move " + "len %lu len %lu", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " - "len %lu len %lu\n", dst_offset, len, dst->len); + btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move " + "len %lu len %lu", dst_offset, len, dst->len); BUG_ON(1); } if (dst_offset < src_offset) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c668f36898d3..f4c1ae11855f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -2,6 +2,7 @@ #define __EXTENTIO__ #include <linux/rbtree.h> +#include "ulist.h" /* bits for the extent state */ #define EXTENT_DIRTY (1U << 0) @@ -18,6 +19,7 @@ #define EXTENT_NEED_WAIT (1U << 13) #define EXTENT_DAMAGED (1U << 14) #define EXTENT_NORESERVE (1U << 15) +#define EXTENT_QGROUP_RESERVED (1U << 16) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) @@ -161,6 +163,17 @@ struct extent_buffer { #endif }; +/* + * Structure to record how many bytes and which ranges are set/cleared + */ +struct extent_changeset { + /* How many bytes are set/cleared in this operation */ + u64 bytes_changed; + + /* Changed ranges */ + struct ulist *range_changed; +}; + static inline void extent_set_compress_type(unsigned long *bio_flags, int compress_type) { @@ -210,11 +223,17 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask); +int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask); +int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + unsigned bits, gfp_t mask, + struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8c6f247ba81d..6bd5ce9d75f0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -847,7 +847,7 @@ next_slot: disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, - start - extent_offset, 1); + start - extent_offset); BUG_ON(ret); /* -ENOMEM */ } key.offset = start; @@ -925,7 +925,7 @@ delete_extent_item: disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - - extent_offset, 0); + extent_offset); BUG_ON(ret); /* -ENOMEM */ inode_sub_bytes(inode, extent_end - key.offset); @@ -1204,7 +1204,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 1); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ if (split == start) { @@ -1231,7 +1231,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ } other_start = 0; @@ -1248,7 +1248,7 @@ again: del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 0, root->root_key.objectid, - ino, orig_offset, 0); + ino, orig_offset); BUG_ON(ret); /* -ENOMEM */ } if (del_nr == 0) { @@ -1469,7 +1469,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, u64 release_bytes = 0; u64 lockstart; u64 lockend; - unsigned long first_index; size_t num_written = 0; int nrptrs; int ret = 0; @@ -1485,8 +1484,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, if (!pages) return -ENOMEM; - first_index = pos >> PAGE_CACHE_SHIFT; - while (iov_iter_count(i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); size_t write_bytes = min(iov_iter_count(i), @@ -1510,12 +1507,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes); - if (ret == -ENOSPC && - (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | - BTRFS_INODE_PREALLOC))) { + + if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) { ret = check_can_nocow(inode, pos, &write_bytes); + if (ret < 0) + break; if (ret > 0) { + /* + * For nodata cow case, no need to reserve + * data space. + */ only_release_metadata = true; /* * our prealloc extent may be smaller than @@ -1524,20 +1526,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_CACHE_SIZE); reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - ret = 0; - } else { - ret = -ENOSPC; + goto reserve_metadata; } } - - if (ret) + ret = btrfs_check_data_free_space(inode, pos, write_bytes); + if (ret < 0) break; +reserve_metadata: ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); if (ret) { if (!only_release_metadata) - btrfs_free_reserved_data_space(inode, - reserve_bytes); + btrfs_free_reserved_data_space(inode, pos, + write_bytes); else btrfs_end_write_no_snapshoting(root); break; @@ -1603,12 +1604,17 @@ again: BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); } - if (only_release_metadata) + if (only_release_metadata) { btrfs_delalloc_release_metadata(inode, release_bytes); - else - btrfs_delalloc_release_space(inode, + } else { + u64 __pos; + + __pos = round_down(pos, root->sectorsize) + + (dirty_pages << PAGE_CACHE_SHIFT); + btrfs_delalloc_release_space(inode, __pos, release_bytes); + } } release_bytes = dirty_pages << PAGE_CACHE_SHIFT; @@ -1660,7 +1666,7 @@ again: btrfs_end_write_no_snapshoting(root); btrfs_delalloc_release_metadata(inode, release_bytes); } else { - btrfs_delalloc_release_space(inode, release_bytes); + btrfs_delalloc_release_space(inode, pos, release_bytes); } } @@ -2266,7 +2272,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 drop_end; int ret = 0; int err = 0; - int rsv_count; + unsigned int rsv_count; bool same_page; bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); u64 ino_size; @@ -2488,6 +2494,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) trans->block_rsv = &root->fs_info->trans_block_rsv; /* + * If we are using the NO_HOLES feature we might have had already an + * hole that overlaps a part of the region [lockstart, lockend] and + * ends at (or beyond) lockend. Since we have no file extent items to + * represent holes, drop_end can be less than lockend and so we must + * make sure we have an extent map representing the existing hole (the + * call to __btrfs_drop_extents() might have dropped the existing extent + * map representing the existing hole), otherwise the fast fsync path + * will not record the existence of the hole region + * [existing_hole_start, lockend]. + */ + if (drop_end <= lockend) + drop_end = lockend + 1; + /* * Don't insert file hole extent item if it's for a range beyond eof * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). @@ -2541,17 +2560,61 @@ out_only_mutex: return err; } +/* Helper structure to record which range is already reserved */ +struct falloc_range { + struct list_head list; + u64 start; + u64 len; +}; + +/* + * Helper function to add falloc range + * + * Caller should have locked the larger range of extent containing + * [start, len) + */ +static int add_falloc_range(struct list_head *head, u64 start, u64 len) +{ + struct falloc_range *prev = NULL; + struct falloc_range *range = NULL; + + if (list_empty(head)) + goto insert; + + /* + * As fallocate iterate by bytenr order, we only need to check + * the last range. + */ + prev = list_entry(head->prev, struct falloc_range, list); + if (prev->start + prev->len == start) { + prev->len += len; + return 0; + } +insert: + range = kmalloc(sizeof(*range), GFP_NOFS); + if (!range) + return -ENOMEM; + range->start = start; + range->len = len; + list_add_tail(&range->list, head); + return 0; +} + static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; + struct falloc_range *range; + struct falloc_range *tmp; + struct list_head reserve_list; u64 cur_offset; u64 last_byte; u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; u64 locked_end; + u64 actual_end = 0; struct extent_map *em; int blocksize = BTRFS_I(inode)->root->sectorsize; int ret; @@ -2567,11 +2630,12 @@ static long btrfs_fallocate(struct file *file, int mode, return btrfs_punch_hole(inode, offset, len); /* - * Make sure we have enough space before we do the - * allocation. + * Only trigger disk allocation, don't trigger qgroup reserve + * + * For qgroup space, it will be checked later. */ - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start); - if (ret) + ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start); + if (ret < 0) return ret; mutex_lock(&inode->i_mutex); @@ -2579,6 +2643,13 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret) goto out; + /* + * TODO: Move these two operations after we have checked + * accurate reserved space, or fallocate can still fail but + * with page truncated or size expanded. + * + * But that's a minor problem and won't do much harm BTW. + */ if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, i_size_read(inode), alloc_start); @@ -2637,10 +2708,10 @@ static long btrfs_fallocate(struct file *file, int mode, } } + /* First, check if we exceed the qgroup limit */ + INIT_LIST_HEAD(&reserve_list); cur_offset = alloc_start; while (1) { - u64 actual_end; - em = btrfs_get_extent(inode, NULL, 0, cur_offset, alloc_end - cur_offset, 0); if (IS_ERR_OR_NULL(em)) { @@ -2653,57 +2724,82 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte = min(extent_map_end(em), alloc_end); actual_end = min_t(u64, extent_map_end(em), offset + len); last_byte = ALIGN(last_byte, blocksize); - if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = btrfs_prealloc_file_range(inode, mode, cur_offset, - last_byte - cur_offset, - 1 << inode->i_blkbits, - offset + len, - &alloc_hint); - } else if (actual_end > inode->i_size && - !(mode & FALLOC_FL_KEEP_SIZE)) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(inode)->root; - - /* - * We didn't need to allocate any more space, but we - * still extended the size of the file so we need to - * update i_size and the inode item. - */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - } else { - inode->i_ctime = CURRENT_TIME; - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, - NULL); - ret = btrfs_update_inode(trans, root, inode); - if (ret) - btrfs_end_transaction(trans, root); - else - ret = btrfs_end_transaction(trans, - root); + ret = add_falloc_range(&reserve_list, cur_offset, + last_byte - cur_offset); + if (ret < 0) { + free_extent_map(em); + break; } + ret = btrfs_qgroup_reserve_data(inode, cur_offset, + last_byte - cur_offset); + if (ret < 0) + break; } free_extent_map(em); - if (ret < 0) - break; - cur_offset = last_byte; - if (cur_offset >= alloc_end) { - ret = 0; + if (cur_offset >= alloc_end) break; + } + + /* + * If ret is still 0, means we're OK to fallocate. + * Or just cleanup the list and exit. + */ + list_for_each_entry_safe(range, tmp, &reserve_list, list) { + if (!ret) + ret = btrfs_prealloc_file_range(inode, mode, + range->start, + range->len, 1 << inode->i_blkbits, + offset + len, &alloc_hint); + list_del(&range->list); + kfree(range); + } + if (ret < 0) + goto out_unlock; + + if (actual_end > inode->i_size && + !(mode & FALLOC_FL_KEEP_SIZE)) { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* + * We didn't need to allocate any more space, but we + * still extended the size of the file so we need to + * update i_size and the inode item. + */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + inode->i_ctime = CURRENT_TIME; + i_size_write(inode, actual_end); + btrfs_ordered_update_i_size(inode, actual_end, NULL); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_end_transaction(trans, root); + else + ret = btrfs_end_transaction(trans, root); } } +out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); out: + /* + * As we waited the extent range, the data_rsv_map must be empty + * in the range, as written data range will be released from it. + * And for prealloacted extent, it will also be released when + * its metadata is written. + * So this is completely used as cleanup. + */ + btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); mutex_unlock(&inode->i_mutex); /* Let go of our reservation. */ - btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); + btrfs_free_reserved_data_space(inode, alloc_start, + alloc_end - alloc_start); return ret; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ed05da1b977e..85a1f8621b51 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -450,9 +450,9 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) gen = io_ctl->cur; if (le64_to_cpu(*gen) != generation) { - printk_ratelimited(KERN_ERR "BTRFS: space cache generation " - "(%Lu) does not match inode (%Lu)\n", *gen, - generation); + btrfs_err_rl(io_ctl->root->fs_info, + "space cache generation (%llu) does not match inode (%llu)", + *gen, generation); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -506,8 +506,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); if (val != crc) { - printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free " - "space cache\n"); + btrfs_err_rl(io_ctl->root->fs_info, + "csum mismatch on free space cache"); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -1215,7 +1215,7 @@ out: * @offset - the offset for the key we'll insert * * This function writes out a free space cache struct to disk for quick recovery - * on mount. This will return 0 if it was successfull in writing the cache out, + * on mount. This will return 0 if it was successful in writing the cache out, * or an errno if it was not. */ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, @@ -1730,7 +1730,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, */ static int search_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info, u64 *offset, - u64 *bytes) + u64 *bytes, bool for_alloc) { unsigned long found_bits = 0; unsigned long max_bits = 0; @@ -1738,11 +1738,26 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, unsigned long next_zero; unsigned long extent_bits; + /* + * Skip searching the bitmap if we don't have a contiguous section that + * is large enough for this allocation. + */ + if (for_alloc && + bitmap_info->max_extent_size && + bitmap_info->max_extent_size < *bytes) { + *bytes = bitmap_info->max_extent_size; + return -1; + } + i = offset_to_bit(bitmap_info->offset, ctl->unit, max_t(u64, *offset, bitmap_info->offset)); bits = bytes_to_bits(*bytes, ctl->unit); for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { + if (for_alloc && bits == 1) { + found_bits = 1; + break; + } next_zero = find_next_zero_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); extent_bits = next_zero - i; @@ -1762,6 +1777,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, } *bytes = (u64)(max_bits) * ctl->unit; + bitmap_info->max_extent_size = *bytes; return -1; } @@ -1813,7 +1829,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, if (entry->bitmap) { u64 size = *bytes; - ret = search_bitmap(ctl, entry, &tmp, &size); + ret = search_bitmap(ctl, entry, &tmp, &size, true); if (!ret) { *offset = tmp; *bytes = size; @@ -1874,7 +1890,8 @@ again: search_start = *offset; search_bytes = ctl->unit; search_bytes = min(search_bytes, end - search_start + 1); - ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); + ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes, + false); if (ret < 0 || search_start != *offset) return -EINVAL; @@ -1919,7 +1936,7 @@ again: search_start = *offset; search_bytes = ctl->unit; ret = search_bitmap(ctl, bitmap_info, &search_start, - &search_bytes); + &search_bytes, false); if (ret < 0 || search_start != *offset) return -EAGAIN; @@ -1943,6 +1960,12 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, bitmap_set_bits(ctl, info, offset, bytes_to_set); + /* + * We set some bytes, we have no idea what the max extent size is + * anymore. + */ + info->max_extent_size = 0; + return bytes_to_set; } @@ -1951,12 +1974,19 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_block_group_cache *block_group = ctl->private; + bool forced = false; + +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root, + block_group)) + forced = true; +#endif /* * If we are below the extents threshold then we can add this as an * extent, and don't have to deal with the bitmap */ - if (ctl->free_extents < ctl->extents_thresh) { + if (!forced && ctl->free_extents < ctl->extents_thresh) { /* * If this block group has some small extents we don't want to * use up all of our free slots in the cache with them, we want @@ -2661,7 +2691,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, search_start = min_start; search_bytes = bytes; - err = search_bitmap(ctl, entry, &search_start, &search_bytes); + err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); if (err) { if (search_bytes > *max_extent_size) *max_extent_size = search_bytes; @@ -2775,6 +2805,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, unsigned long want_bits; unsigned long min_bits; unsigned long found_bits; + unsigned long max_bits = 0; unsigned long start = 0; unsigned long total_found = 0; int ret; @@ -2784,6 +2815,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, want_bits = bytes_to_bits(bytes, ctl->unit); min_bits = bytes_to_bits(min_bytes, ctl->unit); + /* + * Don't bother looking for a cluster in this bitmap if it's heavily + * fragmented. + */ + if (entry->max_extent_size && + entry->max_extent_size < cont1_bytes) + return -ENOSPC; again: found_bits = 0; for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { @@ -2791,13 +2829,19 @@ again: BITS_PER_BITMAP, i); if (next_zero - i >= min_bits) { found_bits = next_zero - i; + if (found_bits > max_bits) + max_bits = found_bits; break; } + if (next_zero - i > max_bits) + max_bits = next_zero - i; i = next_zero; } - if (!found_bits) + if (!found_bits) { + entry->max_extent_size = (u64)max_bits * ctl->unit; return -ENOSPC; + } if (!total_found) { start = i; @@ -3056,6 +3100,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) spin_lock_init(&cluster->refill_lock); cluster->root = RB_ROOT; cluster->max_size = 0; + cluster->fragmented = false; INIT_LIST_HEAD(&cluster->block_group_list); cluster->block_group = NULL; } @@ -3223,7 +3268,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, } bytes = minlen; - ret2 = search_bitmap(ctl, entry, &start, &bytes); + ret2 = search_bitmap(ctl, entry, &start, &bytes, false); if (ret2 || start >= end) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); @@ -3376,7 +3421,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root) u64 count = 1; int ret; - ret = search_bitmap(ctl, entry, &offset, &count); + ret = search_bitmap(ctl, entry, &offset, &count, true); /* Logic error; Should be empty if it can't find anything */ ASSERT(!ret); @@ -3532,6 +3577,7 @@ again: spin_lock(&ctl->tree_lock); info->offset = offset; info->bytes = bytes; + info->max_extent_size = 0; ret = link_free_space(ctl, info); spin_unlock(&ctl->tree_lock); if (ret) @@ -3559,6 +3605,7 @@ again: } bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes -= bytes_added; offset += bytes_added; spin_unlock(&ctl->tree_lock); @@ -3602,7 +3649,7 @@ have_info: bit_off = offset; bit_bytes = ctl->unit; - ret = search_bitmap(ctl, info, &bit_off, &bit_bytes); + ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false); if (!ret) { if (bit_off == offset) { ret = 1; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index a16a029ad3b1..f251865eb6f3 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -23,6 +23,7 @@ struct btrfs_free_space { struct rb_node offset_index; u64 offset; u64 bytes; + u64 max_extent_size; unsigned long *bitmap; struct list_head list; }; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 265e03c73f4d..be4d22a5022f 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, */ if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) { - btrfs_std_error(root->fs_info, -ENOENT); + btrfs_std_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; goto out; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d4a582ac3f73..767a6056ac45 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -488,17 +488,17 @@ again: /* Just to make sure we have enough space */ prealloc += 8 * PAGE_CACHE_SIZE; - ret = btrfs_delalloc_reserve_space(inode, prealloc); + ret = btrfs_delalloc_reserve_space(inode, 0, prealloc); if (ret) goto out_put; ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_space(inode, prealloc); + btrfs_delalloc_release_space(inode, 0, prealloc); goto out_put; } - btrfs_free_reserved_data_space(inode, prealloc); + btrfs_free_reserved_data_space(inode, 0, prealloc); ret = btrfs_write_out_ino_cache(root, trans, path, inode); out_put: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 611b66d73e80..4439fbb4ff45 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); out: + /* + * Don't forget to free the reserved space, as for inlined extent + * it won't count as data extent, free them directly here. + * And at reserve time, it's always aligned to page size, so + * just free one page here. + */ + btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE); btrfs_free_path(path); btrfs_end_transaction(trans, root); return ret; @@ -1096,6 +1103,9 @@ static noinline void async_cow_submit(struct btrfs_work *work) nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> PAGE_CACHE_SHIFT; + /* + * atomic_sub_return implies a barrier for waitqueue_active + */ if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < 5 * 1024 * 1024 && waitqueue_active(&root->fs_info->async_submit_wait)) @@ -1766,7 +1776,8 @@ static void btrfs_clear_bit_hook(struct inode *inode, if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && do_list && !(state->state & EXTENT_NORESERVE)) - btrfs_free_reserved_data_space(inode, len); + btrfs_free_reserved_data_space_noquota(inode, + state->start, len); __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, root->fs_info->delalloc_batch); @@ -1861,15 +1872,15 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; + enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; int ret = 0; int skip_sum; - int metadata = 0; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; if (btrfs_is_free_space_inode(inode)) - metadata = 2; + metadata = BTRFS_WQ_ENDIO_FREE_SPACE; if (!(rw & REQ_WRITE)) { ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); @@ -1989,7 +2000,8 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, page_start, + PAGE_CACHE_SIZE); if (ret) { mapping_set_error(page->mapping, ret); end_extent_writepage(page, ret, page_start, page_end); @@ -2115,7 +2127,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_alloc_reserved_file_extent(trans, root, root->root_key.objectid, - btrfs_ino(inode), file_pos, &ins); + btrfs_ino(inode), file_pos, + ram_bytes, &ins); + /* + * Release the reserved range from inode dirty range map, as it is + * already moved into delayed_ref_head + */ + btrfs_qgroup_release_data(inode, file_pos, ram_bytes); out: btrfs_free_path(path); @@ -2573,7 +2591,7 @@ again: ret = btrfs_inc_extent_ref(trans, root, new->bytenr, new->disk_len, 0, backref->root_id, backref->inum, - new->file_pos, 0); /* start - extent_offset */ + new->file_pos); /* start - extent_offset */ if (ret) { btrfs_abort_transaction(trans, root, ret); goto out_free_path; @@ -2599,7 +2617,6 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) return; list_for_each_entry_safe(old, tmp, &new->head, list) { - list_del(&old->list); kfree(old); } kfree(new); @@ -2824,6 +2841,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ + + /* + * For mwrite(mmap + memset to write) case, we still reserve + * space for NOCOW range. + * As NOCOW won't cause a new delayed ref, just free the space + */ + btrfs_qgroup_free_data(inode, ordered_extent->file_offset, + ordered_extent->len); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (nolock) trans = btrfs_join_transaction_nolock(root); @@ -3018,8 +3043,6 @@ static int __readpage_endio_check(struct inode *inode, char *kaddr; u32 csum_expected; u32 csum = ~(u32)0; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); csum_expected = *(((u32 *)io_bio->csum) + icsum); @@ -3032,9 +3055,8 @@ static int __readpage_endio_check(struct inode *inode, kunmap_atomic(kaddr); return 0; zeroit: - if (__ratelimit(&_rs)) - btrfs_warn(BTRFS_I(inode)->root->fs_info, - "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_warn_rl(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu off %llu csum %u expected csum %u", btrfs_ino(inode), start, csum, csum_expected); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); @@ -4217,6 +4239,47 @@ static int truncate_space_check(struct btrfs_trans_handle *trans, } +static int truncate_inline_extent(struct inode *inode, + struct btrfs_path *path, + struct btrfs_key *found_key, + const u64 item_end, + const u64 new_size) +{ + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *fi; + u32 size = (u32)(new_size - found_key->offset); + struct btrfs_root *root = BTRFS_I(inode)->root; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { + loff_t offset = new_size; + loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE); + + /* + * Zero out the remaining of the last page of our inline extent, + * instead of directly truncating our inline extent here - that + * would be much more complex (decompressing all the data, then + * compressing the truncated data, which might be bigger than + * the size of the inline extent, resize the extent, etc). + * We release the path because to get the page we might need to + * read the extent item from disk (data not in the page cache). + */ + btrfs_release_path(path); + return btrfs_truncate_page(inode, offset, page_end - offset, 0); + } + + btrfs_set_file_extent_ram_bytes(leaf, fi, size); + size = btrfs_file_extent_calc_inline_size(size); + btrfs_truncate_item(root, path, size, 1); + + if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + inode_sub_bytes(inode, item_end + 1 - new_size); + + return 0; +} + /* * this can truncate away extent items, csum items and directory items. * It starts at a high offset and removes keys until it can't find @@ -4411,27 +4474,40 @@ search_again: * special encodings */ if (!del_item && - btrfs_file_extent_compression(leaf, fi) == 0 && btrfs_file_extent_encryption(leaf, fi) == 0 && btrfs_file_extent_other_encoding(leaf, fi) == 0) { - u32 size = new_size - found_key.offset; - - if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) - inode_sub_bytes(inode, item_end + 1 - - new_size); /* - * update the ram bytes to properly reflect - * the new size of our item + * Need to release path in order to truncate a + * compressed extent. So delete any accumulated + * extent items so far. */ - btrfs_set_file_extent_ram_bytes(leaf, fi, size); - size = - btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(root, path, size, 1); + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE && pending_del_nr) { + err = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + if (err) { + btrfs_abort_transaction(trans, + root, + err); + goto error; + } + pending_del_nr = 0; + } + + err = truncate_inline_extent(inode, path, + &found_key, + item_end, + new_size); + if (err) { + btrfs_abort_transaction(trans, + root, err); + goto error; + } } else if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) { - inode_sub_bytes(inode, item_end + 1 - - found_key.offset); + inode_sub_bytes(inode, item_end + 1 - new_size); } } delete: @@ -4461,7 +4537,7 @@ delete: ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), - ino, extent_offset, 0); + ino, extent_offset); BUG_ON(ret); if (btrfs_should_throttle_delayed_refs(trans, root)) btrfs_async_run_delayed_refs(root, @@ -4575,14 +4651,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len, if ((offset & (blocksize - 1)) == 0 && (!len || ((len & (blocksize - 1)) == 0))) goto out; - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, + round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE); if (ret) goto out; again: page = find_or_create_page(mapping, index, mask); if (!page) { - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, + round_down(from, PAGE_CACHE_SIZE), + PAGE_CACHE_SIZE); ret = -ENOMEM; goto out; } @@ -4650,7 +4729,8 @@ again: out_unlock: if (ret) - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, page_start, + PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); out: @@ -5048,6 +5128,18 @@ static void evict_inode_truncate_pages(struct inode *inode) spin_unlock(&io_tree->lock); lock_extent_bits(io_tree, start, end, 0, &cached_state); + + /* + * If still has DELALLOC flag, the extent didn't reach disk, + * and its reserved space won't be freed by delayed_ref. + * So we need to free its reserved space here. + * (Refer to comment in btrfs_invalidatepage, case 2) + * + * Note, end is the bytenr of last byte, so we need + 1 here. + */ + if (state->state & EXTENT_DELALLOC) + btrfs_qgroup_free_data(inode, start, end - start + 1); + clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | @@ -7581,7 +7673,7 @@ unlock: spin_unlock(&BTRFS_I(inode)->lock); } - btrfs_free_reserved_data_space(inode, len); + btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; current->journal_info = dio_data; @@ -8371,7 +8463,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, mutex_unlock(&inode->i_mutex); relock = true; } - ret = btrfs_delalloc_reserve_space(inode, count); + ret = btrfs_delalloc_reserve_space(inode, offset, count); if (ret) goto out; dio_data.outstanding_extents = div64_u64(count + @@ -8400,10 +8492,10 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) - btrfs_delalloc_release_space(inode, - dio_data.reserve); + btrfs_delalloc_release_space(inode, offset, + dio_data.reserve); } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(inode, + btrfs_delalloc_release_space(inode, offset, count - (size_t)ret); } out: @@ -8562,6 +8654,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, } } + /* + * Qgroup reserved space handler + * Page here will be either + * 1) Already written to disk + * In this case, its reserved space is released from data rsv map + * and will be freed by delayed_ref handler finally. + * So even we call qgroup_free_data(), it won't decrease reserved + * space. + * 2) Not written to disk + * This means the reserved space should be freed here. + */ + btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | @@ -8612,7 +8716,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_end; sb_start_pagefault(inode->i_sb); - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + ret = btrfs_delalloc_reserve_space(inode, page_start, + PAGE_CACHE_SIZE); if (!ret) { ret = file_update_time(vma->vm_file); reserved = 1; @@ -8631,8 +8739,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) again: lock_page(page); size = i_size_read(inode); - page_start = page_offset(page); - page_end = page_start + PAGE_CACHE_SIZE - 1; if ((page->mapping != inode->i_mapping) || (page_start >= size)) { @@ -8709,7 +8815,7 @@ out_unlock: } unlock_page(page); out: - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE); out_noreserve: sb_end_pagefault(inode->i_sb); return ret; @@ -8998,6 +9104,7 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } + btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); free: @@ -9634,6 +9741,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 cur_offset = start; u64 i_size; u64 cur_bytes; + u64 last_alloc = (u64)-1; int ret = 0; bool own_trans = true; @@ -9650,6 +9758,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); cur_bytes = max(cur_bytes, min_size); + /* + * If we are severely fragmented we could end up with really + * small allocations, so if the allocator is returning small + * chunks lets make its job easier by only searching for those + * sized chunks. + */ + cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) { @@ -9658,6 +9773,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, break; } + last_alloc = ins.offset; ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8d20f3b1cab0..da94138eb85e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1120,7 +1120,8 @@ static int cluster_pages_for_defrag(struct inode *inode, page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); ret = btrfs_delalloc_reserve_space(inode, - page_cnt << PAGE_CACHE_SHIFT); + start_index << PAGE_CACHE_SHIFT, + page_cnt << PAGE_CACHE_SHIFT); if (ret) return ret; i_done = 0; @@ -1210,7 +1211,8 @@ again: BTRFS_I(inode)->outstanding_extents++; spin_unlock(&BTRFS_I(inode)->lock); btrfs_delalloc_release_space(inode, - (page_cnt - i_done) << PAGE_CACHE_SHIFT); + start_index << PAGE_CACHE_SHIFT, + (page_cnt - i_done) << PAGE_CACHE_SHIFT); } @@ -1235,7 +1237,9 @@ out: unlock_page(pages[i]); page_cache_release(pages[i]); } - btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT); + btrfs_delalloc_release_space(inode, + start_index << PAGE_CACHE_SHIFT, + page_cnt << PAGE_CACHE_SHIFT); return ret; } @@ -1342,7 +1346,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, break; if (btrfs_defrag_cancelled(root->fs_info)) { - printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n"); + btrfs_debug(root->fs_info, "defrag_file cancelled"); ret = -EAGAIN; break; } @@ -1579,7 +1583,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = div_u64(new_size, root->sectorsize); new_size *= root->sectorsize; - printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", + btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu", rcu_str_deref(device->name), new_size); if (new_size > old_size) { @@ -2081,7 +2085,7 @@ static noinline int search_ioctl(struct inode *inode, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "BTRFS: could not find root %llu\n", + btrfs_err(info, "could not find root %llu", sk->tree_id); btrfs_free_path(path); return -ENOENT; @@ -2221,7 +2225,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; root = btrfs_read_fs_root_no_name(info, &key); if (IS_ERR(root)) { - printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id); + btrfs_err(info, "could not find root %llu", tree_id); ret = -ENOENT; goto out; } @@ -2699,7 +2703,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; - struct btrfs_device *next; struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; int ret = 0; @@ -2711,7 +2714,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) fi_args->num_devices = fs_devices->num_devices; memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); - list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->devid > fi_args->max_id) fi_args->max_id = device->devid; } @@ -3203,41 +3206,6 @@ out: return ret; } -/* Helper to check and see if this root currently has a ref on the given disk - * bytenr. If it does then we need to update the quota for this root. This - * doesn't do anything if quotas aren't enabled. - */ -static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 disko) -{ - struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); - struct ulist *roots; - struct ulist_iterator uiter; - struct ulist_node *root_node = NULL; - int ret; - - if (!root->fs_info->quota_enabled) - return 1; - - btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); - ret = btrfs_find_all_roots(trans, root->fs_info, disko, - tree_mod_seq_elem.seq, &roots); - if (ret < 0) - goto out; - ret = 0; - ULIST_ITER_INIT(&uiter); - while ((root_node = ulist_next(roots, &uiter))) { - if (root_node->val == root->objectid) { - ret = 1; - break; - } - } - ulist_free(roots); -out: - btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem); - return ret; -} - static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, @@ -3328,6 +3296,150 @@ static void clone_update_extent_map(struct inode *inode, &BTRFS_I(inode)->runtime_flags); } +/* + * Make sure we do not end up inserting an inline extent into a file that has + * already other (non-inline) extents. If a file has an inline extent it can + * not have any other extents and the (single) inline extent must start at the + * file offset 0. Failing to respect these rules will lead to file corruption, + * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc + * + * We can have extents that have been already written to disk or we can have + * dirty ranges still in delalloc, in which case the extent maps and items are + * created only when we run delalloc, and the delalloc ranges might fall outside + * the range we are currently locking in the inode's io tree. So we check the + * inode's i_size because of that (i_size updates are done while holding the + * i_mutex, which we are holding here). + * We also check to see if the inode has a size not greater than "datal" but has + * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are + * protected against such concurrent fallocate calls by the i_mutex). + * + * If the file has no extents but a size greater than datal, do not allow the + * copy because we would need turn the inline extent into a non-inline one (even + * with NO_HOLES enabled). If we find our destination inode only has one inline + * extent, just overwrite it with the source inline extent if its size is less + * than the source extent's size, or we could copy the source inline extent's + * data into the destination inode's inline extent if the later is greater then + * the former. + */ +static int clone_copy_inline_extent(struct inode *src, + struct inode *dst, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct btrfs_key *new_key, + const u64 drop_start, + const u64 datal, + const u64 skip, + const u64 size, + char *inline_data) +{ + struct btrfs_root *root = BTRFS_I(dst)->root; + const u64 aligned_end = ALIGN(new_key->offset + datal, + root->sectorsize); + int ret; + struct btrfs_key key; + + if (new_key->offset > 0) + return -EOPNOTSUPP; + + key.objectid = btrfs_ino(dst); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + goto copy_inline_extent; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid == btrfs_ino(dst) && + key.type == BTRFS_EXTENT_DATA_KEY) { + ASSERT(key.offset > 0); + return -EOPNOTSUPP; + } + } else if (i_size_read(dst) <= datal) { + struct btrfs_file_extent_item *ei; + u64 ext_len; + + /* + * If the file size is <= datal, make sure there are no other + * extents following (can happen do to an fallocate call with + * the flag FALLOC_FL_KEEP_SIZE). + */ + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + /* + * If it's an inline extent, it can not have other extents + * following it. + */ + if (btrfs_file_extent_type(path->nodes[0], ei) == + BTRFS_FILE_EXTENT_INLINE) + goto copy_inline_extent; + + ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei); + if (ext_len > aligned_end) + return -EOPNOTSUPP; + + ret = btrfs_next_item(root, path); + if (ret < 0) { + return ret; + } else if (ret == 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid == btrfs_ino(dst) && + key.type == BTRFS_EXTENT_DATA_KEY) + return -EOPNOTSUPP; + } + } + +copy_inline_extent: + /* + * We have no extent items, or we have an extent at offset 0 which may + * or may not be inlined. All these cases are dealt the same way. + */ + if (i_size_read(dst) > datal) { + /* + * If the destination inode has an inline extent... + * This would require copying the data from the source inline + * extent into the beginning of the destination's inline extent. + * But this is really complex, both extents can be compressed + * or just one of them, which would require decompressing and + * re-compressing data (which could increase the new compressed + * size, not allowing the compressed data to fit anymore in an + * inline extent). + * So just don't support this case for now (it should be rare, + * we are not really saving space when cloning inline extents). + */ + return -EOPNOTSUPP; + } + + btrfs_release_path(path); + ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); + if (ret) + return ret; + ret = btrfs_insert_empty_item(trans, root, path, new_key, size); + if (ret) + return ret; + + if (skip) { + const u32 start = btrfs_file_extent_calc_inline_size(0); + + memmove(inline_data + start, inline_data + start + skip, datal); + } + + write_extent_buffer(path->nodes[0], inline_data, + btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]), + size); + inode_add_bytes(dst, datal); + + return 0; +} + /** * btrfs_clone() - clone a range from inode file to another * @@ -3352,9 +3464,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u32 nritems; int slot; int ret; - int no_quota; const u64 len = olen_aligned; - u64 last_disko = 0; u64 last_dest_end = destoff; ret = -ENOMEM; @@ -3400,7 +3510,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, nritems = btrfs_header_nritems(path->nodes[0]); process_slot: - no_quota = 1; if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(BTRFS_I(src)->root, path); if (ret < 0) @@ -3552,35 +3661,13 @@ process_slot: btrfs_set_file_extent_num_bytes(leaf, extent, datal); - /* - * We need to look up the roots that point at - * this bytenr and see if the new root does. If - * it does not we need to make sure we update - * quotas appropriately. - */ - if (disko && root != BTRFS_I(src)->root && - disko != last_disko) { - no_quota = check_ref(trans, root, - disko); - if (no_quota < 0) { - btrfs_abort_transaction(trans, - root, - ret); - btrfs_end_transaction(trans, - root); - ret = no_quota; - goto out; - } - } - if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, disko, diskl, 0, root->root_key.objectid, btrfs_ino(inode), - new_key.offset - datao, - no_quota); + new_key.offset - datao); if (ret) { btrfs_abort_transaction(trans, root, @@ -3594,21 +3681,6 @@ process_slot: } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; u64 trim = 0; - u64 aligned_end = 0; - - /* - * Don't copy an inline extent into an offset - * greater than zero. Having an inline extent - * at such an offset results in chaos as btrfs - * isn't prepared for such cases. Just skip - * this case for the same reasons as commented - * at btrfs_ioctl_clone(). - */ - if (last_dest_end > 0) { - ret = -EOPNOTSUPP; - btrfs_end_transaction(trans, root); - goto out; - } if (off > key.offset) { skip = off - key.offset; @@ -3626,42 +3698,22 @@ process_slot: size -= skip + trim; datal -= skip + trim; - aligned_end = ALIGN(new_key.offset + datal, - root->sectorsize); - ret = btrfs_drop_extents(trans, root, inode, - drop_start, - aligned_end, - 1); + ret = clone_copy_inline_extent(src, inode, + trans, path, + &new_key, + drop_start, + datal, + skip, size, buf); if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, - root, ret); - btrfs_end_transaction(trans, root); - goto out; - } - - ret = btrfs_insert_empty_item(trans, root, path, - &new_key, size); - if (ret) { - btrfs_abort_transaction(trans, root, - ret); + root, + ret); btrfs_end_transaction(trans, root); goto out; } - - if (skip) { - u32 start = - btrfs_file_extent_calc_inline_size(0); - memmove(buf+start, buf+start+skip, - datal); - } - leaf = path->nodes[0]; slot = path->slots[0]; - write_extent_buffer(leaf, buf, - btrfs_item_ptr_offset(leaf, slot), - size); - inode_add_bytes(inode, datal); } /* If we have an implicit hole (NO_HOLES feature). */ @@ -4814,7 +4866,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) /* update qgroup status and info */ err = btrfs_run_qgroups(trans, root->fs_info); if (err < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "failed to update qgroup status and info\n"); err = btrfs_end_transaction(trans, root); if (err && !ret) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index d7e6baf1b205..8077461fc56a 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -79,6 +79,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) write_lock(&eb->lock); WARN_ON(atomic_read(&eb->spinning_writers)); atomic_inc(&eb->spinning_writers); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_writers) && waitqueue_active(&eb->write_lock_wq)) wake_up(&eb->write_lock_wq); @@ -86,6 +89,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) BUG_ON(atomic_read(&eb->blocking_readers) == 0); read_lock(&eb->lock); atomic_inc(&eb->spinning_readers); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_readers) && waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); @@ -229,6 +235,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) } btrfs_assert_tree_read_locked(eb); WARN_ON(atomic_read(&eb->blocking_readers) == 0); + /* + * atomic_dec_and_test implies a barrier for waitqueue_active + */ if (atomic_dec_and_test(&eb->blocking_readers) && waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); @@ -280,6 +289,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb) if (blockers) { WARN_ON(atomic_read(&eb->spinning_writers)); atomic_dec(&eb->blocking_writers); + /* + * Make sure counter is updated before we wake up waiters. + */ smp_mb(); if (waitqueue_active(&eb->write_lock_wq)) wake_up(&eb->write_lock_wq); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 52170cf1757e..8c27292ea9ea 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -345,6 +345,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Implicit memory barrier after test_and_set_bit + */ if (waitqueue_active(&entry->wait)) wake_up(&entry->wait); } else { @@ -409,6 +412,9 @@ have_entry: if (entry->bytes_left == 0) { ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + /* + * Implicit memory barrier after test_and_set_bit + */ if (waitqueue_active(&entry->wait)) wake_up(&entry->wait); } else { @@ -484,15 +490,16 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, spin_lock_irq(&log->log_extents_lock[index]); while (!list_empty(&log->logged_list[index])) { + struct inode *inode; ordered = list_first_entry(&log->logged_list[index], struct btrfs_ordered_extent, log_list); list_del_init(&ordered->log_list); + inode = ordered->inode; spin_unlock_irq(&log->log_extents_lock[index]); if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { - struct inode *inode = ordered->inode; u64 start = ordered->file_offset; u64 end = ordered->file_offset + ordered->len - 1; @@ -503,20 +510,25 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, &ordered->flags)); /* - * If our ordered extent completed it means it updated the - * fs/subvol and csum trees already, so no need to make the - * current transaction's commit wait for it, as we end up - * holding memory unnecessarily and delaying the inode's iput - * until the transaction commit (we schedule an iput for the - * inode when the ordered extent's refcount drops to 0), which - * prevents it from being evictable until the transaction - * commits. + * In order to keep us from losing our ordered extent + * information when committing the transaction we have to make + * sure that any logged extents are completed when we go to + * commit the transaction. To do this we simply increase the + * current transactions pending_ordered counter and decrement it + * when the ordered extent completes. */ - if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) - btrfs_put_ordered_extent(ordered); - else - list_add_tail(&ordered->trans_list, &trans->ordered); - + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + struct btrfs_ordered_inode_tree *tree; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock_irq(&tree->lock); + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); + atomic_inc(&trans->transaction->pending_ordered); + } + spin_unlock_irq(&tree->lock); + } + btrfs_put_ordered_extent(ordered); spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); @@ -578,6 +590,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = BTRFS_I(inode)->root; struct rb_node *node; + bool dec_pending_ordered = false; tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); @@ -587,8 +600,37 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags)) + dec_pending_ordered = true; spin_unlock_irq(&tree->lock); + /* + * The current running transaction is waiting on us, we need to let it + * know that we're complete and wake it up. + */ + if (dec_pending_ordered) { + struct btrfs_transaction *trans; + + /* + * The checks for trans are just a formality, it should be set, + * but if it isn't we don't want to deref/assert under the spin + * lock, so be nice and check if trans is set, but ASSERT() so + * if it isn't set a developer will notice. + */ + spin_lock(&root->fs_info->trans_lock); + trans = root->fs_info->running_transaction; + if (trans) + atomic_inc(&trans->use_count); + spin_unlock(&root->fs_info->trans_lock); + + ASSERT(trans); + if (trans) { + if (atomic_dec_and_test(&trans->pending_ordered)) + wake_up(&trans->pending_wait); + btrfs_put_transaction(trans); + } + } + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 7176cc0fe43f..23c96059cef2 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -73,6 +73,8 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent * in the logging code. */ +#define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to + * complete in the current transaction. */ struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index dca137b04095..f9e60231f685 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -49,18 +49,16 @@ static struct prop_handler prop_handlers[] = { .extract = prop_compression_extract, .inheritable = 1 }, - { - .xattr_name = NULL - } }; void __init btrfs_props_init(void) { - struct prop_handler *p; + int i; hash_init(prop_handlers_ht); - for (p = &prop_handlers[0]; p->xattr_name; p++) { + for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { + struct prop_handler *p = &prop_handlers[i]; u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); hash_add(prop_handlers_ht, &p->node, h); @@ -301,15 +299,16 @@ static int inherit_props(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *parent) { - const struct prop_handler *h; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; + int i; if (!test_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(parent)->runtime_flags)) return 0; - for (h = &prop_handlers[0]; h->xattr_name; h++) { + for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { + const struct prop_handler *h = &prop_handlers[i]; const char *value; u64 num_bytes; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index d904ee1c5349..46476c226395 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1652,10 +1652,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, } } - /* For exclusive extent, free its reserved bytes too */ - if (nr_old_roots == 0 && nr_new_roots == 1 && - cur_new_count == nr_new_roots) - qg->reserved -= num_bytes; if (dirty) qgroup_dirty(fs_info, qg); } @@ -2035,7 +2031,7 @@ out: return ret; } -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) +static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; @@ -2116,14 +2112,13 @@ out: return ret; } -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) +void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; - struct btrfs_fs_info *fs_info = root->fs_info; struct ulist_node *unode; struct ulist_iterator uiter; - u64 ref_root = root->root_key.objectid; int ret = 0; if (!is_fstree(ref_root)) @@ -2169,6 +2164,11 @@ out: spin_unlock(&fs_info->qgroup_lock); } +static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes) +{ + return btrfs_qgroup_free_refroot(root->fs_info, root->objectid, + num_bytes); +} void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) { if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) @@ -2188,10 +2188,10 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, - struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans) { struct btrfs_key found; + struct extent_buffer *scratch_leaf = NULL; struct ulist *roots = NULL; struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; @@ -2229,7 +2229,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); - memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf)); + scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); + if (!scratch_leaf) { + ret = -ENOMEM; + mutex_unlock(&fs_info->qgroup_rescan_lock); + goto out; + } + extent_buffer_get(scratch_leaf); + btrfs_tree_read_lock(scratch_leaf); + btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK); slot = path->slots[0]; btrfs_release_path(path); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -2255,6 +2263,10 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, goto out; } out: + if (scratch_leaf) { + btrfs_tree_read_unlock_blocking(scratch_leaf); + free_extent_buffer(scratch_leaf); + } btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); return ret; @@ -2266,16 +2278,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; int ret = 0; path = btrfs_alloc_path(); if (!path) goto out; - scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); - if (!scratch_leaf) - goto out; err = 0; while (!err) { @@ -2287,8 +2295,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) if (!fs_info->quota_enabled) { err = -EINTR; } else { - err = qgroup_rescan_leaf(fs_info, path, trans, - scratch_leaf); + err = qgroup_rescan_leaf(fs_info, path, trans); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2297,7 +2304,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) } out: - kfree(scratch_leaf); btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); @@ -2486,3 +2492,190 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) btrfs_queue_work(fs_info->qgroup_rescan_workers, &fs_info->qgroup_rescan_work); } + +/* + * Reserve qgroup space for range [start, start + len). + * + * This function will either reserve space from related qgroups or doing + * nothing if the range is already reserved. + * + * Return 0 for successful reserve + * Return <0 for error (including -EQUOT) + * + * NOTE: this function may sleep for memory allocation. + */ +int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_changeset changeset; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) || + len == 0) + return 0; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start, + start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, + &changeset); + trace_btrfs_qgroup_reserve_data(inode, start, len, + changeset.bytes_changed, + QGROUP_RESERVE); + if (ret < 0) + goto cleanup; + ret = qgroup_reserve(root, changeset.bytes_changed); + if (ret < 0) + goto cleanup; + + ulist_free(changeset.range_changed); + return ret; + +cleanup: + /* cleanup already reserved ranges */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(changeset.range_changed, &uiter))) + clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, + unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL, + GFP_NOFS); + ulist_free(changeset.range_changed); + return ret; +} + +static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, + int free) +{ + struct extent_changeset changeset; + int trace_op = QGROUP_RELEASE; + int ret; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + if (!changeset.range_changed) + return -ENOMEM; + + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start, + start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS, + &changeset); + if (ret < 0) + goto out; + + if (free) { + qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed); + trace_op = QGROUP_FREE; + } + trace_btrfs_qgroup_release_data(inode, start, len, + changeset.bytes_changed, trace_op); +out: + ulist_free(changeset.range_changed); + return ret; +} + +/* + * Free a reserved space range from io_tree and related qgroups + * + * Should be called when a range of pages get invalidated before reaching disk. + * Or for error cleanup case. + * + * For data written to disk, use btrfs_qgroup_release_data(). + * + * NOTE: This function may sleep for memory allocation. + */ +int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len) +{ + return __btrfs_qgroup_release_data(inode, start, len, 1); +} + +/* + * Release a reserved space range from io_tree only. + * + * Should be called when a range of pages get written to disk and corresponding + * FILE_EXTENT is inserted into corresponding root. + * + * Since new qgroup accounting framework will only update qgroup numbers at + * commit_transaction() time, its reserved space shouldn't be freed from + * related qgroups. + * + * But we should release the range from io_tree, to allow further write to be + * COWed. + * + * NOTE: This function may sleep for memory allocation. + */ +int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) +{ + return __btrfs_qgroup_release_data(inode, start, len, 0); +} + +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes) +{ + int ret; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) || + num_bytes == 0) + return 0; + + BUG_ON(num_bytes != round_down(num_bytes, root->nodesize)); + ret = qgroup_reserve(root, num_bytes); + if (ret < 0) + return ret; + atomic_add(num_bytes, &root->qgroup_meta_rsv); + return ret; +} + +void btrfs_qgroup_free_meta_all(struct btrfs_root *root) +{ + int reserved; + + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid)) + return; + + reserved = atomic_xchg(&root->qgroup_meta_rsv, 0); + if (reserved == 0) + return; + qgroup_free(root, reserved); +} + +void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) +{ + if (!root->fs_info->quota_enabled || !is_fstree(root->objectid)) + return; + + BUG_ON(num_bytes != round_down(num_bytes, root->nodesize)); + WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes); + atomic_sub(num_bytes, &root->qgroup_meta_rsv); + qgroup_free(root, num_bytes); +} + +/* + * Check qgroup reserved space leaking, normally at destory inode + * time + */ +void btrfs_qgroup_check_reserved_leak(struct inode *inode) +{ + struct extent_changeset changeset; + struct ulist_node *unode; + struct ulist_iterator iter; + int ret; + + changeset.bytes_changed = 0; + changeset.range_changed = ulist_alloc(GFP_NOFS); + if (WARN_ON(!changeset.range_changed)) + return; + + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset); + + WARN_ON(ret < 0); + if (WARN_ON(changeset.bytes_changed)) { + ULIST_ITER_INIT(&iter); + while ((unode = ulist_next(changeset.range_changed, &iter))) { + btrfs_warn(BTRFS_I(inode)->root->fs_info, + "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu", + inode->i_ino, unode->val, unode->aux); + } + qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed); + } + ulist_free(changeset.range_changed); +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 6387dcfa354c..ecb2c143ef75 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -33,6 +33,13 @@ struct btrfs_qgroup_extent_record { struct ulist *old_roots; }; +/* + * For qgroup event trace points only + */ +#define QGROUP_RESERVE (1<<0) +#define QGROUP_RELEASE (1<<1) +#define QGROUP_FREE (1<<2) + int btrfs_quota_enable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_quota_disable(struct btrfs_trans_handle *trans, @@ -71,9 +78,18 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, struct btrfs_qgroup_inherit *inherit); -int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); -void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); - +void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes); +/* + * TODO: Add proper trace point for it, as btrfs_qgroup_free() is + * called by everywhere, can't provide good trace for delayed ref case. + */ +static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes) +{ + btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); + trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes); +} void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -81,4 +97,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, u64 rfer, u64 excl); #endif +/* New io_tree based accurate qgroup reserve API */ +int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len); +int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len); + +int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes); +void btrfs_qgroup_free_meta_all(struct btrfs_root *root); +void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes); +void btrfs_qgroup_check_reserved_leak(struct inode *inode); #endif /* __BTRFS_QGROUP__ */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fcf7265ca46f..1a33d3eb36de 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -810,7 +810,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } goto done_nolock; - } else if (waitqueue_active(&h->wait)) { + /* + * The barrier for this waitqueue_active is not needed, + * we're protected by h->lock and can't miss a wakeup. + */ + } else if (waitqueue_active(&h->wait)) { spin_unlock(&rbio->bio_list_lock); spin_unlock_irqrestore(&h->lock, flags); wake_up(&h->wait); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 4645cd16d5ba..619f92963e27 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -569,7 +569,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical, rec = kzalloc(sizeof(*rec), GFP_NOFS); if (!rec) { reada_extent_put(root->fs_info, re); - return -1; + return -ENOMEM; } rec->rc = rc; @@ -918,6 +918,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, u64 start; u64 generation; int level; + int ret; struct extent_buffer *node; static struct btrfs_key max_key = { .objectid = (u64)-1, @@ -943,9 +944,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, generation = btrfs_header_generation(node); free_extent_buffer(node); - if (reada_add_block(rc, start, &max_key, level, generation)) { + ret = reada_add_block(rc, start, &max_key, level, generation); + if (ret) { kfree(rc); - return ERR_PTR(-ENOMEM); + return ERR_PTR(ret); } reada_start_machine(root->fs_info); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 303babeef505..b4ca5454ef1a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1716,7 +1716,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, new_bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); + key.objectid, key.offset); if (ret) { btrfs_abort_transaction(trans, root, ret); break; @@ -1724,7 +1724,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, ret = btrfs_free_extent(trans, root, bytenr, num_bytes, parent, btrfs_header_owner(leaf), - key.objectid, key.offset, 1); + key.objectid, key.offset); if (ret) { btrfs_abort_transaction(trans, root, ret); break; @@ -1900,23 +1900,21 @@ again: ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); + src->root_key.objectid, level - 1, 0); BUG_ON(ret); ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0, 1); + 0); BUG_ON(ret); ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, path->nodes[level]->start, - src->root_key.objectid, level - 1, 0, - 1); + src->root_key.objectid, level - 1, 0); BUG_ON(ret); ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, - 0, 1); + 0); BUG_ON(ret); btrfs_unlock_up_safe(path, 0); @@ -2418,7 +2416,7 @@ again: } out: if (ret) { - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); @@ -2745,7 +2743,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), - node->level, 0, 1); + node->level, 0); BUG_ON(ret); ret = btrfs_drop_subtree(trans, root, eb, upper->eb); @@ -3034,8 +3032,8 @@ int prealloc_file_extent_cluster(struct inode *inode, BUG_ON(cluster->start != cluster->boundary[0]); mutex_lock(&inode->i_mutex); - ret = btrfs_check_data_free_space(inode, cluster->end + - 1 - cluster->start, 0); + ret = btrfs_check_data_free_space(inode, cluster->start, + cluster->end + 1 - cluster->start); if (ret) goto out; @@ -3056,8 +3054,8 @@ int prealloc_file_extent_cluster(struct inode *inode, break; nr++; } - btrfs_free_reserved_data_space(inode, cluster->end + - 1 - cluster->start); + btrfs_free_reserved_data_space(inode, cluster->start, + cluster->end + 1 - cluster->start); out: mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 360a728a639f..7cf8509deda7 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -45,12 +45,13 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, if (!need_reset && btrfs_root_generation(item) != btrfs_root_generation_v2(item)) { if (btrfs_root_generation_v2(item) != 0) { - printk(KERN_WARNING "BTRFS: mismatching " + btrfs_warn(eb->fs_info, + "mismatching " "generation and generation_v2 " "found in root item. This root " "was probably mounted with an " "older kernel. Resetting all " - "new fields.\n"); + "new fields."); } need_reset = 1; } @@ -141,7 +142,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root int ret; int slot; unsigned long ptr; - int old_len; + u32 old_len; path = btrfs_alloc_path(); if (!path) @@ -283,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); - btrfs_error(tree_root->fs_info, err, + btrfs_std_error(tree_root->fs_info, err, "Failed to start trans to delete " "orphan item"); break; @@ -292,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) root_key.objectid); btrfs_end_transaction(trans, tree_root); if (err) { - btrfs_error(tree_root->fs_info, err, + btrfs_std_error(tree_root->fs_info, err, "Failed to delete root orphan " "item"); break; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a39f5d1144e8..550de89a8661 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -580,9 +580,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, * hold all of the paths here */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) - printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " + btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu, " - "length %llu, links %u (path: %s)\n", swarn->errstr, + "length %llu, links %u (path: %s)", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, @@ -592,9 +592,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, return 0; err: - printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " + btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev " "%s, sector %llu, root %llu, inode %llu, offset %llu: path " - "resolving failed with ret=%d\n", swarn->errstr, + "resolving failed with ret=%d", swarn->errstr, swarn->logical, rcu_str_deref(swarn->dev->name), (unsigned long long)swarn->sector, root, inum, offset, ret); @@ -649,10 +649,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); - printk_in_rcu(KERN_WARNING - "BTRFS: %s at logical %llu on dev %s, " + btrfs_warn_in_rcu(fs_info, + "%s at logical %llu on dev %s, " "sector %llu: metadata %s (level %d) in tree " - "%llu\n", errstr, swarn.logical, + "%llu", errstr, swarn.logical, rcu_str_deref(dev->name), (unsigned long long)swarn.sector, ref_level ? "node" : "leaf", @@ -850,8 +850,8 @@ out: btrfs_dev_replace_stats_inc( &sctx->dev_root->fs_info->dev_replace. num_uncorrectable_read_errors); - printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " - "unable to fixup (nodatasum) error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(sctx->dev_root->fs_info, + "unable to fixup (nodatasum) error at logical %llu on dev %s", fixup->logical, rcu_str_deref(fixup->dev->name)); } @@ -1230,8 +1230,8 @@ corrected_error: sctx->stat.corrected_errors++; sblock_to_check->data_corrected = 1; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: fixed up error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "fixed up error at logical %llu on dev %s", logical, rcu_str_deref(dev->name)); } } else { @@ -1239,8 +1239,8 @@ did_not_correct_error: spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "unable to fixup (regular) error at logical %llu on dev %s", logical, rcu_str_deref(dev->name)); } @@ -1626,9 +1626,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, int ret; if (!page_bad->dev->bdev) { - printk_ratelimited(KERN_WARNING "BTRFS: " + btrfs_warn_rl(sblock_bad->sctx->dev_root->fs_info, "scrub_repair_page_from_good_copy(bdev == NULL) " - "is unexpected!\n"); + "is unexpected"); return -EIO; } @@ -2201,15 +2201,15 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) spin_lock(&sctx->stat_lock); sctx->stat.read_errors++; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: I/O error rebulding logical %llu for dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "IO error rebuilding logical %llu for dev %s", logical, rcu_str_deref(dev->name)); } else if (sblock->header_error || sblock->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - printk_ratelimited_in_rcu(KERN_ERR - "BTRFS: failed to rebuild valid logical %llu for dev %s\n", + btrfs_err_rl_in_rcu(fs_info, + "failed to rebuild valid logical %llu for dev %s", logical, rcu_str_deref(dev->name)); } else { scrub_write_block_to_dev_replace(sblock); @@ -4375,8 +4375,8 @@ static int write_page_nocow(struct scrub_ctx *sctx, if (!dev) return -EIO; if (!dev->bdev) { - printk_ratelimited(KERN_WARNING - "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); + btrfs_warn_rl(dev->dev_root->fs_info, + "scrub write_page_nocow(bdev == NULL) is unexpected"); return -EIO; } bio = btrfs_io_bio_alloc(GFP_NOFS, 1); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a739b825bdd3..355a458cba1a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1434,16 +1434,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, " } if (cur_clone_root) { - if (compressed != BTRFS_COMPRESS_NONE) { - /* - * Offsets given by iterate_extent_inodes() are relative - * to the start of the extent, we need to add logical - * offset from the file extent item. - * (See why at backref.c:check_extent_in_eb()) - */ - cur_clone_root->offset += btrfs_file_extent_offset(eb, - fi); - } *found = cur_clone_root; ret = 0; } else { @@ -2353,8 +2343,14 @@ static int send_subvol_begin(struct send_ctx *sctx) } TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); - TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, - sctx->send_root->root_item.uuid); + + if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, + sctx->send_root->root_item.uuid); + TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { @@ -2564,7 +2560,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino); } else if (S_ISSOCK(mode)) { cmd = BTRFS_SEND_C_MKSOCK; } else { - printk(KERN_WARNING "btrfs: unexpected inode type %o", + btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o", (int)(mode & S_IFMT)); ret = -ENOTSUPP; goto out; @@ -4687,6 +4683,171 @@ tlv_put_failure: return ret; } +static int send_extent_data(struct send_ctx *sctx, + const u64 offset, + const u64 len) +{ + u64 sent = 0; + + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) + return send_update_extent(sctx, offset, len); + + while (sent < len) { + u64 size = len - sent; + int ret; + + if (size > BTRFS_SEND_READ_SIZE) + size = BTRFS_SEND_READ_SIZE; + ret = send_write(sctx, offset + sent, size); + if (ret < 0) + return ret; + if (!ret) + break; + sent += ret; + } + return 0; +} + +static int clone_range(struct send_ctx *sctx, + struct clone_root *clone_root, + const u64 disk_byte, + u64 data_offset, + u64 offset, + u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + path = alloc_path_for_send(); + if (!path) + return -ENOMEM; + + /* + * We can't send a clone operation for the entire range if we find + * extent items in the respective range in the source file that + * refer to different extents or if we find holes. + * So check for that and do a mix of clone and regular write/copy + * operations if needed. + * + * Example: + * + * mkfs.btrfs -f /dev/sda + * mount /dev/sda /mnt + * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo + * cp --reflink=always /mnt/foo /mnt/bar + * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo + * btrfs subvolume snapshot -r /mnt /mnt/snap + * + * If when we send the snapshot and we are processing file bar (which + * has a higher inode number than foo) we blindly send a clone operation + * for the [0, 100K[ range from foo to bar, the receiver ends up getting + * a file bar that matches the content of file foo - iow, doesn't match + * the content from bar in the original filesystem. + */ + key.objectid = clone_root->ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = clone_root->offset; + ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0 && path->slots[0] > 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); + if (key.objectid == clone_root->ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + struct btrfs_file_extent_item *ei; + u8 type; + u64 ext_len; + u64 clone_len; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(clone_root->root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + + /* + * We might have an implicit trailing hole (NO_HOLES feature + * enabled). We deal with it after leaving this loop. + */ + if (key.objectid != clone_root->ino || + key.type != BTRFS_EXTENT_DATA_KEY) + break; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + type = btrfs_file_extent_type(leaf, ei); + if (type == BTRFS_FILE_EXTENT_INLINE) { + ext_len = btrfs_file_extent_inline_len(leaf, slot, ei); + ext_len = PAGE_CACHE_ALIGN(ext_len); + } else { + ext_len = btrfs_file_extent_num_bytes(leaf, ei); + } + + if (key.offset + ext_len <= clone_root->offset) + goto next; + + if (key.offset > clone_root->offset) { + /* Implicit hole, NO_HOLES feature enabled. */ + u64 hole_len = key.offset - clone_root->offset; + + if (hole_len > len) + hole_len = len; + ret = send_extent_data(sctx, offset, hole_len); + if (ret < 0) + goto out; + + len -= hole_len; + if (len == 0) + break; + offset += hole_len; + clone_root->offset += hole_len; + data_offset += hole_len; + } + + if (key.offset >= clone_root->offset + len) + break; + + clone_len = min_t(u64, ext_len, len); + + if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte && + btrfs_file_extent_offset(leaf, ei) == data_offset) + ret = send_clone(sctx, offset, clone_len, clone_root); + else + ret = send_extent_data(sctx, offset, clone_len); + + if (ret < 0) + goto out; + + len -= clone_len; + if (len == 0) + break; + offset += clone_len; + clone_root->offset += clone_len; + data_offset += clone_len; +next: + path->slots[0]++; + } + + if (len > 0) + ret = send_extent_data(sctx, offset, len); + else + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_path *path, struct btrfs_key *key, @@ -4695,9 +4856,7 @@ static int send_write_or_clone(struct send_ctx *sctx, int ret = 0; struct btrfs_file_extent_item *ei; u64 offset = key->offset; - u64 pos = 0; u64 len; - u32 l; u8 type; u64 bs = sctx->send_root->fs_info->sb->s_blocksize; @@ -4725,22 +4884,15 @@ static int send_write_or_clone(struct send_ctx *sctx, } if (clone_root && IS_ALIGNED(offset + len, bs)) { - ret = send_clone(sctx, offset, len, clone_root); - } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { - ret = send_update_extent(sctx, offset, len); + u64 disk_byte; + u64 data_offset; + + disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); + data_offset = btrfs_file_extent_offset(path->nodes[0], ei); + ret = clone_range(sctx, clone_root, disk_byte, data_offset, + offset, len); } else { - while (pos < len) { - l = len - pos; - if (l > BTRFS_SEND_READ_SIZE) - l = BTRFS_SEND_READ_SIZE; - ret = send_write(sctx, pos + offset, l); - if (ret < 0) - goto out; - if (!ret) - break; - pos += ret; - } - ret = 0; + ret = send_extent_data(sctx, offset, len); } out: return ret; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 11d1eab9234d..24154e422945 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -130,7 +130,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) } } -#ifdef CONFIG_PRINTK /* * __btrfs_std_error decodes expected errors from the caller and * invokes the approciate error response. @@ -140,7 +139,9 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { struct super_block *sb = fs_info->sb; +#ifdef CONFIG_PRINTK const char *errstr; +#endif /* * Special case: if the error is EROFS, and we're already @@ -149,6 +150,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) return; +#ifdef CONFIG_PRINTK errstr = btrfs_decode_error(errno); if (fmt) { struct va_format vaf; @@ -166,6 +168,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n", sb->s_id, function, line, errno, errstr); } +#endif /* Don't go through full error handling during mount */ save_error_info(fs_info); @@ -173,6 +176,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, btrfs_handle_error(fs_info); } +#ifdef CONFIG_PRINTK static const char * const logtypes[] = { "emergency", "alert", @@ -212,27 +216,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) va_end(args); } - -#else - -void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; - - /* - * Special case: if the error is EROFS, and we're already - * under MS_RDONLY, then it is safe here. - */ - if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) - return; - - /* Don't go through full error handling during mount */ - if (sb->s_flags & MS_BORN) { - save_error_info(fs_info); - btrfs_handle_error(fs_info); - } -} #endif /* @@ -320,6 +303,9 @@ enum { Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, Opt_datasum, Opt_treelog, Opt_noinode_cache, +#ifdef CONFIG_BTRFS_DEBUG + Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, +#endif Opt_err, }; @@ -372,6 +358,11 @@ static match_table_t tokens = { {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, {Opt_fatal_errors, "fatal_errors=%s"}, {Opt_commit_interval, "commit=%d"}, +#ifdef CONFIG_BTRFS_DEBUG + {Opt_fragment_data, "fragment=data"}, + {Opt_fragment_metadata, "fragment=metadata"}, + {Opt_fragment_all, "fragment=all"}, +#endif {Opt_err, NULL}, }; @@ -738,6 +729,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; } break; +#ifdef CONFIG_BTRFS_DEBUG + case Opt_fragment_all: + btrfs_info(root->fs_info, "fragmenting all space"); + btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA); + break; + case Opt_fragment_metadata: + btrfs_info(root->fs_info, "fragmenting metadata"); + btrfs_set_opt(info->mount_opt, + FRAGMENT_METADATA); + break; + case Opt_fragment_data: + btrfs_info(root->fs_info, "fragmenting data"); + btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + break; +#endif case Opt_err: btrfs_info(root->fs_info, "unrecognized mount option '%s'", p); ret = -EINVAL; @@ -1189,6 +1196,12 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); +#ifdef CONFIG_BTRFS_DEBUG + if (btrfs_test_opt(root, FRAGMENT_DATA)) + seq_puts(seq, ",fragment=data"); + if (btrfs_test_opt(root, FRAGMENT_METADATA)) + seq_puts(seq, ",fragment=metadata"); +#endif seq_printf(seq, ",subvolid=%llu", BTRFS_I(d_inode(dentry))->root->root_key.objectid); seq_puts(seq, ",subvol="); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 603b0cc2b9bb..e0ac85949067 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -437,24 +437,24 @@ static const struct attribute *btrfs_attrs[] = { NULL, }; -static void btrfs_release_super_kobj(struct kobject *kobj) +static void btrfs_release_fsid_kobj(struct kobject *kobj) { struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); - memset(&fs_devs->super_kobj, 0, sizeof(struct kobject)); + memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject)); complete(&fs_devs->kobj_unregister); } static struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, - .release = btrfs_release_super_kobj, + .release = btrfs_release_fsid_kobj, }; static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; - return container_of(kobj, struct btrfs_fs_devices, super_kobj); + return container_of(kobj, struct btrfs_fs_devices, fsid_kobj); } static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) @@ -502,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) attrs[0] = &fa->kobj_attr.attr; if (add) { int ret; - ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj, + ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj, &agroup); if (ret) return ret; } else - sysfs_unmerge_group(&fs_info->fs_devices->super_kobj, + sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj, &agroup); } @@ -523,9 +523,9 @@ static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) fs_devs->device_dir_kobj = NULL; } - if (fs_devs->super_kobj.state_initialized) { - kobject_del(&fs_devs->super_kobj); - kobject_put(&fs_devs->super_kobj); + if (fs_devs->fsid_kobj.state_initialized) { + kobject_del(&fs_devs->fsid_kobj); + kobject_put(&fs_devs->fsid_kobj); wait_for_completion(&fs_devs->kobj_unregister); } } @@ -545,7 +545,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) } } -void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { btrfs_reset_fs_info_ptr(fs_info); @@ -555,9 +555,9 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) kobject_put(fs_info->space_info_kobj); } addrm_unknown_feature_attrs(fs_info, false); - sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); - sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); - btrfs_kobj_rm_device(fs_info->fs_devices, NULL); + sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL); } const char * const btrfs_feature_set_names[3] = { @@ -637,7 +637,7 @@ static void init_feature_attrs(void) /* when one_device is NULL, it removes all device links */ -int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { struct hd_struct *disk; @@ -675,7 +675,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) { if (!fs_devs->device_dir_kobj) fs_devs->device_dir_kobj = kobject_create_and_add("devices", - &fs_devs->super_kobj); + &fs_devs->fsid_kobj); if (!fs_devs->device_dir_kobj) return -ENOMEM; @@ -683,7 +683,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) return 0; } -int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { int error = 0; @@ -730,31 +730,31 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, int error; init_completion(&fs_devs->kobj_unregister); - fs_devs->super_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_devs->super_kobj, + fs_devs->fsid_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, parent, "%pU", fs_devs->fsid); return error; } -int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) +int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) { int error; struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; - struct kobject *super_kobj = &fs_devs->super_kobj; + struct kobject *fsid_kobj = &fs_devs->fsid_kobj; btrfs_set_fs_info_ptr(fs_info); - error = btrfs_kobj_add_device(fs_devs, NULL); + error = btrfs_sysfs_add_device_link(fs_devs, NULL); if (error) return error; - error = sysfs_create_files(super_kobj, btrfs_attrs); + error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { - btrfs_kobj_rm_device(fs_devs, NULL); + btrfs_sysfs_rm_device_link(fs_devs, NULL); return error; } - error = sysfs_create_group(super_kobj, + error = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); if (error) goto failure; @@ -764,7 +764,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", - super_kobj); + fsid_kobj); if (!fs_info->space_info_kobj) { error = -ENOMEM; goto failure; @@ -776,7 +776,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) return 0; failure: - btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_mounted(fs_info); return error; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 6392527bcc15..9c09522125a6 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -82,9 +82,9 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; -int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, +int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, struct kobject *parent); diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 2299bfde39ee..c8c3d70c31ff 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -19,6 +19,7 @@ #include <linux/slab.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../disk-io.h" #include "../free-space-cache.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) @@ -35,6 +36,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void) kfree(cache); return NULL; } + cache->fs_info = btrfs_alloc_dummy_fs_info(); + if (!cache->fs_info) { + kfree(cache->free_space_ctl); + kfree(cache); + return NULL; + } cache->key.objectid = 0; cache->key.offset = 1024 * 1024 * 1024; @@ -879,7 +886,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) int btrfs_test_free_space_cache(void) { struct btrfs_block_group_cache *cache; - int ret; + struct btrfs_root *root = NULL; + int ret = -ENOMEM; test_msg("Running btrfs free space cache tests\n"); @@ -889,6 +897,17 @@ int btrfs_test_free_space_cache(void) return 0; } + root = btrfs_alloc_dummy_root(); + if (!root) + goto out; + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) + goto out; + + root->fs_info->extent_root = root; + cache->fs_info = root->fs_info; + ret = test_extents(cache); if (ret) goto out; @@ -904,6 +923,7 @@ out: __btrfs_remove_free_space_cache(cache->free_space_ctl); kfree(cache->free_space_ctl); kfree(cache); + btrfs_free_dummy_root(root); test_msg("Free space cache tests finished\n"); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a5b06442f0bf..418c6a2ad7d8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -82,6 +82,12 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) static void clear_btree_io_tree(struct extent_io_tree *tree) { spin_lock(&tree->lock); + /* + * Do a single barrier for the waitqueue_active check here, the state + * of the waitqueue should not change once clear_btree_io_tree is + * called. + */ + smp_mb(); while (!RB_EMPTY_ROOT(&tree->state)) { struct rb_node *node; struct extent_state *state; @@ -226,25 +232,22 @@ loop: extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); + init_waitqueue_head(&cur_trans->pending_wait); cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we * commit the transaction. */ atomic_set(&cur_trans->use_count, 2); - cur_trans->have_free_bgs = 0; + atomic_set(&cur_trans->pending_ordered, 0); + cur_trans->flags = 0; cur_trans->start_time = get_seconds(); - cur_trans->dirty_bg_run = 0; + + memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); cur_trans->delayed_refs.href_root = RB_ROOT; cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); - cur_trans->delayed_refs.num_heads_ready = 0; - cur_trans->delayed_refs.pending_csums = 0; - cur_trans->delayed_refs.num_heads = 0; - cur_trans->delayed_refs.flushing = 0; - cur_trans->delayed_refs.run_delayed_start = 0; - cur_trans->delayed_refs.qgroup_to_skip = 0; /* * although the tree mod log is per file system and not per transaction, @@ -264,7 +267,6 @@ loop: INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->pending_chunks); INIT_LIST_HEAD(&cur_trans->switch_commits); - INIT_LIST_HEAD(&cur_trans->pending_ordered); INIT_LIST_HEAD(&cur_trans->dirty_bgs); INIT_LIST_HEAD(&cur_trans->io_bgs); INIT_LIST_HEAD(&cur_trans->dropped_roots); @@ -447,8 +449,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root) } static struct btrfs_trans_handle * -start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, - enum btrfs_reserve_flush_enum flush) +start_transaction(struct btrfs_root *root, unsigned int num_items, + unsigned int type, enum btrfs_reserve_flush_enum flush) { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; @@ -478,13 +480,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, * the appropriate flushing if need be. */ if (num_items > 0 && root != root->fs_info->chunk_root) { - if (root->fs_info->quota_enabled && - is_fstree(root->root_key.objectid)) { - qgroup_reserved = num_items * root->nodesize; - ret = btrfs_qgroup_reserve(root, qgroup_reserved); - if (ret) - return ERR_PTR(ret); - } + qgroup_reserved = num_items * root->nodesize; + ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved); + if (ret) + return ERR_PTR(ret); num_bytes = btrfs_calc_trans_metadata_size(root, num_items); /* @@ -502,7 +501,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, goto reserve_fail; } again: - h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) { ret = -ENOMEM; goto alloc_fail; @@ -543,26 +542,13 @@ again: h->transid = cur_trans->transid; h->transaction = cur_trans; - h->blocks_used = 0; - h->bytes_reserved = 0; - h->chunk_bytes_reserved = 0; h->root = root; - h->delayed_ref_updates = 0; h->use_count = 1; - h->adding_csums = 0; - h->block_rsv = NULL; - h->orig_rsv = NULL; - h->aborted = 0; - h->qgroup_reserved = 0; - h->delayed_ref_elem.seq = 0; + h->type = type; - h->allocating_chunk = false; h->can_flush_pending_bgs = true; - h->reloc_reserved = false; - h->sync = false; INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); - INIT_LIST_HEAD(&h->ordered); smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && @@ -579,7 +565,6 @@ again: h->bytes_reserved = num_bytes; h->reloc_reserved = reloc_reserved; } - h->qgroup_reserved = qgroup_reserved; got_it: btrfs_record_root_in_trans(h, root); @@ -597,20 +582,20 @@ alloc_fail: btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, num_bytes); reserve_fail: - if (qgroup_reserved) - btrfs_qgroup_free(root, qgroup_reserved); + btrfs_qgroup_free_meta(root, qgroup_reserved); return ERR_PTR(ret); } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_items) + unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, BTRFS_RESERVE_FLUSH_ALL); } struct btrfs_trans_handle *btrfs_start_transaction_lflush( - struct btrfs_root *root, int num_items) + struct btrfs_root *root, + unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, BTRFS_RESERVE_FLUSH_LIMIT); @@ -794,12 +779,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - if (!list_empty(&trans->ordered)) { - spin_lock(&info->trans_lock); - list_splice_init(&trans->ordered, &cur_trans->pending_ordered); - spin_unlock(&info->trans_lock); - } - trans->delayed_ref_updates = 0; if (!trans->sync) { must_run_delayed_refs = @@ -815,15 +794,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, must_run_delayed_refs = 2; } - if (trans->qgroup_reserved) { - /* - * the same root has to be passed here between start_transaction - * and end_transaction. Subvolume quota depends on this. - */ - btrfs_qgroup_free(trans->root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } - btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; @@ -856,6 +826,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, atomic_dec(&cur_trans->num_writers); extwriter_counter_dec(cur_trans, trans->type); + /* + * Make sure counter is updated before we wake up waiters. + */ smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) wake_up(&cur_trans->writer_wait); @@ -1238,6 +1211,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, spin_lock(&fs_info->fs_roots_radix_lock); if (err) break; + btrfs_qgroup_free_meta_all(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); @@ -1795,25 +1769,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) } static inline void -btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, - struct btrfs_fs_info *fs_info) +btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) { - struct btrfs_ordered_extent *ordered; - - spin_lock(&fs_info->trans_lock); - while (!list_empty(&cur_trans->pending_ordered)) { - ordered = list_first_entry(&cur_trans->pending_ordered, - struct btrfs_ordered_extent, - trans_list); - list_del_init(&ordered->trans_list); - spin_unlock(&fs_info->trans_lock); - - wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE, - &ordered->flags)); - btrfs_put_ordered_extent(ordered); - spin_lock(&fs_info->trans_lock); - } - spin_unlock(&fs_info->trans_lock); + wait_event(cur_trans->pending_wait, + atomic_read(&cur_trans->pending_ordered) == 0); } int btrfs_commit_transaction(struct btrfs_trans_handle *trans, @@ -1842,10 +1801,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); trans->block_rsv = NULL; - if (trans->qgroup_reserved) { - btrfs_qgroup_free(root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } cur_trans = trans->transaction; @@ -1865,7 +1820,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } - if (!cur_trans->dirty_bg_run) { + if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) { int run_it = 0; /* this mutex is also taken before trying to set @@ -1874,18 +1829,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * after a extents from that block group have been * allocated for cache files. btrfs_set_block_group_ro * will wait for the transaction to commit if it - * finds dirty_bg_run = 1 + * finds BTRFS_TRANS_DIRTY_BG_RUN set. * - * The dirty_bg_run flag is also used to make sure only - * one process starts all the block group IO. It wouldn't + * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure + * only one process starts all the block group IO. It wouldn't * hurt to have more than one go through, but there's no * real advantage to it either. */ mutex_lock(&root->fs_info->ro_block_group_mutex); - if (!cur_trans->dirty_bg_run) { + if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN, + &cur_trans->flags)) run_it = 1; - cur_trans->dirty_bg_run = 1; - } mutex_unlock(&root->fs_info->ro_block_group_mutex); if (run_it) @@ -1897,7 +1851,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } spin_lock(&root->fs_info->trans_lock); - list_splice_init(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); @@ -1956,7 +1909,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_wait_delalloc_flush(root->fs_info); - btrfs_wait_pending_ordered(cur_trans, root->fs_info); + btrfs_wait_pending_ordered(cur_trans); btrfs_scrub_pause(root); /* @@ -2136,7 +2089,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_transaction(trans, root); if (ret) { - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Error while writing out transaction"); mutex_unlock(&root->fs_info->tree_log_mutex); goto scrub_continue; @@ -2156,7 +2109,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root); - if (cur_trans->have_free_bgs) + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(root->fs_info); root->fs_info->last_trans_committed = cur_trans->transid; @@ -2198,10 +2151,6 @@ cleanup_transaction: btrfs_trans_release_metadata(trans, root); btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; - if (trans->qgroup_reserved) { - btrfs_qgroup_free(root, trans->qgroup_reserved); - trans->qgroup_reserved = 0; - } btrfs_warn(root->fs_info, "Skipping commit of aborted transaction."); if (current->journal_info == trans) current->journal_info = NULL; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index a994bb097ee5..b05b2f64d913 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -32,6 +32,10 @@ enum btrfs_trans_state { TRANS_STATE_MAX = 6, }; +#define BTRFS_TRANS_HAVE_FREE_BGS 0 +#define BTRFS_TRANS_DIRTY_BG_RUN 1 +#define BTRFS_TRANS_CACHE_ENOSPC 2 + struct btrfs_transaction { u64 transid; /* @@ -46,11 +50,9 @@ struct btrfs_transaction { */ atomic_t num_writers; atomic_t use_count; + atomic_t pending_ordered; - /* - * true if there is free bgs operations in this transaction - */ - int have_free_bgs; + unsigned long flags; /* Be protected by fs_info->trans_lock when we want to change it. */ enum btrfs_trans_state state; @@ -59,9 +61,9 @@ struct btrfs_transaction { unsigned long start_time; wait_queue_head_t writer_wait; wait_queue_head_t commit_wait; + wait_queue_head_t pending_wait; struct list_head pending_snapshots; struct list_head pending_chunks; - struct list_head pending_ordered; struct list_head switch_commits; struct list_head dirty_bgs; struct list_head io_bgs; @@ -80,7 +82,6 @@ struct btrfs_transaction { spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; - int dirty_bg_run; }; #define __TRANS_FREEZABLE (1U << 0) @@ -107,7 +108,6 @@ struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; u64 chunk_bytes_reserved; - u64 qgroup_reserved; unsigned long use_count; unsigned long blocks_reserved; unsigned long blocks_used; @@ -129,7 +129,6 @@ struct btrfs_trans_handle { */ struct btrfs_root *root; struct seq_list delayed_ref_elem; - struct list_head ordered; struct list_head qgroup_ref_list; struct list_head new_bgs; }; @@ -185,9 +184,10 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_items); + unsigned int num_items); struct btrfs_trans_handle *btrfs_start_transaction_lflush( - struct btrfs_root *root, int num_items); + struct btrfs_root *root, + unsigned int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1bbaace73383..323e12cc9d2f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -229,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root) void btrfs_end_log_trans(struct btrfs_root *root) { if (atomic_dec_and_test(&root->log_writers)) { - smp_mb(); + /* + * Implicit memory barrier after atomic_dec_and_test + */ if (waitqueue_active(&root->log_writer_wait)) wake_up(&root->log_writer_wait); } @@ -691,7 +693,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, 0, root->root_key.objectid, - key->objectid, offset, 0); + key->objectid, offset); if (ret) goto out; } else { @@ -2820,7 +2822,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { - smp_mb(); + /* + * Implicit memory barrier after atomic_dec_and_test + */ if (waitqueue_active(&log_root_tree->log_writer_wait)) wake_up(&log_root_tree->log_writer_wait); } @@ -2950,6 +2954,9 @@ out_wake_log_root: atomic_set(&log_root_tree->log_commit[index2], 0); mutex_unlock(&log_root_tree->log_mutex); + /* + * The barrier before waitqueue_active is implied by mutex_unlock + */ if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) wake_up(&log_root_tree->log_commit_wait[index2]); out: @@ -2961,6 +2968,9 @@ out: atomic_set(&root->log_commit[index1], 0); mutex_unlock(&root->log_mutex); + /* + * The barrier before waitqueue_active is implied by mutex_unlock + */ if (waitqueue_active(&root->log_commit_wait[index1])) wake_up(&root->log_commit_wait[index1]); return ret; @@ -5314,7 +5324,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_error(fs_info, ret, "Failed to pin buffers while " + btrfs_std_error(fs_info, ret, "Failed to pin buffers while " "recovering log root tree."); goto error; } @@ -5328,7 +5338,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_error(fs_info, ret, + btrfs_std_error(fs_info, ret, "Couldn't find tree log root."); goto error; } @@ -5346,7 +5356,7 @@ again: log = btrfs_read_fs_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_error(fs_info, ret, + btrfs_std_error(fs_info, ret, "Couldn't read tree log root."); goto error; } @@ -5361,7 +5371,7 @@ again: free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); - btrfs_error(fs_info, ret, "Couldn't read target root " + btrfs_std_error(fs_info, ret, "Couldn't read target root " "for tree log recovery."); goto error; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e023919b4470..9b2dafa5ba59 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -42,6 +42,82 @@ #include "dev-replace.h" #include "sysfs.h" +const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = { + .sub_stripes = 2, + .dev_stripes = 1, + .devs_max = 0, /* 0 == as many as possible */ + .devs_min = 4, + .tolerated_failures = 1, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_RAID1] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 2, + .devs_min = 2, + .tolerated_failures = 1, + .devs_increment = 2, + .ncopies = 2, + }, + [BTRFS_RAID_DUP] = { + .sub_stripes = 1, + .dev_stripes = 2, + .devs_max = 1, + .devs_min = 1, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID0] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_SINGLE] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 1, + .devs_min = 1, + .tolerated_failures = 0, + .devs_increment = 1, + .ncopies = 1, + }, + [BTRFS_RAID_RAID5] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 2, + .tolerated_failures = 1, + .devs_increment = 1, + .ncopies = 2, + }, + [BTRFS_RAID_RAID6] = { + .sub_stripes = 1, + .dev_stripes = 1, + .devs_max = 0, + .devs_min = 3, + .tolerated_failures = 2, + .devs_increment = 1, + .ncopies = 3, + }, +}; + +const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, + [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, + [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, + [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0, + [BTRFS_RAID_SINGLE] = 0, + [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5, + [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, +}; + static int init_first_rw_device(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_device *device); @@ -198,7 +274,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, if (IS_ERR(*bdev)) { ret = PTR_ERR(*bdev); - printk(KERN_INFO "BTRFS: open %s failed\n", device_path); goto error; } @@ -211,8 +286,8 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, } invalidate_bdev(*bdev); *bh = btrfs_read_dev_super(*bdev); - if (!*bh) { - ret = -EINVAL; + if (IS_ERR(*bh)) { + ret = PTR_ERR(*bh); blkdev_put(*bdev, flags); goto error; } @@ -345,6 +420,9 @@ loop_lock: pending = pending->bi_next; cur->bi_next = NULL; + /* + * atomic_dec_return implies a barrier for waitqueue_active + */ if (atomic_dec_return(&fs_info->nr_async_bios) < limit && waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); @@ -765,36 +843,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { - struct btrfs_device *new_device; - struct rcu_string *name; - - if (device->bdev) - fs_devices->open_devices--; - - if (device->writeable && - device->devid != BTRFS_DEV_REPLACE_DEVID) { - list_del_init(&device->dev_alloc_list); - fs_devices->rw_devices--; - } - - if (device->missing) - fs_devices->missing_devices--; - - new_device = btrfs_alloc_device(NULL, &device->devid, - device->uuid); - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ - - /* Safe because we are under uuid_mutex */ - if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(!name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); - } - - list_replace_rcu(&device->dev_list, &new_device->dev_list); - new_device->fs_devices = device->fs_devices; - - call_rcu(&device->rcu, free_device); + btrfs_close_one_device(device); } mutex_unlock(&fs_devices->device_list_mutex); @@ -1402,7 +1451,7 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - btrfs_error(root->fs_info, ret, "Slot search failed"); + btrfs_std_error(root->fs_info, ret, "Slot search failed"); goto out; } @@ -1410,10 +1459,10 @@ again: ret = btrfs_del_item(trans, root, path); if (ret) { - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to remove dev extent item"); } else { - trans->transaction->have_free_bgs = 1; + set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); } out: btrfs_free_path(path); @@ -1801,7 +1850,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) { device->fs_devices->open_devices--; /* remove sysfs entry */ - btrfs_kobj_rm_device(root->fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); } call_rcu(&device->rcu, free_device); @@ -1924,7 +1973,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, if (srcdev->writeable) { fs_devices->rw_devices--; /* zero out the old super if it is writable */ - btrfs_scratch_superblock(srcdev); + btrfs_scratch_superblocks(srcdev->bdev, + rcu_str_deref(srcdev->name)); } if (srcdev->bdev) @@ -1971,10 +2021,11 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); - btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); if (tgtdev->bdev) { - btrfs_scratch_superblock(tgtdev); + btrfs_scratch_superblocks(tgtdev->bdev, + rcu_str_deref(tgtdev->name)); fs_info->fs_devices->open_devices--; } fs_info->fs_devices->num_devices--; @@ -2041,10 +2092,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, } } - if (!*device) { - btrfs_err(root->fs_info, "no missing device found"); - return -ENOENT; - } + if (!*device) + return BTRFS_ERROR_DEV_MISSING_NOT_FOUND; return 0; } else { @@ -2309,7 +2358,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) tmp + 1); /* add sysfs device entry */ - btrfs_kobj_add_device(root->fs_info->fs_devices, device); + btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device); /* * we've got more storage, clear any full flags on the space @@ -2350,9 +2399,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", root->fs_info->fsid); - if (kobject_rename(&root->fs_info->fs_devices->super_kobj, + if (kobject_rename(&root->fs_info->fs_devices->fsid_kobj, fsid_buf)) - pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); + btrfs_warn(root->fs_info, + "sysfs: failed to create fsid for sprout"); } root->fs_info->num_tolerated_disk_barrier_failures = @@ -2368,7 +2418,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_relocate_sys_chunks(root); if (ret < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to relocate sys chunks after " "device initialization. This can be fixed " "using the \"btrfs balance\" command."); @@ -2388,7 +2438,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) error_trans: btrfs_end_transaction(trans, root); rcu_string_free(device->name); - btrfs_kobj_rm_device(root->fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2613,7 +2663,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ - btrfs_error(root->fs_info, -ENOENT, + btrfs_std_error(root->fs_info, -ENOENT, "Failed lookup while freeing chunk."); ret = -ENOENT; goto out; @@ -2621,7 +2671,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret < 0) - btrfs_error(root->fs_info, ret, + btrfs_std_error(root->fs_info, ret, "Failed to delete chunk item."); out: btrfs_free_path(path); @@ -2806,7 +2856,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_std_error(root->fs_info, ret); + btrfs_std_error(root->fs_info, ret, NULL); return ret; } @@ -3009,16 +3059,19 @@ static void update_balance_args(struct btrfs_balance_control *bctl) * (albeit full) chunks. */ if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->data.usage = 90; } if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->sys.usage = 90; } if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && + !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; bctl->meta.usage = 90; @@ -3074,13 +3127,46 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, struct btrfs_balance_args *bargs) { struct btrfs_block_group_cache *cache; + u64 chunk_used; + u64 user_thresh_min; + u64 user_thresh_max; + int ret = 1; + + cache = btrfs_lookup_block_group(fs_info, chunk_offset); + chunk_used = btrfs_block_group_used(&cache->item); + + if (bargs->usage_min == 0) + user_thresh_min = 0; + else + user_thresh_min = div_factor_fine(cache->key.offset, + bargs->usage_min); + + if (bargs->usage_max == 0) + user_thresh_max = 1; + else if (bargs->usage_max > 100) + user_thresh_max = cache->key.offset; + else + user_thresh_max = div_factor_fine(cache->key.offset, + bargs->usage_max); + + if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) + ret = 0; + + btrfs_put_block_group(cache); + return ret; +} + +static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, + u64 chunk_offset, struct btrfs_balance_args *bargs) +{ + struct btrfs_block_group_cache *cache; u64 chunk_used, user_thresh; int ret = 1; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = btrfs_block_group_used(&cache->item); - if (bargs->usage == 0) + if (bargs->usage_min == 0) user_thresh = 1; else if (bargs->usage > 100) user_thresh = cache->key.offset; @@ -3170,6 +3256,19 @@ static int chunk_vrange_filter(struct extent_buffer *leaf, return 1; } +static int chunk_stripes_range_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) +{ + int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + + if (bargs->stripes_min <= num_stripes + && num_stripes <= bargs->stripes_max) + return 0; + + return 1; +} + static int chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { @@ -3216,6 +3315,9 @@ static int should_balance_chunk(struct btrfs_root *root, if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { return 0; + } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && + chunk_usage_range_filter(bctl->fs_info, chunk_offset, bargs)) { + return 0; } /* devid filter */ @@ -3236,6 +3338,12 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; } + /* stripes filter */ + if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && + chunk_stripes_range_filter(leaf, chunk, bargs)) { + return 0; + } + /* soft profile changing mode */ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && chunk_soft_convert_filter(chunk_type, bargs)) { @@ -3250,6 +3358,16 @@ static int should_balance_chunk(struct btrfs_root *root, return 0; else bargs->limit--; + } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { + /* + * Same logic as the 'limit' filter; the minimum cannot be + * determined here because we do not have the global informatoin + * about the count of all chunks that satisfy the filters. + */ + if (bargs->limit_max == 0) + return 0; + else + bargs->limit_max--; } return 1; @@ -3264,6 +3382,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) struct btrfs_device *device; u64 old_size; u64 size_to_free; + u64 chunk_type; struct btrfs_chunk *chunk; struct btrfs_path *path; struct btrfs_key key; @@ -3274,9 +3393,13 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) int ret; int enospc_errors = 0; bool counting = true; + /* The single value limit and min/max limits use the same bytes in the */ u64 limit_data = bctl->data.limit; u64 limit_meta = bctl->meta.limit; u64 limit_sys = bctl->sys.limit; + u32 count_data = 0; + u32 count_meta = 0; + u32 count_sys = 0; /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; @@ -3317,6 +3440,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->balance_lock); again: if (!counting) { + /* + * The single value limit and min/max limits use the same bytes + * in the + */ bctl->data.limit = limit_data; bctl->meta.limit = limit_meta; bctl->sys.limit = limit_sys; @@ -3364,6 +3491,7 @@ again: } chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); if (!counting) { spin_lock(&fs_info->balance_lock); @@ -3384,6 +3512,28 @@ again: spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); + + if (chunk_type & BTRFS_BLOCK_GROUP_DATA) + count_data++; + else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) + count_sys++; + else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) + count_meta++; + + goto loop; + } + + /* + * Apply limit_min filter, no need to check if the LIMITS + * filter is used, limit_min is 0 by default + */ + if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) && + count_data < bctl->data.limit_min) + || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) && + count_meta < bctl->meta.limit_min) + || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && + count_sys < bctl->sys.limit_min)) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto loop; } @@ -3461,11 +3611,20 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); if (ret) - btrfs_std_error(fs_info, ret); + btrfs_std_error(fs_info, ret, NULL); atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } +/* Non-zero return value signifies invalidity */ +static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg, + u64 allowed) +{ + return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) && + (!alloc_profile_is_valid(bctl_arg->target, 1) || + (bctl_arg->target & ~allowed))); +} + /* * Should be called with both balance and volume mutexes held */ @@ -3523,27 +3682,21 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (num_devices > 3) allowed |= (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID6); - if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->data.target, 1) || - (bctl->data.target & ~allowed))) { + if (validate_convert_profile(&bctl->data, allowed)) { btrfs_err(fs_info, "unable to start balance with target " "data profile %llu", bctl->data.target); ret = -EINVAL; goto out; } - if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->meta.target, 1) || - (bctl->meta.target & ~allowed))) { + if (validate_convert_profile(&bctl->meta, allowed)) { btrfs_err(fs_info, "unable to start balance with target metadata profile %llu", bctl->meta.target); ret = -EINVAL; goto out; } - if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (!alloc_profile_is_valid(bctl->sys.target, 1) || - (bctl->sys.target & ~allowed))) { + if (validate_convert_profile(&bctl->sys, allowed)) { btrfs_err(fs_info, "unable to start balance with target system profile %llu", bctl->sys.target); @@ -4285,65 +4438,6 @@ static int btrfs_cmp_device_info(const void *a, const void *b) return 0; } -static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { - [BTRFS_RAID_RAID10] = { - .sub_stripes = 2, - .dev_stripes = 1, - .devs_max = 0, /* 0 == as many as possible */ - .devs_min = 4, - .devs_increment = 2, - .ncopies = 2, - }, - [BTRFS_RAID_RAID1] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 2, - .devs_min = 2, - .devs_increment = 2, - .ncopies = 2, - }, - [BTRFS_RAID_DUP] = { - .sub_stripes = 1, - .dev_stripes = 2, - .devs_max = 1, - .devs_min = 1, - .devs_increment = 1, - .ncopies = 2, - }, - [BTRFS_RAID_RAID0] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 2, - .devs_increment = 1, - .ncopies = 1, - }, - [BTRFS_RAID_SINGLE] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 1, - .devs_min = 1, - .devs_increment = 1, - .ncopies = 1, - }, - [BTRFS_RAID_RAID5] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 2, - .devs_increment = 1, - .ncopies = 2, - }, - [BTRFS_RAID_RAID6] = { - .sub_stripes = 1, - .dev_stripes = 1, - .devs_max = 0, - .devs_min = 3, - .devs_increment = 1, - .ncopies = 3, - }, -}; - static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) { /* TODO allow them to set a preferred stripe size */ @@ -6594,8 +6688,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, BUG_ON(!path); ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "error %d while searching for dev_stats item for device %s!\n", + btrfs_warn_in_rcu(dev_root->fs_info, + "error %d while searching for dev_stats item for device %s", ret, rcu_str_deref(device->name)); goto out; } @@ -6605,8 +6699,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, /* need to delete old one and insert a new one */ ret = btrfs_del_item(trans, dev_root, path); if (ret != 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "delete too small dev_stats item for device %s failed %d!\n", + btrfs_warn_in_rcu(dev_root->fs_info, + "delete too small dev_stats item for device %s failed %d", rcu_str_deref(device->name), ret); goto out; } @@ -6619,9 +6713,9 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, dev_root, path, &key, sizeof(*ptr)); if (ret < 0) { - printk_in_rcu(KERN_WARNING "BTRFS: " - "insert dev_stats item for device %s failed %d!\n", - rcu_str_deref(device->name), ret); + btrfs_warn_in_rcu(dev_root->fs_info, + "insert dev_stats item for device %s failed %d", + rcu_str_deref(device->name), ret); goto out; } } @@ -6675,8 +6769,8 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) { if (!dev->dev_stats_valid) return; - printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " - "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + btrfs_err_rl_in_rcu(dev->dev_root->fs_info, + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), @@ -6695,8 +6789,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) if (i == BTRFS_DEV_STAT_VALUES_MAX) return; /* all values == 0, suppress message */ - printk_in_rcu(KERN_INFO "BTRFS: " - "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", + btrfs_info_in_rcu(dev->dev_root->fs_info, + "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", rcu_str_deref(dev->name), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), @@ -6740,22 +6834,34 @@ int btrfs_get_dev_stats(struct btrfs_root *root, return 0; } -int btrfs_scratch_superblock(struct btrfs_device *device) +void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path) { struct buffer_head *bh; struct btrfs_super_block *disk_super; + int copy_num; - bh = btrfs_read_dev_super(device->bdev); - if (!bh) - return -EINVAL; - disk_super = (struct btrfs_super_block *)bh->b_data; + if (!bdev) + return; - memset(&disk_super->magic, 0, sizeof(disk_super->magic)); - set_buffer_dirty(bh); - sync_dirty_buffer(bh); - brelse(bh); + for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; + copy_num++) { - return 0; + if (btrfs_read_dev_one_super(bdev, copy_num, &bh)) + continue; + + disk_super = (struct btrfs_super_block *)bh->b_data; + + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + + /* Notify udev that device has changed */ + btrfs_kobject_uevent(bdev, KOBJ_CHANGE); + + /* Update ctime/mtime for device path for libblkid */ + update_dev_time(device_path); } /* @@ -6823,3 +6929,38 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) fs_devices = fs_devices->seed; } } + +void btrfs_close_one_device(struct btrfs_device *device) +{ + struct btrfs_fs_devices *fs_devices = device->fs_devices; + struct btrfs_device *new_device; + struct rcu_string *name; + + if (device->bdev) + fs_devices->open_devices--; + + if (device->writeable && + device->devid != BTRFS_DEV_REPLACE_DEVID) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + if (device->missing) + fs_devices->missing_devices--; + + new_device = btrfs_alloc_device(NULL, &device->devid, + device->uuid); + BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ + + /* Safe because we are under uuid_mutex */ + if (device->name) { + name = rcu_string_strdup(device->name->str, GFP_NOFS); + BUG_ON(!name); /* -ENOMEM */ + rcu_assign_pointer(new_device->name, name); + } + + list_replace_rcu(&device->dev_list, &new_device->dev_list); + new_device->fs_devices = device->fs_devices; + + call_rcu(&device->rcu, free_device); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 595279a8b99f..ec5712372732 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -256,7 +256,7 @@ struct btrfs_fs_devices { struct btrfs_fs_info *fs_info; /* sysfs kobjects */ - struct kobject super_kobj; + struct kobject fsid_kobj; struct kobject *device_dir_kobj; struct completion kobj_unregister; }; @@ -334,10 +334,15 @@ struct btrfs_raid_attr { int dev_stripes; /* stripes per dev */ int devs_max; /* max devs to use */ int devs_min; /* min devs needed */ + int tolerated_failures; /* max tolerated fail devs */ int devs_increment; /* ndevs has to be a multiple of this */ int ncopies; /* how many copies to data has */ }; +extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; + +extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES]; + struct map_lookup { u64 type; int io_align; @@ -375,6 +380,9 @@ struct map_lookup { #define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) #define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) #define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) +#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6) +#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7) +#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 8) #define BTRFS_BALANCE_ARGS_MASK \ (BTRFS_BALANCE_ARGS_PROFILES | \ @@ -382,7 +390,10 @@ struct map_lookup { BTRFS_BALANCE_ARGS_DEVID | \ BTRFS_BALANCE_ARGS_DRANGE | \ BTRFS_BALANCE_ARGS_VRANGE | \ - BTRFS_BALANCE_ARGS_LIMIT) + BTRFS_BALANCE_ARGS_LIMIT | \ + BTRFS_BALANCE_ARGS_LIMIT_RANGE | \ + BTRFS_BALANCE_ARGS_STRIPES_RANGE | \ + BTRFS_BALANCE_ARGS_USAGE_RANGE) /* * Profile changing flags. When SOFT is set we won't relocate chunk if @@ -482,7 +493,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); -int btrfs_scratch_superblock(struct btrfs_device *device); +void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path); int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len, int mirror_num); unsigned long btrfs_full_stripe_len(struct btrfs_root *root, @@ -555,5 +566,6 @@ static inline void unlock_chunks(struct btrfs_root *root) struct list_head *btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); +void btrfs_close_one_device(struct btrfs_device *device); #endif diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index b1eede3678a9..0557c45e9c33 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -84,7 +84,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); dentry = d_hash_and_lookup(parent, name); - if (unlikely(IS_ERR(dentry))) + if (IS_ERR(dentry)) return; if (dentry) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 3c4db1172d22..e2e47ba5d313 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -270,7 +270,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode); - if (unlikely(IS_ERR(ecryptfs_inode))) { + if (IS_ERR(ecryptfs_inode)) { ecryptfs_printk(KERN_WARNING, "Failed to create file in" "lower filesystem\n"); rc = PTR_ERR(ecryptfs_inode); diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 75285ea9aa05..f52cf54f0cbc 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -8,7 +8,7 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ - xattr_trusted.o inline.o readpage.o + xattr_trusted.o inline.o readpage.o sysfs.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index cd6ea29be645..ec0668a60678 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -191,6 +191,7 @@ static int ext4_init_block_bitmap(struct super_block *sb, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { + ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, @@ -203,7 +204,7 @@ static int ext4_init_block_bitmap(struct super_block *sb, count); } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); - return -EIO; + return -EFSBADCRC; } memset(bh->b_data, 0, sb->s_blocksize); @@ -213,7 +214,7 @@ static int ext4_init_block_bitmap(struct super_block *sb, start = ext4_group_first_block_no(sb, block_group); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (ext4_has_feature_flex_bg(sb)) flex_bg = 1; /* Set bits for block and inode bitmaps, and inode table */ @@ -322,7 +323,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, ext4_fsblk_t blk; ext4_fsblk_t group_first_block; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + if (ext4_has_feature_flex_bg(sb)) { /* with FLEX_BG, the inode/block bitmaps and itable * blocks may not be in the group at all * so the bitmap validation will be skipped for those groups @@ -360,42 +361,45 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, return 0; } -static void ext4_validate_block_bitmap(struct super_block *sb, - struct ext4_group_desc *desc, - ext4_group_t block_group, - struct buffer_head *bh) +static int ext4_validate_block_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); - if (buffer_verified(bh) || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - return; + if (buffer_verified(bh)) + return 0; + if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) + return -EFSCORRUPTED; ext4_lock_group(sb, block_group); - blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); - if (unlikely(blk != 0)) { + if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, + desc, bh))) { ext4_unlock_group(sb, block_group); - ext4_error(sb, "bg %u: block %llu: invalid block bitmap", - block_group, blk); + ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); - return; + return -EFSBADCRC; } - if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, - desc, bh))) { + blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); + if (unlikely(blk != 0)) { ext4_unlock_group(sb, block_group); - ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); + ext4_error(sb, "bg %u: block %llu: invalid block bitmap", + block_group, blk); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, grp->bb_free); set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); - return; + return -EFSCORRUPTED; } set_buffer_verified(bh); ext4_unlock_group(sb, block_group); + return 0; } /** @@ -414,17 +418,18 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) struct ext4_group_desc *desc; struct buffer_head *bh; ext4_fsblk_t bitmap_blk; + int err; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) - return NULL; + return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_block_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_error(sb, "Cannot get buffer for block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); - return NULL; + return ERR_PTR(-ENOMEM); } if (bitmap_uptodate(bh)) @@ -437,7 +442,6 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) } ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - int err; err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); @@ -445,7 +449,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) ext4_unlock_group(sb, block_group); unlock_buffer(bh); if (err) - ext4_error(sb, "Checksum bad for grp %u", block_group); + goto out; goto verify; } ext4_unlock_group(sb, block_group); @@ -468,11 +472,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) submit_bh(READ | REQ_META | REQ_PRIO, bh); return bh; verify: - ext4_validate_block_bitmap(sb, desc, block_group, bh); - if (buffer_verified(bh)) - return bh; + err = ext4_validate_block_bitmap(sb, desc, block_group, bh); + if (err) + goto out; + return bh; +out: put_bh(bh); - return NULL; + return ERR_PTR(err); } /* Returns 0 on success, 1 on error */ @@ -485,32 +491,32 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, return 0; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) - return 1; + return -EFSCORRUPTED; wait_on_buffer(bh); if (!buffer_uptodate(bh)) { ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); - return 1; + return -EIO; } clear_buffer_new(bh); /* Panic or remount fs read-only if block bitmap is invalid */ - ext4_validate_block_bitmap(sb, desc, block_group, bh); - /* ...but check for error just in case errors=continue. */ - return !buffer_verified(bh); + return ext4_validate_block_bitmap(sb, desc, block_group, bh); } struct buffer_head * ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) { struct buffer_head *bh; + int err; bh = ext4_read_block_bitmap_nowait(sb, block_group); - if (!bh) - return NULL; - if (ext4_wait_block_bitmap(sb, block_group, bh)) { + if (IS_ERR(bh)) + return bh; + err = ext4_wait_block_bitmap(sb, block_group, bh); + if (err) { put_bh(bh); - return NULL; + return ERR_PTR(err); } return bh; } @@ -681,8 +687,10 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); - if (bitmap_bh == NULL) + if (IS_ERR(bitmap_bh)) { + bitmap_bh = NULL; continue; + } x = ext4_count_free(bitmap_bh->b_data, EXT4_CLUSTERS_PER_GROUP(sb) / 8); @@ -740,14 +748,13 @@ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) if (group == 0) return 1; - if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) { + if (ext4_has_feature_sparse_super2(sb)) { if (group == le32_to_cpu(es->s_backup_bgs[0]) || group == le32_to_cpu(es->s_backup_bgs[1])) return 1; return 0; } - if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) + if ((group <= 1) || !ext4_has_feature_sparse_super(sb)) return 1; if (!(group & 1)) return 0; @@ -776,7 +783,7 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, if (!ext4_bg_has_super(sb, group)) return 0; - if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) + if (ext4_has_feature_meta_bg(sb)) return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); else return EXT4_SB(sb)->s_gdb_count; @@ -797,8 +804,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); - if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) || - metagroup < first_meta_bg) + if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) return ext4_bg_num_gdb_nometa(sb, group); return ext4_bg_num_gdb_meta(sb,group); @@ -818,7 +824,7 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, /* Check for superblock and gdt backups in this group */ num = ext4_bg_has_super(sb, block_group); - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || + if (!ext4_has_feature_meta_bg(sb) || block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * sbi->s_desc_per_block) { if (num) { diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 3522340c7a99..02ddec6d8a7d 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -234,7 +234,7 @@ int ext4_check_blockref(const char *function, unsigned int line, es->s_last_error_block = cpu_to_le64(blk); ext4_error_inode(inode, function, line, blk, "invalid block"); - return -EIO; + return -EFSCORRUPTED; } } return 0; diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 45731558138c..af06830bfc00 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -253,8 +253,7 @@ typedef enum { EXT4_ENCRYPT, } ext4_direction_t; -static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, - struct inode *inode, +static int ext4_page_crypto(struct inode *inode, ext4_direction_t rw, pgoff_t index, struct page *src_page, @@ -296,7 +295,6 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, else res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } @@ -353,7 +351,7 @@ struct page *ext4_encrypt(struct inode *inode, if (IS_ERR(ciphertext_page)) goto errout; ctx->w.control_page = plaintext_page; - err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index, + err = ext4_page_crypto(inode, EXT4_ENCRYPT, plaintext_page->index, plaintext_page, ciphertext_page); if (err) { ciphertext_page = ERR_PTR(err); @@ -378,31 +376,14 @@ struct page *ext4_encrypt(struct inode *inode, * * Return: Zero on success, non-zero otherwise. */ -int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page) +int ext4_decrypt(struct page *page) { BUG_ON(!PageLocked(page)); - return ext4_page_crypto(ctx, page->mapping->host, + return ext4_page_crypto(page->mapping->host, EXT4_DECRYPT, page->index, page, page); } -/* - * Convenience function which takes care of allocating and - * deallocating the encryption context - */ -int ext4_decrypt_one(struct inode *inode, struct page *page) -{ - int ret; - - struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode); - - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - ret = ext4_decrypt(ctx, page); - ext4_release_crypto_ctx(ctx); - return ret; -} - int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) { struct ext4_crypto_ctx *ctx; @@ -411,7 +392,13 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) ext4_lblk_t lblk = ex->ee_block; ext4_fsblk_t pblk = ext4_ext_pblock(ex); unsigned int len = ext4_ext_get_actual_len(ex); - int err = 0; + int ret, err = 0; + +#if 0 + ext4_msg(inode->i_sb, KERN_CRIT, + "ext4_encrypted_zeroout ino %lu lblk %u len %u", + (unsigned long) inode->i_ino, lblk, len); +#endif BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE); @@ -426,7 +413,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) } while (len--) { - err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk, + err = ext4_page_crypto(inode, EXT4_ENCRYPT, lblk, ZERO_PAGE(0), ciphertext_page); if (err) goto errout; @@ -437,17 +424,26 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) goto errout; } bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_iter.bi_sector = pblk; - err = bio_add_page(bio, ciphertext_page, + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + ret = bio_add_page(bio, ciphertext_page, inode->i_sb->s_blocksize, 0); - if (err) { + if (ret != inode->i_sb->s_blocksize) { + /* should never happen! */ + ext4_msg(inode->i_sb, KERN_ERR, + "bio_add_page failed: %d", ret); + WARN_ON(1); bio_put(bio); + err = -EIO; goto errout; } err = submit_bio_wait(WRITE, bio); + if ((err == 0) && bio->bi_error) + err = -EIO; bio_put(bio); if (err) goto errout; + lblk++; pblk++; } err = 0; errout: diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index 847f919c84d9..2fbef8a14760 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -120,7 +120,6 @@ static int ext4_fname_encrypt(struct inode *inode, ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } @@ -182,7 +181,6 @@ static int ext4_fname_decrypt(struct inode *inode, ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); res = crypto_ablkcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 5c52c79dea46..c5882b36e558 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -71,7 +71,6 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], EXT4_AES_256_XTS_KEY_SIZE, NULL); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } @@ -208,7 +207,12 @@ retry: goto out; } crypt_info->ci_keyring_key = keyring_key; - BUG_ON(keyring_key->type != &key_type_logon); + if (keyring_key->type != &key_type_logon) { + printk_once(KERN_WARNING + "ext4: key type must be logon\n"); + res = -ENOKEY; + goto out; + } ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct ext4_encryption_key)) { res = -EINVAL; @@ -217,7 +221,13 @@ retry: master_key = (struct ext4_encryption_key *)ukp->data; BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != EXT4_KEY_DERIVATION_NONCE_SIZE); - BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); + if (master_key->size != EXT4_AES_256_XTS_KEY_SIZE) { + printk_once(KERN_WARNING + "ext4: key size incorrect: %d\n", + master_key->size); + res = -ENOKEY; + goto out; + } res = ext4_derive_key_aes(ctx.nonce, master_key->raw, raw_key); if (res) diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index a640ec2c4b13..ad050698143f 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -150,7 +150,8 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent, if ((parent == NULL) || (child == NULL)) { pr_err("parent %p child %p\n", parent, child); - BUG_ON(1); + WARN_ON(1); /* Should never happen */ + return 0; } /* no restrictions if the parent directory is not encrypted */ if (!ext4_encrypted_inode(parent)) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index f9e14911918c..1d1bca74f844 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -40,8 +40,7 @@ static int is_dx_dir(struct inode *inode) { struct super_block *sb = inode->i_sb; - if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX) && + if (ext4_has_feature_dir_index(inode->i_sb) && ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || ((inode->i_size >> sb->s_blocksize_bits) == 1) || ext4_has_inline_data(inode))) @@ -621,14 +620,14 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, while ((char *) de < top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) - return -EIO; + return -EFSCORRUPTED; nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } if ((char *) de > top) - return -EIO; + return -EFSCORRUPTED; return 0; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fd1f28be5296..750063f7a50c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -374,6 +374,7 @@ struct flex_groups { #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ +#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ @@ -431,6 +432,7 @@ enum { EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ + EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; @@ -475,6 +477,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(EOFBLOCKS); CHECK_FLAG_VALUE(INLINE_DATA); + CHECK_FLAG_VALUE(PROJINHERIT); CHECK_FLAG_VALUE(RESERVED); } @@ -692,6 +695,7 @@ struct ext4_inode { __le32 i_crtime; /* File Creation time */ __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ __le32 i_version_hi; /* high 32 bits for 64-bit version */ + __le32 i_projid; /* Project ID */ }; struct move_extent { @@ -1019,6 +1023,9 @@ struct ext4_inode_info { #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated file systems */ +#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly + specified journal checksum */ + #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ @@ -1179,7 +1186,9 @@ struct ext4_super_block { __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ __le32 s_lpf_ino; /* Location of the lost+found inode */ - __le32 s_reserved[100]; /* Padding to the end of the block */ + __le32 s_prj_quota_inum; /* inode for tracking project quota */ + __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ + __le32 s_reserved[98]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1522,6 +1531,7 @@ static inline int ext4_encrypted_inode(struct inode *inode) * Feature set definitions */ +/* Use the ext4_{has,set,clear}_feature_* helpers; these will be removed */ #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ @@ -1566,6 +1576,7 @@ static inline int ext4_encrypted_inode(struct inode *inode) */ #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 +#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1578,11 +1589,99 @@ static inline int ext4_encrypted_inode(struct inode *inode) #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ -#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ +#define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_compat |= \ + cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ + ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ +} + +#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline bool ext4_has_feature_##name(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ +} \ +static inline void ext4_set_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_incompat |= \ + cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} \ +static inline void ext4_clear_feature_##name(struct super_block *sb) \ +{ \ + EXT4_SB(sb)->s_es->s_feature_incompat &= \ + ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ +} + +EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) +EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) +EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) +EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) +EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) +EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) +EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) + +EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) +EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) +EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) +EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) +EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) +EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) +EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) +EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) +EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) +EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) + +EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) +EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) +EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) +EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) +EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) +EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) +EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) +EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) +EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) +EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) +EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) +EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) +EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) + #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_META_BG) @@ -1598,7 +1697,7 @@ static inline int ext4_encrypted_inode(struct inode *inode) EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) -#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG| \ @@ -1607,7 +1706,8 @@ static inline int ext4_encrypted_inode(struct inode *inode) EXT4_FEATURE_INCOMPAT_FLEX_BG| \ EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ - EXT4_FEATURE_INCOMPAT_ENCRYPT) + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -1619,6 +1719,40 @@ static inline int ext4_encrypted_inode(struct inode *inode) EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ EXT4_FEATURE_RO_COMPAT_QUOTA) +#define EXTN_FEATURE_FUNCS(ver) \ +static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ + cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ +} \ +static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ +{ \ + return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ + cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ +} + +EXTN_FEATURE_FUNCS(2) +EXTN_FEATURE_FUNCS(3) +EXTN_FEATURE_FUNCS(4) + +static inline bool ext4_has_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_compat != 0); +} +static inline bool ext4_has_ro_compat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); +} +static inline bool ext4_has_incompat_features(struct super_block *sb) +{ + return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); +} + /* * Default values for user and/or group using reserved blocks */ @@ -1769,8 +1903,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) * (c) Daniel Phillips, 2001 */ -#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ - EXT4_FEATURE_COMPAT_DIR_INDEX) && \ +#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) @@ -2063,8 +2196,7 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx); void ext4_restore_control_page(struct page *data_page); struct page *ext4_encrypt(struct inode *inode, struct page *plaintext_page); -int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page); -int ext4_decrypt_one(struct inode *inode, struct page *page); +int ext4_decrypt(struct page *page); int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); #ifdef CONFIG_EXT4_FS_ENCRYPTION @@ -2072,7 +2204,7 @@ int ext4_init_crypto(void); void ext4_exit_crypto(void); static inline int ext4_sb_has_crypto(struct super_block *sb) { - return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); + return ext4_has_feature_encrypt(sb); } #else static inline int ext4_init_crypto(void) { return 0; } @@ -2193,8 +2325,7 @@ int ext4_insert_dentry(struct inode *dir, struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { - if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_COMPAT_DIR_INDEX)) + if (!ext4_has_feature_dir_index(inode->i_sb)) ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } static unsigned char ext4_filetype_table[] = { @@ -2203,8 +2334,7 @@ static unsigned char ext4_filetype_table[] = { static inline unsigned char get_dtype(struct super_block *sb, int filetype) { - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || - (filetype >= EXT4_FT_MAX)) + if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) return DT_UNKNOWN; return ext4_filetype_table[filetype]; @@ -2245,6 +2375,7 @@ extern int ext4_init_inode_table(struct super_block *sb, extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* mballoc.c */ +extern const struct file_operations ext4_seq_mb_groups_fops; extern long ext4_mb_stats; extern long ext4_mb_max_to_scan; extern int ext4_mb_init(struct super_block *); @@ -2372,6 +2503,7 @@ extern int ext4_group_extend(struct super_block *sb, extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ +extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); extern void *ext4_kvmalloc(size_t size, gfp_t flags); @@ -2534,15 +2666,13 @@ extern int ext4_register_li_request(struct super_block *sb, static inline int ext4_has_group_desc_csum(struct super_block *sb) { - return EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM) || - (EXT4_SB(sb)->s_chksum_driver != NULL); + return ext4_has_feature_gdt_csum(sb) || + EXT4_SB(sb)->s_chksum_driver != NULL; } static inline int ext4_has_metadata_csum(struct super_block *sb) { - WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) && !EXT4_SB(sb)->s_chksum_driver); return (EXT4_SB(sb)->s_chksum_driver != NULL); @@ -2889,7 +3019,7 @@ static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { static inline void ext4_set_de_type(struct super_block *sb, struct ext4_dir_entry_2 *de, umode_t mode) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE)) + if (ext4_has_feature_filetype(sb)) de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } @@ -2903,6 +3033,12 @@ extern const struct inode_operations ext4_encrypted_symlink_inode_operations; extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; +/* sysfs.c */ +extern int ext4_register_sysfs(struct super_block *sb); +extern void ext4_unregister_sysfs(struct super_block *sb); +extern int __init ext4_init_sysfs(void); +extern void ext4_exit_sysfs(void); + /* block_validity */ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); @@ -3049,4 +3185,7 @@ extern void ext4_resize_end(struct super_block *sb); #endif /* __KERNEL__ */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ + #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index d41843181818..e770c1ee4613 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -88,13 +88,13 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) return 0; } + err = handle->h_err; if (!handle->h_transaction) { - err = jbd2_journal_stop(handle); - return handle->h_err ? handle->h_err : err; + rc = jbd2_journal_stop(handle); + return err ? err : rc; } sb = handle->h_transaction->t_journal->j_private; - err = handle->h_err; rc = jbd2_journal_stop(handle); if (!err) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 9c5b49fb281e..5f5846211095 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -34,8 +34,7 @@ */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ - (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ - ? 20U : 8U) + (ext4_has_feature_extents(sb) ? 20U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode @@ -84,17 +83,16 @@ /* Amount of blocks needed for quota update - we know that the structure was * allocated so we need to update only data block */ #define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ - EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ - 1 : 0) + ext4_has_feature_quota(sb)) ? 1 : 0) /* Amount of blocks needed for quota insert/delete - we do some block writes * but inode, sb and group updates are done only once */ #define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ - EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ + ext4_has_feature_quota(sb)) ?\ (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_INIT_REWRITE) : 0) #define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\ - EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\ + ext4_has_feature_quota(sb)) ?\ (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\ +3+DQUOT_DEL_REWRITE) : 0) #else diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2553aa8b608d..551353b1b17a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -442,7 +442,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, int depth, ext4_fsblk_t pblk) { const char *error_msg; - int max = 0; + int max = 0, err = -EFSCORRUPTED; if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) { error_msg = "invalid magic"; @@ -473,6 +473,7 @@ static int __ext4_ext_check(const char *function, unsigned int line, if (ext_depth(inode) != depth && !ext4_extent_block_csum_verify(inode, eh)) { error_msg = "extent tree corrupted"; + err = -EFSBADCRC; goto corrupted; } return 0; @@ -485,7 +486,7 @@ corrupted: le16_to_cpu(eh->eh_magic), le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), max, le16_to_cpu(eh->eh_depth), depth); - return -EIO; + return err; } #define ext4_ext_check(inode, eh, depth, pblk) \ @@ -899,7 +900,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, bh = read_extent_tree_block(inode, path[ppos].p_block, --i, flags); - if (unlikely(IS_ERR(bh))) { + if (IS_ERR(bh)) { ret = PTR_ERR(bh); goto err; } @@ -910,7 +911,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, put_bh(bh); EXT4_ERROR_INODE(inode, "ppos %d > depth %d", ppos, depth); - ret = -EIO; + ret = -EFSCORRUPTED; goto err; } path[ppos].p_bh = bh; @@ -959,7 +960,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, EXT4_ERROR_INODE(inode, "logical %d == ei_block %d!", logical, le32_to_cpu(curp->p_idx->ei_block)); - return -EIO; + return -EFSCORRUPTED; } if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) @@ -968,7 +969,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, "eh_entries %d >= eh_max %d!", le16_to_cpu(curp->p_hdr->eh_entries), le16_to_cpu(curp->p_hdr->eh_max)); - return -EIO; + return -EFSCORRUPTED; } if (logical > le32_to_cpu(curp->p_idx->ei_block)) { @@ -992,7 +993,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); - return -EIO; + return -EFSCORRUPTED; } ix->ei_block = cpu_to_le32(logical); @@ -1001,7 +1002,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); - return -EIO; + return -EFSCORRUPTED; } err = ext4_ext_dirty(handle, inode, curp); @@ -1042,7 +1043,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, * border from split point */ if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) { EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!"); - return -EIO; + return -EFSCORRUPTED; } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; @@ -1086,7 +1087,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, newblock = ablocks[--a]; if (unlikely(newblock == 0)) { EXT4_ERROR_INODE(inode, "newblock == 0!"); - err = -EIO; + err = -EFSCORRUPTED; goto cleanup; } bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); @@ -1112,7 +1113,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!", path[depth].p_hdr->eh_entries, path[depth].p_hdr->eh_max); - err = -EIO; + err = -EFSCORRUPTED; goto cleanup; } /* start copy from next extent */ @@ -1151,7 +1152,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, k = depth - at - 1; if (unlikely(k < 0)) { EXT4_ERROR_INODE(inode, "k %d < 0!", k); - err = -EIO; + err = -EFSCORRUPTED; goto cleanup; } if (k) @@ -1191,7 +1192,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, EXT4_ERROR_INODE(inode, "EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!", le32_to_cpu(path[i].p_ext->ee_block)); - err = -EIO; + err = -EFSCORRUPTED; goto cleanup; } /* start copy indexes */ @@ -1425,7 +1426,7 @@ static int ext4_ext_search_left(struct inode *inode, if (unlikely(path == NULL)) { EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); - return -EIO; + return -EFSCORRUPTED; } depth = path->p_depth; *phys = 0; @@ -1444,7 +1445,7 @@ static int ext4_ext_search_left(struct inode *inode, EXT4_ERROR_INODE(inode, "EXT_FIRST_EXTENT != ex *logical %d ee_block %d!", *logical, le32_to_cpu(ex->ee_block)); - return -EIO; + return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; @@ -1455,7 +1456,7 @@ static int ext4_ext_search_left(struct inode *inode, EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ? le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0, depth); - return -EIO; + return -EFSCORRUPTED; } } return 0; @@ -1465,7 +1466,7 @@ static int ext4_ext_search_left(struct inode *inode, EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); - return -EIO; + return -EFSCORRUPTED; } *logical = le32_to_cpu(ex->ee_block) + ee_len - 1; @@ -1495,7 +1496,7 @@ static int ext4_ext_search_right(struct inode *inode, if (unlikely(path == NULL)) { EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical); - return -EIO; + return -EFSCORRUPTED; } depth = path->p_depth; *phys = 0; @@ -1514,7 +1515,7 @@ static int ext4_ext_search_right(struct inode *inode, EXT4_ERROR_INODE(inode, "first_extent(path[%d].p_hdr) != ex", depth); - return -EIO; + return -EFSCORRUPTED; } while (--depth >= 0) { ix = path[depth].p_idx; @@ -1522,7 +1523,7 @@ static int ext4_ext_search_right(struct inode *inode, EXT4_ERROR_INODE(inode, "ix != EXT_FIRST_INDEX *logical %d!", *logical); - return -EIO; + return -EFSCORRUPTED; } } goto found_extent; @@ -1532,7 +1533,7 @@ static int ext4_ext_search_right(struct inode *inode, EXT4_ERROR_INODE(inode, "logical %d < ee_block %d + ee_len %d!", *logical, le32_to_cpu(ex->ee_block), ee_len); - return -EIO; + return -EFSCORRUPTED; } if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) { @@ -1670,7 +1671,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode, if (unlikely(ex == NULL || eh == NULL)) { EXT4_ERROR_INODE(inode, "ex %p == NULL or eh %p == NULL", ex, eh); - return -EIO; + return -EFSCORRUPTED; } if (depth == 0) { @@ -1938,14 +1939,14 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); - return -EIO; + return -EFSCORRUPTED; } depth = ext_depth(inode); ex = path[depth].p_ext; eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - return -EIO; + return -EFSCORRUPTED; } /* try to insert block into found extent and return */ @@ -2172,7 +2173,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, if (unlikely(path[depth].p_hdr == NULL)) { up_read(&EXT4_I(inode)->i_data_sem); EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - err = -EIO; + err = -EFSCORRUPTED; break; } ex = path[depth].p_ext; @@ -2241,7 +2242,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, if (unlikely(es.es_len == 0)) { EXT4_ERROR_INODE(inode, "es.es_len == 0"); - err = -EIO; + err = -EFSCORRUPTED; break; } @@ -2264,7 +2265,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, "next extent == %u, next " "delalloc extent = %u", next, next_del); - err = -EIO; + err = -EFSCORRUPTED; break; } } @@ -2363,7 +2364,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, leaf = ext4_idx_pblock(path->p_idx); if (unlikely(path->p_hdr->eh_entries == 0)) { EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0"); - return -EIO; + return -EFSCORRUPTED; } err = ext4_ext_get_access(handle, inode, path); if (err) @@ -2612,7 +2613,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, eh = path[depth].p_hdr; if (unlikely(path[depth].p_hdr == NULL)) { EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - return -EIO; + return -EFSCORRUPTED; } /* find where to start removing */ ex = path[depth].p_ext; @@ -2666,7 +2667,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, "on extent %u:%u", start, end, ex_ee_block, ex_ee_block + ex_ee_len - 1); - err = -EIO; + err = -EFSCORRUPTED; goto out; } else if (a != ex_ee_block) { /* remove tail of the extent */ @@ -2841,7 +2842,7 @@ again: EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); - err = -EIO; + err = -EFSCORRUPTED; } goto out; } @@ -2920,7 +2921,7 @@ again: i = 0; if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) { - err = -EIO; + err = -EFSCORRUPTED; goto out; } } @@ -2978,7 +2979,7 @@ again: * Should be a no-op if we did IO above. */ cond_resched(); if (WARN_ON(i + 1 > depth)) { - err = -EIO; + err = -EFSCORRUPTED; break; } path[i + 1].p_bh = bh; @@ -3054,7 +3055,7 @@ void ext4_ext_init(struct super_block *sb) * possible initialization would be here */ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (ext4_has_feature_extents(sb)) { #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS) printk(KERN_INFO "EXT4-fs: file extents enabled" #ifdef AGGRESSIVE_TEST @@ -3081,7 +3082,7 @@ void ext4_ext_init(struct super_block *sb) */ void ext4_ext_release(struct super_block *sb) { - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) + if (!ext4_has_feature_extents(sb)) return; #ifdef EXTENTS_STATS @@ -3345,7 +3346,7 @@ static int ext4_split_extent(handle_t *handle, if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); - return -EIO; + return -EFSCORRUPTED; } unwritten = ext4_ext_is_unwritten(ex); split_flag1 = 0; @@ -3558,6 +3559,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, max_zeroout = sbi->s_extent_max_zeroout_kb >> (inode->i_sb->s_blocksize_bits - 10); + if (ext4_encrypted_inode(inode)) + max_zeroout = 0; + /* If extent is less than s_max_zeroout_kb, zeroout directly */ if (max_zeroout && (ee_len <= max_zeroout)) { err = ext4_ext_zeroout(inode, ex); @@ -3970,7 +3974,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, if (!ex) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) map->m_lblk); - return -EIO; + return -EFSCORRUPTED; } } @@ -4308,7 +4312,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, "lblock: %lu, depth: %d pblock %lld", (unsigned long) map->m_lblk, depth, path[depth].p_block); - err = -EIO; + err = -EFSCORRUPTED; goto out2; } @@ -5271,7 +5275,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (depth == path->p_depth) { ex_start = path[depth].p_ext; if (!ex_start) - return -EIO; + return -EFSCORRUPTED; ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); @@ -5411,7 +5415,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if (!extent) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", (unsigned long) *iterator); - return -EIO; + return -EFSCORRUPTED; } if (SHIFT == SHIFT_LEFT && *iterator > le32_to_cpu(extent->ee_block)) { @@ -5792,7 +5796,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, int split = 0; path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); - if (unlikely(IS_ERR(path1))) { + if (IS_ERR(path1)) { *erp = PTR_ERR(path1); path1 = NULL; finish: @@ -5800,7 +5804,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, goto repeat; } path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); - if (unlikely(IS_ERR(path2))) { + if (IS_ERR(path2)) { *erp = PTR_ERR(path2); path2 = NULL; goto finish; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 26724aeece73..ac748b3af1c1 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1089,20 +1089,9 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, return nr_shrunk; } -static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos) +int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v) { - return *pos ? NULL : SEQ_START_TOKEN; -} - -static void * -ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos) -{ - return NULL; -} - -static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) -{ - struct ext4_sb_info *sbi = seq->private; + struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private); struct ext4_es_stats *es_stats = &sbi->s_es_stats; struct ext4_inode_info *ei, *max = NULL; unsigned int inode_cnt = 0; @@ -1143,45 +1132,6 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) return 0; } -static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v) -{ -} - -static const struct seq_operations ext4_es_seq_shrinker_info_ops = { - .start = ext4_es_seq_shrinker_info_start, - .next = ext4_es_seq_shrinker_info_next, - .stop = ext4_es_seq_shrinker_info_stop, - .show = ext4_es_seq_shrinker_info_show, -}; - -static int -ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file) -{ - int ret; - - ret = seq_open(file, &ext4_es_seq_shrinker_info_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = PDE_DATA(inode); - } - - return ret; -} - -static int -ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file) -{ - return seq_release(inode, file); -} - -static const struct file_operations ext4_es_seq_shrinker_info_fops = { - .owner = THIS_MODULE, - .open = ext4_es_seq_shrinker_info_open, - .read = seq_read, - .llseek = seq_lseek, - .release = ext4_es_seq_shrinker_info_release, -}; - int ext4_es_register_shrinker(struct ext4_sb_info *sbi) { int err; @@ -1210,10 +1160,6 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) if (err) goto err2; - if (sbi->s_proc) - proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc, - &ext4_es_seq_shrinker_info_fops, sbi); - return 0; err2: @@ -1225,8 +1171,6 @@ err1: void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { - if (sbi->s_proc) - remove_proc_entry("es_shrinker_info", sbi->s_proc); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); unregister_shrinker(&sbi->s_es_shrinker); diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 691b52613ce4..f7aa24f4642d 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -172,4 +172,6 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es, extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); +extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); + #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 619bfc1fda8c..1b8024d26f65 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -64,7 +64,7 @@ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap) } /* Initializes an uninitialized inode bitmap */ -static unsigned ext4_init_inode_bitmap(struct super_block *sb, +static int ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) @@ -89,7 +89,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, count); } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); - return 0; + return -EFSBADCRC; } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); @@ -99,7 +99,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, block_group, gdp); - return EXT4_INODES_PER_GROUP(sb); + return 0; } void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) @@ -112,6 +112,42 @@ void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate) put_bh(bh); } +static int ext4_validate_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *desc, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_fsblk_t blk; + struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (buffer_verified(bh)) + return 0; + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + return -EFSCORRUPTED; + + ext4_lock_group(sb, block_group); + blk = ext4_inode_bitmap(sb, desc); + if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, + EXT4_INODES_PER_GROUP(sb) / 8)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " + "inode_bitmap = %llu", block_group, blk); + grp = ext4_get_group_info(sb, block_group); + if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + int count; + count = ext4_free_inodes_count(sb, desc); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + return -EFSBADCRC; + } + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); + return 0; +} + /* * Read the inode allocation bitmap for a given block_group, reading * into the specified slot in the superblock's bitmap cache. @@ -124,12 +160,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) struct ext4_group_desc *desc; struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; - struct ext4_group_info *grp; - struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) - return NULL; + return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_inode_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); @@ -137,7 +172,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); - return NULL; + return ERR_PTR(-EIO); } if (bitmap_uptodate(bh)) goto verify; @@ -150,12 +185,14 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - ext4_init_inode_bitmap(sb, bh, block_group, desc); + err = ext4_init_inode_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); + if (err) + goto out; return bh; } ext4_unlock_group(sb, block_group); @@ -182,31 +219,17 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); - return NULL; + return ERR_PTR(-EIO); } verify: - ext4_lock_group(sb, block_group); - if (!buffer_verified(bh) && - !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, - EXT4_INODES_PER_GROUP(sb) / 8)) { - ext4_unlock_group(sb, block_group); - put_bh(bh); - ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " - "inode_bitmap = %llu", block_group, bitmap_blk); - grp = ext4_get_group_info(sb, block_group); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, desc); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); - return NULL; - } - ext4_unlock_group(sb, block_group); - set_buffer_verified(bh); + err = ext4_validate_inode_bitmap(sb, desc, block_group, bh); + if (err) + goto out; return bh; +out: + put_bh(bh); + return ERR_PTR(err); } /* @@ -286,8 +309,15 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) bitmap_bh = ext4_read_inode_bitmap(sb, block_group); /* Don't bother if the inode bitmap is corrupt. */ grp = ext4_get_group_info(sb, block_group); - if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh) + if (IS_ERR(bitmap_bh)) { + fatal = PTR_ERR(bitmap_bh); + bitmap_bh = NULL; + goto error_return; + } + if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) { + fatal = -EFSCORRUPTED; goto error_return; + } BUFFER_TRACE(bitmap_bh, "get_write_access"); fatal = ext4_journal_get_write_access(handle, bitmap_bh); @@ -826,7 +856,9 @@ got_group: brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); /* Skip groups with suspicious inode tables */ - if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) { + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || + IS_ERR(inode_bitmap_bh)) { + inode_bitmap_bh = NULL; if (++group == ngroups) group = 0; continue; @@ -902,8 +934,8 @@ got: struct buffer_head *block_bitmap_bh; block_bitmap_bh = ext4_read_block_bitmap(sb, group); - if (!block_bitmap_bh) { - err = -EIO; + if (IS_ERR(block_bitmap_bh)) { + err = PTR_ERR(block_bitmap_bh); goto out; } BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); @@ -1045,7 +1077,7 @@ got: ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; ei->i_inline_off = 0; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) + if (ext4_has_feature_inline_data(sb)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); @@ -1060,7 +1092,7 @@ got: if (err) goto fail_free_drop; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (ext4_has_feature_extents(sb)) { /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); @@ -1116,14 +1148,17 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) /* Error cases - e2fsck has already cleaned up for us */ if (ino > max_ino) { ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino); + err = -EFSCORRUPTED; goto error; } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (!bitmap_bh) { - ext4_warning(sb, "inode bitmap error for orphan %lu", ino); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + ext4_warning(sb, "inode bitmap error %ld for orphan %lu", + ino, err); goto error; } @@ -1198,8 +1233,10 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) desc_count += ext4_free_inodes_count(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_inode_bitmap(sb, i); - if (!bitmap_bh) + if (IS_ERR(bitmap_bh)) { + bitmap_bh = NULL; continue; + } x = ext4_count_free(bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb) / 8); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 2468261748b2..355ef9c36c87 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -562,11 +562,10 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, /* * Okay, we need to do block allocation. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + if (ext4_has_feature_bigalloc(inode->i_sb)) { EXT4_ERROR_INODE(inode, "Can't allocate blocks for " "non-extent mapped inodes with bigalloc"); - return -EUCLEAN; + return -EFSCORRUPTED; } /* Set up for the direct block allocation */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index cd944a7a99cd..d884989cc83d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -434,8 +434,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, memset((void *)ext4_raw_inode(&is.iloc)->i_block, 0, EXT4_MIN_INLINE_DATA_SIZE); - if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (ext4_has_feature_extents(inode->i_sb)) { if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 60aaecd5598b..7d1aad1d9313 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -378,7 +378,7 @@ static int __check_block_validity(struct inode *inode, const char *func, "lblock %lu mapped to illegal pblock " "(length %d)", (unsigned long) map->m_lblk, map->m_len); - return -EIO; + return -EFSCORRUPTED; } return 0; } @@ -480,7 +480,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, /* We can handle the block number less than EXT_MAX_BLOCKS */ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) - return -EIO; + return -EFSCORRUPTED; /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { @@ -965,7 +965,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (unlikely(err)) page_zero_new_buffers(page, from, to); else if (decrypt) - err = ext4_decrypt_one(inode, page); + err = ext4_decrypt(page); return err; } #endif @@ -1181,6 +1181,38 @@ errout: return ret ? ret : copied; } +/* + * This is a private version of page_zero_new_buffers() which doesn't + * set the buffer to be dirty, since in data=journalled mode we need + * to call ext4_handle_dirty_metadata() instead. + */ +static void zero_new_buffers(struct page *page, unsigned from, unsigned to) +{ + unsigned int block_start = 0, block_end; + struct buffer_head *head, *bh; + + bh = head = page_buffers(page); + do { + block_end = block_start + bh->b_size; + if (buffer_new(bh)) { + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, size; + + start = max(from, block_start); + size = min(to, block_end) - start; + + zero_user(page, start, size); + set_buffer_uptodate(bh); + } + clear_buffer_new(bh); + } + } + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} + static int ext4_journalled_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, @@ -1207,7 +1239,7 @@ static int ext4_journalled_write_end(struct file *file, if (copied < len) { if (!PageUptodate(page)) copied = 0; - page_zero_new_buffers(page, from+copied, to); + zero_new_buffers(page, from+copied, to); } ret = ext4_walk_page_buffers(handle, page_buffers(page), from, @@ -1815,11 +1847,22 @@ static int ext4_writepage(struct page *page, * the page. But we may reach here when we do a journal commit via * journal_submit_inode_data_buffers() and in that case we must write * allocated buffers to achieve data=ordered mode guarantees. + * + * Also, if there is only one buffer per page (the fs block + * size == the page size), if one buffer needs block + * allocation or needs to modify the extent tree to clear the + * unwritten flag, we know that the page can't be written at + * all, so we might as well refuse the write immediately. + * Unfortunately if the block size != page size, we can't as + * easily detect this case using ext4_walk_page_buffers(), but + * for the extremely common case, this is an optimization that + * skips a useless round trip through ext4_bio_write_page(). */ if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, ext4_bh_delay_or_unwritten)) { redirty_page_for_writepage(wbc, page); - if (current->flags & PF_MEMALLOC) { + if ((current->flags & PF_MEMALLOC) || + (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) { /* * For memory cleaning there's no point in writing only * some buffers. So just bail out. Warn if we came here @@ -2599,8 +2642,7 @@ static int ext4_nonda_switch(struct super_block *sb) /* We always reserve for an inode update; the superblock could be there too */ static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) { - if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_LARGE_FILE))) + if (likely(ext4_has_feature_large_file(inode->i_sb))) return 1; if (pos + len <= 0x7fffffffULL) @@ -3393,7 +3435,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* We expect the key to be set. */ BUG_ON(!ext4_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_CACHE_SIZE); - WARN_ON_ONCE(ext4_decrypt_one(inode, page)); + WARN_ON_ONCE(ext4_decrypt(page)); } } if (ext4_should_journal_data(inode)) { @@ -3820,7 +3862,7 @@ static int __ext4_get_inode_loc(struct inode *inode, iloc->bh = NULL; if (!ext4_valid_inum(sb, inode->i_ino)) - return -EIO; + return -EFSCORRUPTED; iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); @@ -4006,8 +4048,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct inode *inode = &(ei->vfs_inode); struct super_block *sb = inode->i_sb; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + if (ext4_has_feature_huge_file(sb)) { /* we are using combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); @@ -4068,7 +4109,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); - ret = -EIO; + ret = -EFSCORRUPTED; goto bad_inode; } } else @@ -4088,7 +4129,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (!ext4_inode_csum_verify(inode, raw_inode, ei)) { EXT4_ERROR_INODE(inode, "checksum invalid"); - ret = -EIO; + ret = -EFSBADCRC; goto bad_inode; } @@ -4130,7 +4171,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_flags = le32_to_cpu(raw_inode->i_flags); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) + if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(raw_inode); @@ -4203,7 +4244,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { EXT4_ERROR_INODE(inode, "bad extended attribute block %llu", ei->i_file_acl); - ret = -EIO; + ret = -EFSCORRUPTED; goto bad_inode; } else if (!ext4_has_inline_data(inode)) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { @@ -4254,7 +4295,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) } else if (ino == EXT4_BOOT_LOADER_INO) { make_bad_inode(inode); } else { - ret = -EIO; + ret = -EFSCORRUPTED; EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode); goto bad_inode; } @@ -4272,7 +4313,7 @@ bad_inode: struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) { if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) - return ERR_PTR(-EIO); + return ERR_PTR(-EFSCORRUPTED); return ext4_iget(sb, ino); } @@ -4294,7 +4335,7 @@ static int ext4_inode_blocks_set(handle_t *handle, ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) + if (!ext4_has_feature_huge_file(sb)) return -EFBIG; if (i_blocks <= 0xffffffffffffULL) { @@ -4455,8 +4496,7 @@ static int ext4_do_update_inode(handle_t *handle, need_datasync = 1; } if (ei->i_disksize > 0x7fffffffULL) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_LARGE_FILE) || + if (!ext4_has_feature_large_file(sb) || EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) set_large_file = 1; @@ -4505,8 +4545,7 @@ static int ext4_do_update_inode(handle_t *handle, if (err) goto out_brelse; ext4_update_dynamic_rev(sb); - EXT4_SET_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_LARGE_FILE); + ext4_set_feature_large_file(sb); ext4_handle_sync(handle); err = ext4_handle_dirty_super(handle, sb); } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 1346cfa355d0..5e872fd40e5e 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -145,8 +145,7 @@ static long swap_inode_boot_loader(struct super_block *sb, inode_bl->i_version = 1; i_size_write(inode_bl, 0); inode_bl->i_mode = S_IFREG; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (ext4_has_feature_extents(sb)) { ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode_bl); } else @@ -383,8 +382,7 @@ setversion_out: goto group_extend_out; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); err = -EOPNOTSUPP; @@ -432,8 +430,7 @@ group_extend_out: goto mext_out; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with bigalloc"); err = -EOPNOTSUPP; @@ -470,8 +467,7 @@ mext_out: goto group_add_out; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); err = -EOPNOTSUPP; @@ -553,8 +549,7 @@ group_add_out: int err = 0, err2 = 0; ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { + if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not (yet) supported with bigalloc"); return -EOPNOTSUPP; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 34b610ea5030..b4b3c1f91814 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -874,8 +874,10 @@ static int ext4_mb_init_cache(struct page *page, char *incore) bh[i] = NULL; continue; } - if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) { - err = -ENOMEM; + bh[i] = ext4_read_block_bitmap_nowait(sb, group); + if (IS_ERR(bh[i])) { + err = PTR_ERR(bh[i]); + bh[i] = NULL; goto out; } mb_debug(1, "read bitmap for group %u\n", group); @@ -883,8 +885,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* wait for I/O completion */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { - if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) - err = -EIO; + int err2; + + if (!bh[i]) + continue; + err2 = ext4_wait_block_bitmap(sb, group, bh[i]); + if (!err) + err = err2; } first_block = page->index * blocks_per_page; @@ -2333,7 +2340,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) } -static const struct file_operations ext4_mb_seq_groups_fops = { +const struct file_operations ext4_seq_mb_groups_fops = { .owner = THIS_MODULE, .open = ext4_mb_seq_groups_open, .read = seq_read, @@ -2447,7 +2454,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, kmalloc(sb->s_blocksize, GFP_NOFS); BUG_ON(meta_group_info[i]->bb_bitmap == NULL); bh = ext4_read_block_bitmap(sb, group); - BUG_ON(bh == NULL); + BUG_ON(IS_ERR_OR_NULL(bh)); memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, sb->s_blocksize); put_bh(bh); @@ -2661,10 +2668,6 @@ int ext4_mb_init(struct super_block *sb) if (ret != 0) goto out_free_locality_groups; - if (sbi->s_proc) - proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, - &ext4_mb_seq_groups_fops, sb); - return 0; out_free_locality_groups: @@ -2705,9 +2708,6 @@ int ext4_mb_release(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); - if (sbi->s_proc) - remove_proc_entry("mb_groups", sbi->s_proc); - if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { grinfo = ext4_get_group_info(sb, i); @@ -2896,10 +2896,12 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, sb = ac->ac_sb; sbi = EXT4_SB(sb); - err = -EIO; bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); - if (!bitmap_bh) + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + bitmap_bh = NULL; goto out_err; + } BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, bitmap_bh); @@ -3331,8 +3333,8 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, atomic_inc(&pa->pa_count); return pa; } - cur_distance = abs(goal_block - cpa->pa_pstart); - new_distance = abs(goal_block - pa->pa_pstart); + cur_distance = abs64(goal_block - cpa->pa_pstart); + new_distance = abs64(goal_block - pa->pa_pstart); if (cur_distance <= new_distance) return cpa; @@ -3843,8 +3845,10 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, return 0; bitmap_bh = ext4_read_block_bitmap(sb, group); - if (bitmap_bh == NULL) { - ext4_error(sb, "Error reading block bitmap for %u", group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + ext4_error(sb, "Error %d reading block bitmap for %u", + err, group); return 0; } @@ -4015,9 +4019,10 @@ repeat: } bitmap_bh = ext4_read_block_bitmap(sb, group); - if (bitmap_bh == NULL) { - ext4_error(sb, "Error reading block bitmap for %u", - group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + ext4_error(sb, "Error %d reading block bitmap for %u", + err, group); ext4_mb_unload_buddy(&e4b); continue; } @@ -4682,22 +4687,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, ext4_debug("freeing block %llu\n", block); trace_ext4_free_blocks(inode, block, count, flags); - if (flags & EXT4_FREE_BLOCKS_FORGET) { - struct buffer_head *tbh = bh; - int i; - - BUG_ON(bh && (count > 1)); + if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { + BUG_ON(count > 1); - for (i = 0; i < count; i++) { - cond_resched(); - if (!bh) - tbh = sb_find_get_block(inode->i_sb, - block + i); - if (!tbh) - continue; - ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, - inode, tbh, block + i); - } + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, bh, block); } /* @@ -4742,6 +4736,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, count += sbi->s_cluster_ratio - overflow; } + if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { + int i; + + for (i = 0; i < count; i++) { + cond_resched(); + bh = sb_find_get_block(inode->i_sb, block + i); + if (!bh) + continue; + ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, + inode, bh, block + i); + } + } + do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -4761,8 +4768,9 @@ do_more: } count_clusters = EXT4_NUM_B2C(sbi, count); bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) { - err = -EIO; + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + bitmap_bh = NULL; goto error_return; } gdp = ext4_get_group_desc(sb, block_group, &gd_bh); @@ -4931,8 +4939,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, } bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (!bitmap_bh) { - err = -EIO; + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); + bitmap_bh = NULL; goto error_return; } diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 6163ad21cb0e..a4651894cc33 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -448,8 +448,7 @@ int ext4_ext_migrate(struct inode *inode) * If the filesystem does not support extents, or the inode * already is extent-based, error out. */ - if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_INCOMPAT_EXTENTS) || + if (!ext4_has_feature_extents(inode->i_sb) || (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EINVAL; @@ -625,13 +624,11 @@ int ext4_ind_migrate(struct inode *inode) handle_t *handle; int ret; - if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_INCOMPAT_EXTENTS) || + if (!ext4_has_feature_extents(inode->i_sb) || (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EINVAL; - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + if (ext4_has_feature_bigalloc(inode->i_sb)) return -EOPNOTSUPP; /* diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 6eb1a619890c..0a512aa81bf7 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -98,10 +98,12 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, } mmp = (struct mmp_struct *)((*bh)->b_data); - if (le32_to_cpu(mmp->mmp_magic) == EXT4_MMP_MAGIC && - ext4_mmp_csum_verify(sb, mmp)) + if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC) + ret = -EFSCORRUPTED; + else if (!ext4_mmp_csum_verify(sb, mmp)) + ret = -EFSBADCRC; + else return 0; - ret = -EINVAL; warn_exit: ext4_warning(sb, "Error %d while reading MMP block %llu", diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 9f61e7679a6d..a969ab39f302 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -109,7 +109,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (!bh) { ext4_error_inode(inode, func, line, block, "Directory hole found"); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSCORRUPTED); } dirent = (struct ext4_dir_entry *) bh->b_data; /* Determine whether or not we have an index block */ @@ -124,7 +124,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (!is_dx_block && type == INDEX) { ext4_error_inode(inode, func, line, block, "directory leaf block found instead of index block"); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSCORRUPTED); } if (!ext4_has_metadata_csum(inode->i_sb) || buffer_verified(bh)) @@ -142,7 +142,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, ext4_error_inode(inode, func, line, block, "Directory index failed checksum"); brelse(bh); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSBADCRC); } } if (!is_dx_block) { @@ -152,7 +152,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, ext4_error_inode(inode, func, line, block, "Directory block failed checksum"); brelse(bh); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSBADCRC); } } return bh; @@ -1429,7 +1429,7 @@ restart: } num++; bh = ext4_getblk(NULL, dir, b++, 0); - if (unlikely(IS_ERR(bh))) { + if (IS_ERR(bh)) { if (ra_max == 0) { ret = bh; goto cleanup_and_exit; @@ -1570,19 +1570,19 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { EXT4_ERROR_INODE(dir, "bad inode number: %u", ino); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSCORRUPTED); } if (unlikely(ino == dir->i_ino)) { EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir", dentry); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSCORRUPTED); } inode = ext4_iget_normal(dir->i_sb, ino); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", ino); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSCORRUPTED); } if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || @@ -1619,7 +1619,7 @@ struct dentry *ext4_get_parent(struct dentry *child) if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) { EXT4_ERROR_INODE(d_inode(child), "bad parent inode number: %u", ino); - return ERR_PTR(-EIO); + return ERR_PTR(-EFSCORRUPTED); } return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino)); @@ -1807,7 +1807,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, buf, buf_size, offset)) { - res = -EIO; + res = -EFSCORRUPTED; goto return_result; } /* Provide crypto context and crypto buffer to ext4 match */ @@ -1967,7 +1967,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, if ((char *) de >= (((char *) root) + blocksize)) { EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); brelse(bh); - return -EIO; + return -EFSCORRUPTED; } len = ((char *) root) + (blocksize - csum_size) - (char *) de; @@ -2118,7 +2118,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, goto out; if (blocks == 1 && !dx_fallback && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + ext4_has_feature_dir_index(sb)) { retval = make_indexed_dir(handle, &fname, dentry, inode, bh); bh = NULL; /* make_indexed_dir releases bh */ @@ -2315,7 +2315,7 @@ int ext4_generic_delete_entry(handle_t *handle, while (i < buf_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, i)) - return -EIO; + return -EFSCORRUPTED; if (de == de_del) { if (pde) pde->rec_len = ext4_rec_len_to_disk( @@ -2388,8 +2388,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode) /* limit is 16-bit i_links_count */ if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) { set_nlink(inode, 1); - EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_DIR_NLINK); + ext4_set_feature_dir_nlink(inode->i_sb); } } } @@ -2469,9 +2468,6 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err, credits, retries = 0; - if (!new_valid_dev(rdev)) - return -EINVAL; - err = dquot_initialize(dir); if (err) return err; @@ -2934,7 +2930,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) inode = d_inode(dentry); - retval = -EIO; + retval = -EFSCORRUPTED; if (le32_to_cpu(de->inode) != inode->i_ino) goto end_rmdir; @@ -3008,7 +3004,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) inode = d_inode(dentry); - retval = -EIO; + retval = -EFSCORRUPTED; if (le32_to_cpu(de->inode) != inode->i_ino) goto end_unlink; @@ -3310,7 +3306,7 @@ static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) if (!ent->dir_bh) return retval; if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) - return -EIO; + return -EFSCORRUPTED; BUFFER_TRACE(ent->dir_bh, "get_write_access"); return ext4_journal_get_write_access(handle, ent->dir_bh); } @@ -3352,8 +3348,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, if (retval) return retval; ent->de->inode = cpu_to_le32(ino); - if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb, - EXT4_FEATURE_INCOMPAT_FILETYPE)) + if (ext4_has_feature_filetype(ent->dir->i_sb)) ent->de->file_type = file_type; ent->dir->i_version++; ent->dir->i_ctime = ent->dir->i_mtime = diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 84ba4d2b3a35..17fbe3882b8e 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -425,6 +425,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, struct buffer_head *bh, *head; int ret = 0; int nr_submitted = 0; + int nr_to_submit = 0; blocksize = 1 << inode->i_blkbits; @@ -477,11 +478,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io, unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); } set_buffer_async_write(bh); + nr_to_submit++; } while ((bh = bh->b_this_page) != head); bh = head = page_buffers(page); - if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) && + nr_to_submit) { data_page = ext4_encrypt(inode, page); if (IS_ERR(data_page)) { ret = PTR_ERR(data_page); diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 1061611ae14d..5dc5e95063de 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -62,7 +62,7 @@ static void completion_pages(struct work_struct *work) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - int ret = ext4_decrypt(ctx, page); + int ret = ext4_decrypt(page); if (ret) { WARN_ON_ONCE(1); SetPageError(page); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index cf0c472047e3..ad62d7acc315 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -490,7 +490,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb, group_data[0].group != sbi->s_groups_count); reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks); - meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + meta_bg = ext4_has_feature_meta_bg(sb); /* This transaction may be extended/restarted along the way */ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA); @@ -680,8 +680,7 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three, int mult = 3; unsigned ret; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + if (!ext4_has_feature_sparse_super(sb)) { ret = *min; *min += 1; return ret; @@ -1040,7 +1039,7 @@ exit_free: * do not copy the full number of backups at this time. The resize * which changed s_groups_count will backup again. */ -static void update_backups(struct super_block *sb, int blk_off, char *data, +static void update_backups(struct super_block *sb, sector_t blk_off, char *data, int size, int meta_bg) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1065,7 +1064,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data, group = ext4_list_backups(sb, &three, &five, &seven); last = sbi->s_groups_count; } else { - group = ext4_meta_bg_first_group(sb, group) + 1; + group = ext4_get_group_number(sb, blk_off) + 1; last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2); } @@ -1158,7 +1157,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, int i, gdb_off, gdb_num, err = 0; int meta_bg; - meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + meta_bg = ext4_has_feature_meta_bg(sb); for (i = 0; i < count; i++, group++) { int reserved_gdb = ext4_bg_has_super(sb, group) ? le16_to_cpu(es->s_reserved_gdt_blocks) : 0; @@ -1381,9 +1380,7 @@ static void ext4_update_super(struct super_block *sb, ext4_debug("free blocks count %llu", percpu_counter_read(&sbi->s_freeclusters_counter)); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_FLEX_BG) && - sbi->s_log_groups_per_flex) { + if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) { ext4_group_t flex_group; flex_group = ext4_flex_group(sbi, group_data[0].group); atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), @@ -1476,8 +1473,7 @@ exit_journal: int gdb_num = group / EXT4_DESC_PER_BLOCK(sb); int gdb_num_end = ((group + flex_gd->count - 1) / EXT4_DESC_PER_BLOCK(sb)); - int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_META_BG); + int meta_bg = ext4_has_feature_meta_bg(sb); sector_t old_gdb = 0; update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, @@ -1585,8 +1581,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); - if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + if (gdb_off == 0 && !ext4_has_feature_sparse_super(sb)) { ext4_warning(sb, "Can't resize non-sparse filesystem further"); return -EPERM; } @@ -1604,9 +1599,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } if (reserved_gdb || gdb_off == 0) { - if (!EXT4_HAS_COMPAT_FEATURE(sb, - EXT4_FEATURE_COMPAT_RESIZE_INODE) - || !le16_to_cpu(es->s_reserved_gdt_blocks)) { + if (ext4_has_feature_resize_inode(sb) || + !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext4_warning(sb, "No reserved GDT blocks, can't resize"); return -EPERM; @@ -1825,8 +1819,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) if (err) goto errout; - EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE); - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + ext4_clear_feature_resize_inode(sb); + ext4_set_feature_meta_bg(sb); sbi->s_es->s_first_meta_bg = cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count)); @@ -1918,9 +1912,9 @@ retry: n_desc_blocks = num_desc_blocks(sb, n_group + 1); o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count); - meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG); + meta_bg = ext4_has_feature_meta_bg(sb); - if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) { + if (ext4_has_feature_resize_inode(sb)) { if (meta_bg) { ext4_error(sb, "resize_inode and meta_bg enabled " "simultaneously"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 49f6c78ee3af..753f4e68b820 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -34,7 +34,6 @@ #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> -#include <linux/proc_fs.h> #include <linux/ctype.h> #include <linux/log2.h> #include <linux/crc16.h> @@ -54,11 +53,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> -static struct proc_dir_entry *ext4_proc_root; -static struct kset *ext4_kset; static struct ext4_lazy_init *ext4_li_info; static struct mutex ext4_li_mtx; -static struct ext4_features *ext4_feat; static int ext4_mballoc_ready; static struct ratelimit_state ext4_mount_msg_ratelimit; @@ -83,7 +79,6 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); -static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t); #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { @@ -115,8 +110,7 @@ MODULE_ALIAS("ext3"); static int ext4_verify_csum_type(struct super_block *sb, struct ext4_super_block *es) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; return es->s_checksum_type == EXT4_CRC32C_CHKSUM; @@ -394,9 +388,13 @@ static void ext4_handle_error(struct super_block *sb) smp_wmb(); sb->s_flags |= MS_RDONLY; } - if (test_opt(sb, ERRORS_PANIC)) + if (test_opt(sb, ERRORS_PANIC)) { + if (EXT4_SB(sb)->s_journal && + !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) + return; panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); + } } #define ext4_error_ratelimit(sb) \ @@ -495,6 +493,12 @@ const char *ext4_decode_error(struct super_block *sb, int errno, char *errstr = NULL; switch (errno) { + case -EFSCORRUPTED: + errstr = "Corrupt filesystem"; + break; + case -EFSBADCRC: + errstr = "Filesystem failed CRC"; + break; case -EIO: errstr = "IO failure"; break; @@ -585,8 +589,12 @@ void __ext4_abort(struct super_block *sb, const char *function, jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); save_error_info(sb, function, line); } - if (test_opt(sb, ERRORS_PANIC)) + if (test_opt(sb, ERRORS_PANIC)) { + if (EXT4_SB(sb)->s_journal && + !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) + return; panic("EXT4-fs panic from previous error\n"); + } } void __ext4_msg(struct super_block *sb, @@ -800,6 +808,7 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, "Couldn't clean up the journal"); } + ext4_unregister_sysfs(sb); ext4_es_unregister_shrinker(sbi); del_timer_sync(&sbi->s_err_report); ext4_release_system_zone(sb); @@ -808,18 +817,12 @@ static void ext4_put_super(struct super_block *sb) ext4_xattr_put_super(sb); if (!(sb->s_flags & MS_RDONLY)) { - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_clear_feature_journal_needs_recovery(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } if (!(sb->s_flags & MS_RDONLY)) ext4_commit_super(sb, 1); - if (sbi->s_proc) { - remove_proc_entry("options", sbi->s_proc); - remove_proc_entry(sb->s_id, ext4_proc_root); - } - kobject_del(&sbi->s_kobj); - for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kvfree(sbi->s_group_desc); @@ -1288,7 +1291,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "quota options when quota turned on"); return -1; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) { + if (ext4_has_feature_quota(sb)) { ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " "when QUOTA feature is enabled"); return -1; @@ -1381,10 +1384,10 @@ static const struct mount_opts { {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, - MOPT_EXT4_ONLY | MOPT_SET}, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | EXT4_MOUNT_JOURNAL_CHECKSUM), - MOPT_EXT4_ONLY | MOPT_SET}, + MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, @@ -1513,8 +1516,14 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; if (args->from && (m->flags & MOPT_GTE0) && (arg < 0)) return -1; - if (m->flags & MOPT_EXPLICIT) - set_opt2(sb, EXPLICIT_DELALLOC); + if (m->flags & MOPT_EXPLICIT) { + if (m->mount_opt & EXT4_MOUNT_DELALLOC) { + set_opt2(sb, EXPLICIT_DELALLOC); + } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) { + set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM); + } else + return -1; + } if (m->flags & MOPT_CLEAR_ERR) clear_opt(sb, ERRORS_MASK); if (token == Opt_noquota && sb_any_quota_loaded(sb)) { @@ -1647,8 +1656,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, "quota options when quota turned on"); return -1; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_QUOTA)) { + if (ext4_has_feature_quota(sb)) { ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options " "when QUOTA feature is enabled"); @@ -1707,7 +1715,7 @@ static int parse_options(char *options, struct super_block *sb, return 0; } #ifdef CONFIG_QUOTA - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && + if (ext4_has_feature_quota(sb) && (test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) { ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA " "feature is enabled"); @@ -1880,7 +1888,7 @@ static int ext4_show_options(struct seq_file *seq, struct dentry *root) return _ext4_show_options(seq, root->d_sb, 0); } -static int options_seq_show(struct seq_file *seq, void *offset) +int ext4_seq_options_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; int rc; @@ -1891,19 +1899,6 @@ static int options_seq_show(struct seq_file *seq, void *offset) return rc; } -static int options_open_fs(struct inode *inode, struct file *file) -{ - return single_open(file, options_seq_show, PDE_DATA(inode)); -} - -static const struct file_operations ext4_seq_options_fops = { - .owner = THIS_MODULE, - .open = options_open_fs, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { @@ -1944,7 +1939,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, es->s_mtime = cpu_to_le32(get_seconds()); ext4_update_dynamic_rev(sb); if (sbi->s_journal) - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_set_feature_journal_needs_recovery(sb); ext4_commit_super(sb, 1); done: @@ -2027,12 +2022,13 @@ failed: return 0; } -static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, +static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { int offset; __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); + struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_has_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ @@ -2052,8 +2048,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, } /* old crc16 code */ - if (!(sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM))) + if (!ext4_has_feature_gdt_csum(sb)) return 0; offset = offsetof(struct ext4_group_desc, bg_checksum); @@ -2063,8 +2058,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, crc = crc16(crc, (__u8 *)gdp, offset); offset += sizeof(gdp->bg_checksum); /* skip checksum */ /* for checksum of struct ext4_group_desc do the rest...*/ - if ((sbi->s_es->s_feature_incompat & - cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) && + if (ext4_has_feature_64bit(sb) && offset < le16_to_cpu(sbi->s_es->s_desc_size)) crc = crc16(crc, (__u8 *)gdp + offset, le16_to_cpu(sbi->s_es->s_desc_size) - @@ -2078,8 +2072,7 @@ int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group, struct ext4_group_desc *gdp) { if (ext4_has_group_desc_csum(sb) && - (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb), - block_group, gdp))) + (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp))) return 0; return 1; @@ -2090,7 +2083,7 @@ void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group, { if (!ext4_has_group_desc_csum(sb)) return; - gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp); + gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp); } /* Called at mount-time, super-block is locked */ @@ -2106,7 +2099,7 @@ static int ext4_check_descriptors(struct super_block *sb, int flexbg_flag = 0; ext4_group_t i, grp = sbi->s_groups_count; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (ext4_has_feature_flex_bg(sb)) flexbg_flag = 1; ext4_debug("Checking group descriptors"); @@ -2150,7 +2143,7 @@ static int ext4_check_descriptors(struct super_block *sb, if (!ext4_group_desc_csum_verify(sb, i, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)", - i, le16_to_cpu(ext4_group_desc_csum(sbi, i, + i, le16_to_cpu(ext4_group_desc_csum(sb, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!(sb->s_flags & MS_RDONLY)) { ext4_unlock_group(sb, i); @@ -2413,8 +2406,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb, first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) || - nr < first_meta_bg) + if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg) return logical_sb_block + nr + 1; bg = sbi->s_desc_per_block * nr; if (ext4_bg_has_super(sb, bg)) @@ -2470,335 +2462,6 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) return ret; } -/* sysfs supprt */ - -struct ext4_attr { - struct attribute attr; - ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); - ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, - const char *, size_t); - union { - int offset; - int deprecated_val; - } u; -}; - -static int parse_strtoull(const char *buf, - unsigned long long max, unsigned long long *value) -{ - int ret; - - ret = kstrtoull(skip_spaces(buf), 0, value); - if (!ret && *value > max) - ret = -EINVAL; - return ret; -} - -static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, - char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (s64) EXT4_C2B(sbi, - percpu_counter_sum(&sbi->s_dirtyclusters_counter))); -} - -static ssize_t session_write_kbytes_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->s_buddy_cache->i_sb; - - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); - return snprintf(buf, PAGE_SIZE, "%lu\n", - (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - sbi->s_sectors_written_start) >> 1); -} - -static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->s_buddy_cache->i_sb; - - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(sbi->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1))); -} - -static ssize_t inode_readahead_blks_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned long t; - int ret; - - ret = kstrtoul(skip_spaces(buf), 0, &t); - if (ret) - return ret; - - if (t && (!is_power_of_2(t) || t > 0x40000000)) - return -EINVAL; - - sbi->s_inode_readahead_blks = t; - return count; -} - -static ssize_t sbi_ui_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); - - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); -} - -static ssize_t sbi_ui_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset); - unsigned long t; - int ret; - - ret = kstrtoul(skip_spaces(buf), 0, &t); - if (ret) - return ret; - *ui = t; - return count; -} - -static ssize_t es_ui_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - - unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) + - a->u.offset); - - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); -} - -static ssize_t reserved_clusters_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long) atomic64_read(&sbi->s_resv_clusters)); -} - -static ssize_t reserved_clusters_store(struct ext4_attr *a, - struct ext4_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned long long val; - int ret; - - if (parse_strtoull(buf, -1ULL, &val)) - return -EINVAL; - ret = ext4_reserve_clusters(sbi, val); - - return ret ? ret : count; -} - -static ssize_t trigger_test_error(struct ext4_attr *a, - struct ext4_sb_info *sbi, - const char *buf, size_t count) -{ - int len = count; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (len && buf[len-1] == '\n') - len--; - - if (len) - ext4_error(sbi->s_sb, "%.*s", len, buf); - return count; -} - -static ssize_t sbi_deprecated_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val); -} - -#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ -static struct ext4_attr ext4_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .u = { \ - .offset = offsetof(struct ext4_sb_info, _elname),\ - }, \ -} - -#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \ -static struct ext4_attr ext4_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .u = { \ - .offset = offsetof(struct ext4_super_block, _elname), \ - }, \ -} - -#define EXT4_ATTR(name, mode, show, store) \ -static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) - -#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) -#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) -#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) - -#define EXT4_RO_ATTR_ES_UI(name, elname) \ - EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname) -#define EXT4_RW_ATTR_SBI_UI(name, elname) \ - EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) - -#define ATTR_LIST(name) &ext4_attr_##name.attr -#define EXT4_DEPRECATED_ATTR(_name, _val) \ -static struct ext4_attr ext4_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = 0444 }, \ - .show = sbi_deprecated_show, \ - .u = { \ - .deprecated_val = _val, \ - }, \ -} - -EXT4_RO_ATTR(delayed_allocation_blocks); -EXT4_RO_ATTR(session_write_kbytes); -EXT4_RO_ATTR(lifetime_write_kbytes); -EXT4_RW_ATTR(reserved_clusters); -EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, - inode_readahead_blks_store, s_inode_readahead_blks); -EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); -EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); -EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); -EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); -EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); -EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); -EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); -EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128); -EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); -EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); -EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); -EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); -EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); -EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); - -static struct attribute *ext4_attrs[] = { - ATTR_LIST(delayed_allocation_blocks), - ATTR_LIST(session_write_kbytes), - ATTR_LIST(lifetime_write_kbytes), - ATTR_LIST(reserved_clusters), - ATTR_LIST(inode_readahead_blks), - ATTR_LIST(inode_goal), - ATTR_LIST(mb_stats), - ATTR_LIST(mb_max_to_scan), - ATTR_LIST(mb_min_to_scan), - ATTR_LIST(mb_order2_req), - ATTR_LIST(mb_stream_req), - ATTR_LIST(mb_group_prealloc), - ATTR_LIST(max_writeback_mb_bump), - ATTR_LIST(extent_max_zeroout_kb), - ATTR_LIST(trigger_fs_error), - ATTR_LIST(err_ratelimit_interval_ms), - ATTR_LIST(err_ratelimit_burst), - ATTR_LIST(warning_ratelimit_interval_ms), - ATTR_LIST(warning_ratelimit_burst), - ATTR_LIST(msg_ratelimit_interval_ms), - ATTR_LIST(msg_ratelimit_burst), - ATTR_LIST(errors_count), - ATTR_LIST(first_error_time), - ATTR_LIST(last_error_time), - NULL, -}; - -/* Features this copy of ext4 supports */ -EXT4_INFO_ATTR(lazy_itable_init); -EXT4_INFO_ATTR(batched_discard); -EXT4_INFO_ATTR(meta_bg_resize); -EXT4_INFO_ATTR(encryption); - -static struct attribute *ext4_feat_attrs[] = { - ATTR_LIST(lazy_itable_init), - ATTR_LIST(batched_discard), - ATTR_LIST(meta_bg_resize), - ATTR_LIST(encryption), - NULL, -}; - -static ssize_t ext4_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - - return a->show ? a->show(a, sbi, buf) : 0; -} - -static ssize_t ext4_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - - return a->store ? a->store(a, sbi, buf, len) : 0; -} - -static void ext4_sb_release(struct kobject *kobj) -{ - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - complete(&sbi->s_kobj_unregister); -} - -static const struct sysfs_ops ext4_attr_ops = { - .show = ext4_attr_show, - .store = ext4_attr_store, -}; - -static struct kobj_type ext4_ktype = { - .default_attrs = ext4_attrs, - .sysfs_ops = &ext4_attr_ops, - .release = ext4_sb_release, -}; - -static void ext4_feat_release(struct kobject *kobj) -{ - complete(&ext4_feat->f_kobj_unregister); -} - -static ssize_t ext4_feat_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "supported\n"); -} - -/* - * We can not use ext4_attr_show/store because it relies on the kobject - * being embedded in the ext4_sb_info structure which is definitely not - * true in this case. - */ -static const struct sysfs_ops ext4_feat_ops = { - .show = ext4_feat_show, - .store = NULL, -}; - -static struct kobj_type ext4_feat_ktype = { - .default_attrs = ext4_feat_attrs, - .sysfs_ops = &ext4_feat_ops, - .release = ext4_feat_release, -}; - /* * Check whether this filesystem can be mounted based on * the features present and the RDONLY/RDWR mount requested. @@ -2807,7 +2470,7 @@ static struct kobj_type ext4_feat_ktype = { */ static int ext4_feature_set_ok(struct super_block *sb, int readonly) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { + if (ext4_has_unknown_ext4_incompat_features(sb)) { ext4_msg(sb, KERN_ERR, "Couldn't mount because of " "unsupported optional features (%x)", @@ -2819,14 +2482,14 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) if (readonly) return 1; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) { + if (ext4_has_feature_readonly(sb)) { ext4_msg(sb, KERN_INFO, "filesystem is read-only"); sb->s_flags |= MS_RDONLY; return 1; } /* Check that feature set is OK for a read-write mount */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { + if (ext4_has_unknown_ext4_ro_compat_features(sb)) { ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & @@ -2837,7 +2500,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) * Large file size enabled file system can only be mounted * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + if (ext4_has_feature_huge_file(sb)) { if (sizeof(blkcnt_t) < sizeof(u64)) { ext4_msg(sb, KERN_ERR, "Filesystem with huge files " "cannot be mounted RDWR without " @@ -2845,8 +2508,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) return 0; } } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) && - !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) { ext4_msg(sb, KERN_ERR, "Can't support bigalloc feature without " "extents feature\n"); @@ -2854,8 +2516,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) } #ifndef CONFIG_QUOTA - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && - !readonly) { + if (ext4_has_feature_quota(sb) && !readonly) { ext4_msg(sb, KERN_ERR, "Filesystem with quota feature cannot be mounted RDWR " "without CONFIG_QUOTA"); @@ -3312,7 +2973,7 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_group_t i, ngroups = ext4_get_groups_count(sb); int s, j, count = 0; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) + if (!ext4_has_feature_bigalloc(sb)) return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) + sbi->s_itb_per_group + 2); @@ -3403,10 +3064,10 @@ int ext4_calculate_overhead(struct super_block *sb) return 0; } - -static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) +static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; + struct ext4_sb_info *sbi = EXT4_SB(sb); /* * There's no need to reserve anything when we aren't using extents. @@ -3414,8 +3075,8 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) * hole punching doesn't need new metadata... This is needed especially * to keep ext2/3 backward compatibility. */ - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) - return 0; + if (!ext4_has_feature_extents(sb)) + return; /* * By default we reserve 2% or 4096 clusters, whichever is smaller. * This should cover the situations where we can not afford to run @@ -3424,26 +3085,13 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb) * allocation would require 1, or 2 blocks, higher numbers are * very rare. */ - resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >> - EXT4_SB(sb)->s_cluster_bits; + resv_clusters = (ext4_blocks_count(sbi->s_es) >> + sbi->s_cluster_bits); do_div(resv_clusters, 50); resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096); - return resv_clusters; -} - - -static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count) -{ - ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >> - sbi->s_cluster_bits; - - if (count >= clusters) - return -EINVAL; - - atomic64_set(&sbi->s_resv_clusters, count); - return 0; + atomic64_set(&sbi->s_resv_clusters, resv_clusters); } static int ext4_fill_super(struct super_block *sb, void *data, int silent) @@ -3526,9 +3174,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written); /* Warn if metadata_csum and gdt_csum are both set. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && - EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + if (ext4_has_feature_metadata_csum(sb) && + ext4_has_feature_gdt_csum(sb)) ext4_warning(sb, "metadata_csum and uninit_bg are " "redundant flags; please run fsck."); @@ -3541,8 +3188,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } /* Load the checksum driver */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + if (ext4_has_feature_metadata_csum(sb)) { sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(sbi->s_chksum_driver)) { ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver."); @@ -3557,11 +3203,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with " "invalid superblock checksum. Run e2fsck?"); silent = 1; + ret = -EFSBADCRC; goto cantfind_ext4; } /* Precompute checksum seed for all metadata */ - if (ext4_has_metadata_csum(sb)) + if (ext4_has_feature_csum_seed(sb)) + sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); + else if (ext4_has_metadata_csum(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); @@ -3664,17 +3313,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && - (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || - EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || - EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) + (ext4_has_compat_features(sb) || + ext4_has_ro_compat_features(sb) || + ext4_has_incompat_features(sb))) ext4_msg(sb, KERN_WARNING, "feature flags set on rev 0 fs, " "running e2fsck is recommended"); if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) { set_opt2(sb, HURD_COMPAT); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_64BIT)) { + if (ext4_has_feature_64bit(sb)) { ext4_msg(sb, KERN_ERR, "The Hurd can't support 64-bit file systems"); goto failed_mount; @@ -3732,8 +3380,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) && - es->s_encryption_level) { + if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", es->s_encryption_level); goto failed_mount; @@ -3765,8 +3412,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } - has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + has_huge_files = ext4_has_feature_huge_file(sb); sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); @@ -3790,7 +3436,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_desc_size = le16_to_cpu(es->s_desc_size); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { + if (ext4_has_feature_64bit(sb)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || sbi->s_desc_size > EXT4_MAX_DESC_SIZE || !is_power_of_2(sbi->s_desc_size)) { @@ -3821,7 +3467,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; - if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + if (ext4_has_feature_dir_index(sb)) { i = le32_to_cpu(es->s_flags); if (i & EXT2_FLAGS_UNSIGNED_HASH) sbi->s_hash_unsigned = 3; @@ -3841,8 +3487,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Handle clustersize */ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); - has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_BIGALLOC); + has_bigalloc = ext4_has_feature_bigalloc(sb); if (has_bigalloc) { if (clustersize < blocksize) { ext4_msg(sb, KERN_ERR, @@ -3961,13 +3606,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (ext4_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); - - if (sbi->s_proc) - proc_create_data("options", S_IRUGO, sbi->s_proc, - &ext4_seq_options_fops, sb); - bgl_lock_init(sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { @@ -3982,6 +3620,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); + ret = -EFSCORRUPTED; goto failed_mount2; } @@ -4007,7 +3646,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) + if (ext4_has_feature_quota(sb)) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; @@ -4021,11 +3660,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || - EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_RECOVER)); + ext4_has_feature_journal_needs_recovery(sb)); - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) && - !(sb->s_flags & MS_RDONLY)) + if (ext4_has_feature_mmp(sb) && !(sb->s_flags & MS_RDONLY)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) goto failed_mount3a; @@ -4033,23 +3670,47 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * The first inode we look at is the journal inode. Don't try * root first: it may be modified in the journal! */ - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { if (ext4_load_journal(sb, es, journal_devnum)) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && - EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + ext4_has_feature_journal_needs_recovery(sb)) { ext4_msg(sb, KERN_ERR, "required journal recovery " "suppressed and not mounted read-only"); goto failed_mount_wq; } else { + /* Nojournal mode, all journal mount options are illegal */ + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_checksum, fs mounted w/o journal"); + goto failed_mount_wq; + } + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit, fs mounted w/o journal"); + goto failed_mount_wq; + } + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "commit=%lu, fs mounted w/o journal", + sbi->s_commit_interval / HZ); + goto failed_mount_wq; + } + if (EXT4_MOUNT_DATA_FLAGS & + (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "data=, fs mounted w/o journal"); + goto failed_mount_wq; + } + sbi->s_def_mount_opt &= EXT4_MOUNT_JOURNAL_CHECKSUM; + clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); sbi->s_journal = NULL; needs_recovery = 0; goto no_journal; } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) && + if (ext4_has_feature_64bit(sb) && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); @@ -4101,18 +3762,16 @@ no_journal: } } - if ((DUMMY_ENCRYPTION_ENABLED(sbi) || - EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) && + if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) && (blocksize != PAGE_CACHE_SIZE)) { ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs encryption"); goto failed_mount_wq; } - if (DUMMY_ENCRYPTION_ENABLED(sbi) && - !(sb->s_flags & MS_RDONLY) && - !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) { - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); + if (DUMMY_ENCRYPTION_ENABLED(sbi) && !(sb->s_flags & MS_RDONLY) && + !ext4_has_feature_encrypt(sb)) { + ext4_set_feature_encrypt(sb); ext4_commit_super(sb, 1); } @@ -4171,8 +3830,7 @@ no_journal: if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) { + if (ext4_has_feature_extra_isize(sb)) { if (sbi->s_want_extra_isize < le16_to_cpu(es->s_want_extra_isize)) sbi->s_want_extra_isize = @@ -4192,12 +3850,7 @@ no_journal: "available"); } - err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb)); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " - "reserved pool", ext4_calculate_resv_clusters(sb)); - goto failed_mount4a; - } + ext4_set_resv_clusters(sb); err = ext4_setup_system_zone(sb); if (err) { @@ -4236,7 +3889,7 @@ no_journal: goto failed_mount6; } - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) + if (ext4_has_feature_flex_bg(sb)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " @@ -4248,17 +3901,13 @@ no_journal: if (err) goto failed_mount6; - sbi->s_kobj.kset = ext4_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL, - "%s", sb->s_id); + err = ext4_register_sysfs(sb); if (err) goto failed_mount7; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) && - !(sb->s_flags & MS_RDONLY)) { + if (ext4_has_feature_quota(sb) && !(sb->s_flags & MS_RDONLY)) { err = ext4_enable_quotas(sb); if (err) goto failed_mount8; @@ -4313,7 +3962,7 @@ cantfind_ext4: #ifdef CONFIG_QUOTA failed_mount8: - kobject_del(&sbi->s_kobj); + ext4_unregister_sysfs(sb); #endif failed_mount7: ext4_unregister_li_request(sb); @@ -4353,10 +4002,6 @@ failed_mount2: failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); - if (sbi->s_proc) { - remove_proc_entry("options", sbi->s_proc); - remove_proc_entry(sb->s_id, ext4_proc_root); - } #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); @@ -4403,7 +4048,7 @@ static journal_t *ext4_get_journal(struct super_block *sb, struct inode *journal_inode; journal_t *journal; - BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + BUG_ON(!ext4_has_feature_journal(sb)); /* First, test for the existence of a valid inode on disk. Bad * things happen if we iget() an unused inode, as the subsequent @@ -4453,7 +4098,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, struct ext4_super_block *es; struct block_device *bdev; - BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + BUG_ON(!ext4_has_feature_journal(sb)); bdev = ext4_blkdev_get(j_dev, sb); if (bdev == NULL) @@ -4545,7 +4190,7 @@ static int ext4_load_journal(struct super_block *sb, int err = 0; int really_read_only; - BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + BUG_ON(!ext4_has_feature_journal(sb)); if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { @@ -4562,7 +4207,7 @@ static int ext4_load_journal(struct super_block *sb, * crash? For recovery, we need to check in advance whether we * can get read-write access to the device. */ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + if (ext4_has_feature_journal_needs_recovery(sb)) { if (sb->s_flags & MS_RDONLY) { ext4_msg(sb, KERN_INFO, "INFO: recovery " "required on readonly filesystem"); @@ -4593,7 +4238,7 @@ static int ext4_load_journal(struct super_block *sb, if (!(journal->j_flags & JBD2_BARRIER)) ext4_msg(sb, KERN_INFO, "barriers disabled"); - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) + if (!ext4_has_feature_journal_needs_recovery(sb)) err = jbd2_journal_wipe(journal, !really_read_only); if (!err) { char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); @@ -4707,7 +4352,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb, { journal_t *journal = EXT4_SB(sb)->s_journal; - if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + if (!ext4_has_feature_journal(sb)) { BUG_ON(journal != NULL); return; } @@ -4715,9 +4360,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb, if (jbd2_journal_flush(journal) < 0) goto out; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && + if (ext4_has_feature_journal_needs_recovery(sb) && sb->s_flags & MS_RDONLY) { - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_clear_feature_journal_needs_recovery(sb); ext4_commit_super(sb, 1); } @@ -4737,7 +4382,7 @@ static void ext4_clear_journal_err(struct super_block *sb, int j_errno; const char *errstr; - BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + BUG_ON(!ext4_has_feature_journal(sb)); journal = EXT4_SB(sb)->s_journal; @@ -4852,7 +4497,7 @@ static int ext4_freeze(struct super_block *sb) goto out; /* Journal blocked and flushed, clear needs_recovery flag. */ - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_clear_feature_journal_needs_recovery(sb); } error = ext4_commit_super(sb, 1); @@ -4874,7 +4519,7 @@ static int ext4_unfreeze(struct super_block *sb) if (EXT4_SB(sb)->s_journal) { /* Reset the needs_recovery flag before the fs is unlocked. */ - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_set_feature_journal_needs_recovery(sb); } ext4_commit_super(sb, 1); @@ -5027,8 +4672,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_mark_recovery_complete(sb, es); } else { /* Make sure we can mount this feature set readwrite */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_READONLY) || + if (ext4_has_feature_readonly(sb) || !ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; @@ -5044,9 +4688,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (!ext4_group_desc_csum_verify(sb, g, gdp)) { ext4_msg(sb, KERN_ERR, "ext4_remount: Checksum for group %u failed (%u!=%u)", - g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), + g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)), le16_to_cpu(gdp->bg_checksum)); - err = -EINVAL; + err = -EFSBADCRC; goto restore_opts; } } @@ -5076,8 +4720,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sbi->s_mount_state = le16_to_cpu(es->s_state); if (!ext4_setup_super(sb, es, 0)) sb->s_flags &= ~MS_RDONLY; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_MMP)) + if (ext4_has_feature_mmp(sb)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) { err = -EROFS; @@ -5110,8 +4753,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (enable_quota) { if (sb_any_quota_suspended(sb)) dquot_resume(sb, -1); - else if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_QUOTA)) { + else if (ext4_has_feature_quota(sb)) { err = ext4_enable_quotas(sb); if (err) goto restore_opts; @@ -5255,7 +4897,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot) struct ext4_sb_info *sbi = EXT4_SB(sb); /* Are we journaling quotas? */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) || + if (ext4_has_feature_quota(sb) || sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) { dquot_mark_dquot_dirty(dquot); return ext4_write_dquot(dquot); @@ -5343,7 +4985,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) }; - BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)); + BUG_ON(!ext4_has_feature_quota(sb)); if (!qf_inums[type]) return -EPERM; @@ -5537,11 +5179,11 @@ static inline void unregister_as_ext2(void) static inline int ext2_feature_set_ok(struct super_block *sb) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP)) + if (ext4_has_unknown_ext2_incompat_features(sb)) return 0; if (sb->s_flags & MS_RDONLY) return 1; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP)) + if (ext4_has_unknown_ext2_ro_compat_features(sb)) return 0; return 1; } @@ -5566,13 +5208,13 @@ static inline void unregister_as_ext3(void) static inline int ext3_feature_set_ok(struct super_block *sb) { - if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP)) + if (ext4_has_unknown_ext3_incompat_features(sb)) return 0; - if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + if (!ext4_has_feature_journal(sb)) return 0; if (sb->s_flags & MS_RDONLY) return 1; - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) + if (ext4_has_unknown_ext3_ro_compat_features(sb)) return 0; return 1; } @@ -5586,37 +5228,6 @@ static struct file_system_type ext4_fs_type = { }; MODULE_ALIAS_FS("ext4"); -static int __init ext4_init_feat_adverts(void) -{ - struct ext4_features *ef; - int ret = -ENOMEM; - - ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL); - if (!ef) - goto out; - - ef->f_kobj.kset = ext4_kset; - init_completion(&ef->f_kobj_unregister); - ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL, - "features"); - if (ret) { - kfree(ef); - goto out; - } - - ext4_feat = ef; - ret = 0; -out: - return ret; -} - -static void ext4_exit_feat_adverts(void) -{ - kobject_put(&ext4_feat->f_kobj); - wait_for_completion(&ext4_feat->f_kobj_unregister); - kfree(ext4_feat); -} - /* Shared across all ext4 file systems */ wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; @@ -5643,21 +5254,15 @@ static int __init ext4_init_fs(void) err = ext4_init_pageio(); if (err) - goto out7; + goto out5; err = ext4_init_system_zone(); if (err) - goto out6; - ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); - if (!ext4_kset) { - err = -ENOMEM; - goto out5; - } - ext4_proc_root = proc_mkdir("fs/ext4", NULL); + goto out4; - err = ext4_init_feat_adverts(); + err = ext4_init_sysfs(); if (err) - goto out4; + goto out3; err = ext4_init_mballoc(); if (err) @@ -5682,16 +5287,12 @@ out1: ext4_mballoc_ready = 0; ext4_exit_mballoc(); out2: - ext4_exit_feat_adverts(); -out4: - if (ext4_proc_root) - remove_proc_entry("fs/ext4", NULL); - kset_unregister(ext4_kset); -out5: + ext4_exit_sysfs(); +out3: ext4_exit_system_zone(); -out6: +out4: ext4_exit_pageio(); -out7: +out5: ext4_exit_es(); return err; @@ -5706,9 +5307,7 @@ static void __exit ext4_exit_fs(void) unregister_filesystem(&ext4_fs_type); destroy_inodecache(); ext4_exit_mballoc(); - ext4_exit_feat_adverts(); - remove_proc_entry("fs/ext4", NULL); - kset_unregister(ext4_kset); + ext4_exit_sysfs(); ext4_exit_system_zone(); ext4_exit_pageio(); ext4_exit_es(); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index c677f2c1044b..abe2401ce405 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -57,7 +57,7 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook sizeof(struct ext4_encrypted_symlink_data) - 1) > max_size) { /* Symlink data on the disk is corrupted */ - res = -EIO; + res = -EFSCORRUPTED; goto errout; } plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ? diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c new file mode 100644 index 000000000000..1b57c72f4a00 --- /dev/null +++ b/fs/ext4/sysfs.c @@ -0,0 +1,448 @@ +/* + * linux/fs/ext4/sysfs.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Theodore Ts'o (tytso@mit.edu) + * + */ + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + +#include "ext4.h" +#include "ext4_jbd2.h" + +typedef enum { + attr_noop, + attr_delayed_allocation_blocks, + attr_session_write_kbytes, + attr_lifetime_write_kbytes, + attr_reserved_clusters, + attr_inode_readahead, + attr_trigger_test_error, + attr_feature, + attr_pointer_ui, + attr_pointer_atomic, +} attr_id_t; + +typedef enum { + ptr_explicit, + ptr_ext4_sb_info_offset, + ptr_ext4_super_block_offset, +} attr_ptr_t; + +static const char *proc_dirname = "fs/ext4"; +static struct proc_dir_entry *ext4_proc_root; + +struct ext4_attr { + struct attribute attr; + short attr_id; + short attr_ptr; + union { + int offset; + void *explicit_ptr; + } u; +}; + +static ssize_t session_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + return snprintf(buf, PAGE_SIZE, "%lu\n", + (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + sbi->s_sectors_written_start) >> 1); +} + +static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->s_buddy_cache->i_sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1))); +} + +static ssize_t inode_readahead_blks_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long t; + int ret; + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + + if (t && (!is_power_of_2(t) || t > 0x40000000)) + return -EINVAL; + + sbi->s_inode_readahead_blks = t; + return count; +} + +static ssize_t reserved_clusters_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned long long val; + ext4_fsblk_t clusters = (ext4_blocks_count(sbi->s_es) >> + sbi->s_cluster_bits); + int ret; + + ret = kstrtoull(skip_spaces(buf), 0, &val); + if (!ret || val >= clusters) + return -EINVAL; + + atomic64_set(&sbi->s_resv_clusters, val); + return count; +} + +static ssize_t trigger_test_error(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t count) +{ + int len = count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (len && buf[len-1] == '\n') + len--; + + if (len) + ext4_error(sbi->s_sb, "%.*s", len, buf); + return count; +} + +#define EXT4_ATTR(_name,_mode,_id) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ +} + +#define EXT4_ATTR_FUNC(_name,_mode) EXT4_ATTR(_name,_mode,_name) + +#define EXT4_ATTR_FEATURE(_name) EXT4_ATTR(_name, 0444, feature) + +#define EXT4_ATTR_OFFSET(_name,_mode,_id,_struct,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ + .attr_ptr = ptr_##_struct##_offset, \ + .u = { \ + .offset = offsetof(struct _struct, _elname),\ + }, \ +} + +#define EXT4_RO_ATTR_ES_UI(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname) + +#define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) + +#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr_id = attr_##_id, \ + .attr_ptr = ptr_explicit, \ + .u = { \ + .explicit_ptr = _ptr, \ + }, \ +} + +#define ATTR_LIST(name) &ext4_attr_##name.attr + +EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444); +EXT4_ATTR_FUNC(session_write_kbytes, 0444); +EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444); +EXT4_ATTR_FUNC(reserved_clusters, 0644); + +EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, + ext4_sb_info, s_inode_readahead_blks); +EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); +EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); +EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); +EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); +EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); +EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); +EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); +EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); + +static unsigned int old_bump_val = 128; +EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); + +static struct attribute *ext4_attrs[] = { + ATTR_LIST(delayed_allocation_blocks), + ATTR_LIST(session_write_kbytes), + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(reserved_clusters), + ATTR_LIST(inode_readahead_blks), + ATTR_LIST(inode_goal), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), + ATTR_LIST(mb_stream_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), + ATTR_LIST(extent_max_zeroout_kb), + ATTR_LIST(trigger_fs_error), + ATTR_LIST(err_ratelimit_interval_ms), + ATTR_LIST(err_ratelimit_burst), + ATTR_LIST(warning_ratelimit_interval_ms), + ATTR_LIST(warning_ratelimit_burst), + ATTR_LIST(msg_ratelimit_interval_ms), + ATTR_LIST(msg_ratelimit_burst), + ATTR_LIST(errors_count), + ATTR_LIST(first_error_time), + ATTR_LIST(last_error_time), + NULL, +}; + +/* Features this copy of ext4 supports */ +EXT4_ATTR_FEATURE(lazy_itable_init); +EXT4_ATTR_FEATURE(batched_discard); +EXT4_ATTR_FEATURE(meta_bg_resize); +EXT4_ATTR_FEATURE(encryption); +EXT4_ATTR_FEATURE(metadata_csum_seed); + +static struct attribute *ext4_feat_attrs[] = { + ATTR_LIST(lazy_itable_init), + ATTR_LIST(batched_discard), + ATTR_LIST(meta_bg_resize), + ATTR_LIST(encryption), + ATTR_LIST(metadata_csum_seed), + NULL, +}; + +static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi) +{ + switch (a->attr_ptr) { + case ptr_explicit: + return a->u.explicit_ptr; + case ptr_ext4_sb_info_offset: + return (void *) (((char *) sbi) + a->u.offset); + case ptr_ext4_super_block_offset: + return (void *) (((char *) sbi->s_es) + a->u.offset); + } + return NULL; +} + +static ssize_t ext4_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + void *ptr = calc_ptr(a, sbi); + + switch (a->attr_id) { + case attr_delayed_allocation_blocks: + return snprintf(buf, PAGE_SIZE, "%llu\n", + (s64) EXT4_C2B(sbi, + percpu_counter_sum(&sbi->s_dirtyclusters_counter))); + case attr_session_write_kbytes: + return session_write_kbytes_show(a, sbi, buf); + case attr_lifetime_write_kbytes: + return lifetime_write_kbytes_show(a, sbi, buf); + case attr_reserved_clusters: + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long) + atomic64_read(&sbi->s_resv_clusters)); + case attr_inode_readahead: + case attr_pointer_ui: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%u\n", + *((unsigned int *) ptr)); + case attr_pointer_atomic: + if (!ptr) + return 0; + return snprintf(buf, PAGE_SIZE, "%d\n", + atomic_read((atomic_t *) ptr)); + case attr_feature: + return snprintf(buf, PAGE_SIZE, "supported\n"); + } + + return 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + void *ptr = calc_ptr(a, sbi); + unsigned long t; + int ret; + + switch (a->attr_id) { + case attr_reserved_clusters: + return reserved_clusters_store(a, sbi, buf, len); + case attr_pointer_ui: + if (!ptr) + return 0; + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + *((unsigned int *) ptr) = t; + return len; + case attr_inode_readahead: + return inode_readahead_blks_store(a, sbi, buf, len); + case attr_trigger_test_error: + return trigger_test_error(a, sbi, buf, len); + } + return 0; +} + +static void ext4_sb_release(struct kobject *kobj) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +static const struct sysfs_ops ext4_attr_ops = { + .show = ext4_attr_show, + .store = ext4_attr_store, +}; + +static struct kobj_type ext4_sb_ktype = { + .default_attrs = ext4_attrs, + .sysfs_ops = &ext4_attr_ops, + .release = ext4_sb_release, +}; + +static struct kobj_type ext4_ktype = { + .sysfs_ops = &ext4_attr_ops, +}; + +static struct kset ext4_kset = { + .kobj = {.ktype = &ext4_ktype}, +}; + +static struct kobj_type ext4_feat_ktype = { + .default_attrs = ext4_feat_attrs, + .sysfs_ops = &ext4_attr_ops, +}; + +static struct kobject ext4_feat = { + .kset = &ext4_kset, +}; + +#define PROC_FILE_SHOW_DEFN(name) \ +static int name##_open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, ext4_seq_##name##_show, PDE_DATA(inode)); \ +} \ +\ +const struct file_operations ext4_seq_##name##_fops = { \ + .owner = THIS_MODULE, \ + .open = name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +} + +#define PROC_FILE_LIST(name) \ + { __stringify(name), &ext4_seq_##name##_fops } + +PROC_FILE_SHOW_DEFN(es_shrinker_info); +PROC_FILE_SHOW_DEFN(options); + +static struct ext4_proc_files { + const char *name; + const struct file_operations *fops; +} proc_files[] = { + PROC_FILE_LIST(options), + PROC_FILE_LIST(es_shrinker_info), + PROC_FILE_LIST(mb_groups), + { NULL, NULL }, +}; + +int ext4_register_sysfs(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_proc_files *p; + int err; + + sbi->s_kobj.kset = &ext4_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + return err; + + if (ext4_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + + if (sbi->s_proc) { + for (p = proc_files; p->name; p++) + proc_create_data(p->name, S_IRUGO, sbi->s_proc, + p->fops, sb); + } + return 0; +} + +void ext4_unregister_sysfs(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_proc_files *p; + + if (sbi->s_proc) { + for (p = proc_files; p->name; p++) + remove_proc_entry(p->name, sbi->s_proc); + remove_proc_entry(sb->s_id, ext4_proc_root); + } + kobject_del(&sbi->s_kobj); +} + +int __init ext4_init_sysfs(void) +{ + int ret; + + kobject_set_name(&ext4_kset.kobj, "ext4"); + ext4_kset.kobj.parent = fs_kobj; + ret = kset_register(&ext4_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&ext4_feat, &ext4_feat_ktype, + NULL, "features"); + if (ret) + kset_unregister(&ext4_kset); + else + ext4_proc_root = proc_mkdir(proc_dirname, NULL); + return ret; +} + +void ext4_exit_sysfs(void) +{ + kobject_put(&ext4_feat); + kset_unregister(&ext4_kset); + remove_proc_entry(proc_dirname, NULL); + ext4_proc_root = NULL; +} + diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 16e28c08d1e8..984448c6f5f0 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -195,7 +195,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) - return -EIO; + return -EFSCORRUPTED; e = next; } @@ -205,7 +205,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, (void *)e + sizeof(__u32) || value_start + le16_to_cpu(entry->e_value_offs) + le32_to_cpu(entry->e_value_size) > end)) - return -EIO; + return -EFSCORRUPTED; entry = EXT4_XATTR_NEXT(entry); } @@ -222,9 +222,9 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) - return -EIO; + return -EFSCORRUPTED; if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) - return -EIO; + return -EFSBADCRC; error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, bh->b_data); if (!error) @@ -239,7 +239,7 @@ ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) if (entry->e_value_block != 0 || value_size > size || le16_to_cpu(entry->e_value_offs) + value_size > size) - return -EIO; + return -EFSCORRUPTED; return 0; } @@ -266,7 +266,7 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, } *pentry = entry; if (!cmp && ext4_xattr_check_entry(entry, size)) - return -EIO; + return -EFSCORRUPTED; return cmp ? -ENODATA : 0; } @@ -297,13 +297,13 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, bad_block: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); - error = -EIO; + error = -EFSCORRUPTED; goto cleanup; } ext4_xattr_cache_insert(ext4_mb_cache, bh); entry = BFIRST(bh); error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); - if (error == -EIO) + if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; @@ -445,7 +445,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (ext4_xattr_check_block(inode, bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); - error = -EIO; + error = -EFSCORRUPTED; goto cleanup; } ext4_xattr_cache_insert(ext4_mb_cache, bh); @@ -525,12 +525,12 @@ errout: static void ext4_xattr_update_super_block(handle_t *handle, struct super_block *sb) { - if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR)) + if (ext4_has_feature_xattr(sb)) return; BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { - EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); + ext4_set_feature_xattr(sb); ext4_handle_dirty_super(handle, sb); } } @@ -751,7 +751,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, if (ext4_xattr_check_block(inode, bs->bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); - error = -EIO; + error = -EFSCORRUPTED; goto cleanup; } /* Find the named attribute. */ @@ -811,7 +811,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, bs->bh); } unlock_buffer(bs->bh); - if (error == -EIO) + if (error == -EFSCORRUPTED) goto bad_block; if (!error) error = ext4_handle_dirty_xattr_block(handle, @@ -855,7 +855,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, } error = ext4_xattr_set_entry(i, s); - if (error == -EIO) + if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; @@ -1314,7 +1314,7 @@ retry: if (ext4_xattr_check_block(inode, bh)) { EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); - error = -EIO; + error = -EFSCORRUPTED; goto cleanup; } base = BHDR(bh); @@ -1579,7 +1579,7 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1, memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) return 1; if (entry1->e_value_block != 0 || entry2->e_value_block != 0) - return -EIO; + return -EFSCORRUPTED; if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), (char *)header2 + le16_to_cpu(entry2->e_value_offs), le32_to_cpu(entry1->e_value_size))) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 8c44654ce274..684996c8a3a4 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -427,7 +427,6 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) struct journal_head *last_jh; struct journal_head *next_jh = jh; int ret; - int freed = 0; if (!jh) return 0; @@ -441,10 +440,9 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) else ret = __jbd2_journal_remove_checkpoint(jh) + 1; if (!ret) - return freed; + return 0; if (ret == 2) return 1; - freed = 1; /* * This function only frees up some memory * if possible so we dont have an obligation @@ -452,10 +450,10 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) * requested: */ if (need_resched()) - return freed; + return 0; } while (jh != last_jh); - return freed; + return 0; } /* diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 362e5f614450..36345fefa3ff 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -142,8 +142,7 @@ static int journal_submit_commit_record(journal_t *journal, tmp->h_commit_sec = cpu_to_be64(now.tv_sec); tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); - if (JBD2_HAS_COMPAT_FEATURE(journal, - JBD2_FEATURE_COMPAT_CHECKSUM)) { + if (jbd2_has_feature_checksum(journal)) { tmp->h_chksum_type = JBD2_CRC32_CHKSUM; tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE; tmp->h_chksum[0] = cpu_to_be32(crc32_sum); @@ -157,8 +156,7 @@ static int journal_submit_commit_record(journal_t *journal, bh->b_end_io = journal_end_buffer_io_sync; if (journal->j_flags & JBD2_BARRIER && - !JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) + !jbd2_has_feature_async_commit(journal)) ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh); else ret = submit_bh(WRITE_SYNC, bh); @@ -317,7 +315,7 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag, unsigned long long block) { tag->t_blocknr = cpu_to_be32(block & (u32)~0); - if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(j)) tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); } @@ -356,7 +354,7 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, bh->b_size); kunmap_atomic(addr); - if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + if (jbd2_has_feature_csum3(j)) tag3->t_checksum = cpu_to_be32(csum32); else tag->t_checksum = cpu_to_be16(csum32); @@ -730,8 +728,7 @@ start_journal_io: /* * Compute checksum. */ - if (JBD2_HAS_COMPAT_FEATURE(journal, - JBD2_FEATURE_COMPAT_CHECKSUM)) { + if (jbd2_has_feature_checksum(journal)) { crc32_sum = jbd2_checksum_data(crc32_sum, bh); } @@ -797,8 +794,7 @@ start_journal_io: blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); /* Done it all: now write the commit record asynchronously. */ - if (JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + if (jbd2_has_feature_async_commit(journal)) { err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) @@ -889,8 +885,7 @@ start_journal_io: commit_transaction->t_state = T_COMMIT_JFLUSH; write_unlock(&journal->j_state_lock); - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + if (!jbd2_has_feature_async_commit(journal)) { err = journal_submit_commit_record(journal, commit_transaction, &cbh, crc32_sum); if (err) @@ -898,8 +893,7 @@ start_journal_io: } if (cbh) err = journal_wait_on_commit_record(journal, cbh); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) && + if (jbd2_has_feature_async_commit(journal) && journal->j_flags & JBD2_BARRIER) { blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8270fe9e3641..81e622681c82 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug); /* Checksumming functions */ static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) { - if (!jbd2_journal_has_csum_v2or3(j)) + if (!jbd2_journal_has_csum_v2or3_feature(j)) return 1; return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; @@ -1523,16 +1523,16 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) && - JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) { + if (jbd2_has_feature_csum2(journal) && + jbd2_has_feature_csum3(journal)) { /* Can't have checksum v2 and v3 at the same time! */ printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 " "at the same time!\n"); goto out; } - if (jbd2_journal_has_csum_v2or3(journal) && - JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { + if (jbd2_journal_has_csum_v2or3_feature(journal) && + jbd2_has_feature_checksum(journal)) { /* Can't have checksum v1 and v2 on at the same time! */ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " "at the same time!\n"); @@ -1545,7 +1545,7 @@ static int journal_get_superblock(journal_t *journal) } /* Load the checksum driver */ - if (jbd2_journal_has_csum_v2or3(journal)) { + if (jbd2_journal_has_csum_v2or3_feature(journal)) { journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); if (IS_ERR(journal->j_chksum_driver)) { printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); @@ -1558,6 +1558,7 @@ static int journal_get_superblock(journal_t *journal) /* Check superblock checksum */ if (!jbd2_superblock_csum_verify(journal, sb)) { printk(KERN_ERR "JBD2: journal checksum error\n"); + err = -EFSBADCRC; goto out; } @@ -1649,7 +1650,7 @@ int jbd2_journal_load(journal_t *journal) printk(KERN_ERR "JBD2: journal transaction %u on %s " "is corrupt.\n", journal->j_failed_commit, journal->j_devname); - return -EIO; + return -EFSCORRUPTED; } /* OK, we've finished with the dynamic journal bits: @@ -2071,8 +2072,12 @@ static void __journal_abort_soft (journal_t *journal, int errno) __jbd2_journal_abort_hard(journal); - if (errno) + if (errno) { jbd2_journal_update_sb_errno(journal); + write_lock(&journal->j_state_lock); + journal->j_flags |= JBD2_REC_ERR; + write_unlock(&journal->j_state_lock); + } } /** @@ -2197,15 +2202,15 @@ size_t journal_tag_bytes(journal_t *journal) { size_t sz; - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + if (jbd2_has_feature_csum3(journal)) return sizeof(journal_block_tag3_t); sz = sizeof(journal_block_tag_t); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) + if (jbd2_has_feature_csum2(journal)) sz += sizeof(__u16); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(journal)) return sz; else return sz - sizeof(__u32); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index a9079d035ae5..7f277e49fe88 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -140,7 +140,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, if (offset >= journal->j_maxlen) { printk(KERN_ERR "JBD2: corrupted journal superblock\n"); - return -EIO; + return -EFSCORRUPTED; } err = jbd2_journal_bmap(journal, offset, &blocknr); @@ -342,7 +342,7 @@ static inline unsigned long long read_tag_block(journal_t *journal, journal_block_tag_t *tag) { unsigned long long block = be32_to_cpu(tag->t_blocknr); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(journal)) block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; return block; } @@ -411,7 +411,7 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); - if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3)) + if (jbd2_has_feature_csum3(j)) return tag3->t_checksum == cpu_to_be32(csum32); else return tag->t_checksum == cpu_to_be16(csum32); @@ -527,7 +527,7 @@ static int do_one_pass(journal_t *journal, printk(KERN_ERR "JBD2: Invalid checksum " "recovering block %lu in log\n", next_log_block); - err = -EIO; + err = -EFSBADCRC; brelse(bh); goto failed; } @@ -538,8 +538,7 @@ static int do_one_pass(journal_t *journal, * just skip over the blocks it describes. */ if (pass != PASS_REPLAY) { if (pass == PASS_SCAN && - JBD2_HAS_COMPAT_FEATURE(journal, - JBD2_FEATURE_COMPAT_CHECKSUM) && + jbd2_has_feature_checksum(journal) && !info->end_transaction) { if (calc_chksums(journal, bh, &next_log_block, @@ -602,7 +601,7 @@ static int do_one_pass(journal_t *journal, journal, tag, obh->b_data, be32_to_cpu(tmp->h_sequence))) { brelse(obh); - success = -EIO; + success = -EFSBADCRC; printk(KERN_ERR "JBD2: Invalid " "checksum recovering " "block %llu in log\n", @@ -694,8 +693,7 @@ static int do_one_pass(journal_t *journal, * much to do other than move on to the next sequence * number. */ if (pass == PASS_SCAN && - JBD2_HAS_COMPAT_FEATURE(journal, - JBD2_FEATURE_COMPAT_CHECKSUM)) { + jbd2_has_feature_checksum(journal)) { int chksum_err, chksum_seen; struct commit_header *cbh = (struct commit_header *)bh->b_data; @@ -735,8 +733,7 @@ static int do_one_pass(journal_t *journal, if (chksum_err) { info->end_transaction = next_commit_ID; - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){ + if (!jbd2_has_feature_async_commit(journal)) { journal->j_failed_commit = next_commit_ID; brelse(bh); @@ -750,8 +747,7 @@ static int do_one_pass(journal_t *journal, bh->b_data)) { info->end_transaction = next_commit_ID; - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + if (!jbd2_has_feature_async_commit(journal)) { journal->j_failed_commit = next_commit_ID; brelse(bh); @@ -851,7 +847,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, rcount = be32_to_cpu(header->r_count); if (!jbd2_revoke_block_csum_verify(journal, header)) - return -EINVAL; + return -EFSBADCRC; if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_revoke_tail); @@ -859,7 +855,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, return -EINVAL; max = rcount; - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(journal)) record_len = 8; while (offset + record_len <= max) { diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 0abf2e7f725b..705ae577882b 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -589,7 +589,7 @@ static void write_one_revoke_record(journal_t *journal, if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_revoke_tail); - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(journal)) sz = 8; else sz = 4; @@ -619,7 +619,7 @@ static void write_one_revoke_record(journal_t *journal, *descriptorp = descriptor; } - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + if (jbd2_has_feature_64bit(journal)) * ((__be64 *)(&descriptor->b_data[offset])) = cpu_to_be64(record->blocknr); else diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 81180022923f..d211b8e18566 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -621,9 +621,6 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode uint32_t alloclen; int ret; - if (!new_valid_dev(rdev)) - return -EINVAL; - ri = jffs2_alloc_raw_inode(); if (!ri) return -ENOMEM; diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index b8fd651307a4..ce1189793288 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -97,25 +97,16 @@ int __init jffs2_create_slab_caches(void) void jffs2_destroy_slab_caches(void) { - if(full_dnode_slab) - kmem_cache_destroy(full_dnode_slab); - if(raw_dirent_slab) - kmem_cache_destroy(raw_dirent_slab); - if(raw_inode_slab) - kmem_cache_destroy(raw_inode_slab); - if(tmp_dnode_info_slab) - kmem_cache_destroy(tmp_dnode_info_slab); - if(raw_node_ref_slab) - kmem_cache_destroy(raw_node_ref_slab); - if(node_frag_slab) - kmem_cache_destroy(node_frag_slab); - if(inode_cache_slab) - kmem_cache_destroy(inode_cache_slab); + kmem_cache_destroy(full_dnode_slab); + kmem_cache_destroy(raw_dirent_slab); + kmem_cache_destroy(raw_inode_slab); + kmem_cache_destroy(tmp_dnode_info_slab); + kmem_cache_destroy(raw_node_ref_slab); + kmem_cache_destroy(node_frag_slab); + kmem_cache_destroy(inode_cache_slab); #ifdef CONFIG_JFFS2_FS_XATTR - if (xattr_datum_cache) - kmem_cache_destroy(xattr_datum_cache); - if (xattr_ref_cache) - kmem_cache_destroy(xattr_ref_cache); + kmem_cache_destroy(xattr_datum_cache); + kmem_cache_destroy(xattr_ref_cache); #endif } diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 28e0aab42bc3..bfebbf13698c 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -660,8 +660,12 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r err = jffs2_flash_read(c, (ref_offset(ref)) + read, rd->nsize - already, &read, &fd->name[already]); - if (unlikely(read != rd->nsize - already) && likely(!err)) + if (unlikely(read != rd->nsize - already) && likely(!err)) { + jffs2_free_full_dirent(fd); + JFFS2_ERROR("short read: wanted %d bytes, got %zd\n", + rd->nsize - already, read); return -EIO; + } if (unlikely(err)) { JFFS2_ERROR("read remainder of name: error %d\n", err); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 955da626ba6b..f3a4857ff071 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1274,7 +1274,6 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) { #ifdef CONFIG_JFFS2_FS_WBUF_VERIFY c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); if (!c->wbuf_verify) { - kfree(c->oobbuf); kfree(c->wbuf); return -ENOMEM; } diff --git a/fs/namei.c b/fs/namei.c index 3c18970a8899..d84d7c7515fc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1966,7 +1966,7 @@ OK: if (err) { const char *s = get_link(nd); - if (unlikely(IS_ERR(s))) + if (IS_ERR(s)) return PTR_ERR(s); err = 0; if (unlikely(!s)) { @@ -3380,7 +3380,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, return ERR_PTR(-ELOOP); filename = getname_kernel(name); - if (unlikely(IS_ERR(filename))) + if (IS_ERR(filename)) return ERR_CAST(filename); set_nameidata(&nd, -1, filename); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 93575e91a7aa..356816e7bc90 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -597,7 +597,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, qname.name = __name; newdent = d_hash_and_lookup(dentry, &qname); - if (unlikely(IS_ERR(newdent))) + if (IS_ERR(newdent)) goto end_advance; if (!newdent) { newdent = d_alloc(dentry, &qname); diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 5aaed363556a..5c0c6b58157f 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -124,7 +124,7 @@ objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, retry_lookup: od = osduld_info_lookup(&odi); - if (unlikely(IS_ERR(od))) { + if (IS_ERR(od)) { err = PTR_ERR(od); dprintk("%s: osduld_info_lookup => %d\n", __func__, err); if (err == -ENODEV && retry_flag) { diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index fdda62e6115e..fe5b6e6c4671 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -948,7 +948,7 @@ static struct ctl_dir *get_subdir(struct ctl_dir *dir, found: subdir->header.nreg++; failed: - if (unlikely(IS_ERR(subdir))) { + if (IS_ERR(subdir)) { pr_err("sysctl could not get directory: "); sysctl_print_dir(dir); pr_cont("/%*.*s %ld\n", diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index cbc8d5d2755a..c66f2423e1f5 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -340,8 +340,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) dput(dentry); dentry = ERR_PTR(-EEXIST); } - if (IS_ERR(dentry)) + + if (IS_ERR(dentry)) { mutex_unlock(&parent->d_inode->i_mutex); + simple_release_fs(&tracefs_mount, &tracefs_mount_count); + } + return dentry; } |