diff options
Diffstat (limited to 'fs')
77 files changed, 962 insertions, 758 deletions
diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 9ad2369d9e35..bfcb18feb1df 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -231,9 +231,6 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, static int bad_inode_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - return -EIO; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 63039ed9576f..2bc5dc644b4c 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1864,6 +1864,7 @@ cleanup: kfree(psinfo); kfree(notes); kfree(fpu); + kfree(shdr4extnum); #ifdef ELF_CORE_COPY_XFPREGS kfree(xfpu); #endif diff --git a/fs/block_dev.c b/fs/block_dev.c index 1a2421f908f0..610e8e0b04b8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -762,7 +762,19 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, if (!disk) return ERR_PTR(-ENXIO); - whole = bdget_disk(disk, 0); + /* + * Normally, @bdev should equal what's returned from bdget_disk() + * if partno is 0; however, some drivers (floppy) use multiple + * bdev's for the same physical device and @bdev may be one of the + * aliases. Keep @bdev if partno is 0. This means claimer + * tracking is broken for those devices but it has always been that + * way. + */ + if (partno) + whole = bdget_disk(disk, 0); + else + whole = bdgrab(bdev); + module_put(disk->fops->owner); put_disk(disk); if (!whole) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 378b5b4443f3..3b859a3e6a0e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -19,7 +19,6 @@ #ifndef __BTRFS_CTREE__ #define __BTRFS_CTREE__ -#include <linux/version.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/fs.h> @@ -967,6 +966,12 @@ struct btrfs_fs_info { struct srcu_struct subvol_srcu; spinlock_t trans_lock; + /* + * the reloc mutex goes with the trans lock, it is taken + * during commit to protect us from the relocation code + */ + struct mutex reloc_mutex; + struct list_head trans_list; struct list_head hashers; struct list_head dead_roots; @@ -1172,6 +1177,14 @@ struct btrfs_root { u32 type; u64 highest_objectid; + + /* btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code. But the + * race is very small, and only the first time the root + * is added to each transaction. So in_trans_setup + * is used to tell us when more checks are required + */ + unsigned long in_trans_setup; int ref_cows; int track_dirty; int in_radix; @@ -1181,7 +1194,6 @@ struct btrfs_root { struct btrfs_key defrag_max; int defrag_running; char *name; - int in_sysfs; /* the dirty list is only used by non-reference counted roots */ struct list_head dirty_list; @@ -1323,6 +1335,11 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_STRING_ITEM_KEY 253 +/* + * Flags for mount options. + * + * Note: don't forget to add new options to btrfs_show_options() + */ #define BTRFS_MOUNT_NODATASUM (1 << 0) #define BTRFS_MOUNT_NODATACOW (1 << 1) #define BTRFS_MOUNT_NOBARRIER (1 << 2) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6462c29d2d37..98c68e658a9b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -82,19 +82,16 @@ static inline struct btrfs_delayed_root *btrfs_get_delayed_root( return root->fs_info->delayed_root; } -static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( - struct inode *inode) +static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) { - struct btrfs_delayed_node *node; struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; u64 ino = btrfs_ino(inode); - int ret; + struct btrfs_delayed_node *node; -again: node = ACCESS_ONCE(btrfs_inode->delayed_node); if (node) { - atomic_inc(&node->refs); /* can be accessed */ + atomic_inc(&node->refs); return node; } @@ -102,8 +99,10 @@ again: node = radix_tree_lookup(&root->delayed_nodes_tree, ino); if (node) { if (btrfs_inode->delayed_node) { + atomic_inc(&node->refs); /* can be accessed */ + BUG_ON(btrfs_inode->delayed_node != node); spin_unlock(&root->inode_lock); - goto again; + return node; } btrfs_inode->delayed_node = node; atomic_inc(&node->refs); /* can be accessed */ @@ -113,6 +112,23 @@ again: } spin_unlock(&root->inode_lock); + return NULL; +} + +static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( + struct inode *inode) +{ + struct btrfs_delayed_node *node; + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); + struct btrfs_root *root = btrfs_inode->root; + u64 ino = btrfs_ino(inode); + int ret; + +again: + node = btrfs_get_delayed_node(inode); + if (node) + return node; + node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS); if (!node) return ERR_PTR(-ENOMEM); @@ -297,7 +313,6 @@ struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) item->data_len = data_len; item->ins_or_del = 0; item->bytes_reserved = 0; - item->block_rsv = NULL; item->delayed_node = NULL; atomic_set(&item->refs, 1); } @@ -549,19 +564,6 @@ struct btrfs_delayed_item *__btrfs_next_delayed_item( return next; } -static inline struct btrfs_delayed_node *btrfs_get_delayed_node( - struct inode *inode) -{ - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); - struct btrfs_delayed_node *delayed_node; - - delayed_node = btrfs_inode->delayed_node; - if (delayed_node) - atomic_inc(&delayed_node->refs); - - return delayed_node; -} - static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root, u64 root_id) { @@ -593,10 +595,8 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) { + if (!ret) item->bytes_reserved = num_bytes; - item->block_rsv = dst_rsv; - } return ret; } @@ -604,10 +604,13 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, struct btrfs_delayed_item *item) { + struct btrfs_block_rsv *rsv; + if (!item->bytes_reserved) return; - btrfs_block_rsv_release(root, item->block_rsv, + rsv = &root->fs_info->global_block_rsv; + btrfs_block_rsv_release(root, rsv, item->bytes_reserved); } @@ -1014,6 +1017,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret = 0; path = btrfs_alloc_path(); @@ -1021,6 +1025,9 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + delayed_root = btrfs_get_delayed_root(root); curr_node = btrfs_first_delayed_node(delayed_root); @@ -1045,6 +1052,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, } btrfs_free_path(path); + trans->block_rsv = block_rsv; return ret; } @@ -1052,6 +1060,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_node *node) { struct btrfs_path *path; + struct btrfs_block_rsv *block_rsv; int ret; path = btrfs_alloc_path(); @@ -1059,6 +1068,9 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, return -ENOMEM; path->leave_spinning = 1; + block_rsv = trans->block_rsv; + trans->block_rsv = &node->root->fs_info->global_block_rsv; + ret = btrfs_insert_delayed_items(trans, path, node->root, node); if (!ret) ret = btrfs_delete_delayed_items(trans, path, node->root, node); @@ -1066,6 +1078,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = btrfs_update_delayed_inode(trans, node->root, path, node); btrfs_free_path(path); + trans->block_rsv = block_rsv; return ret; } @@ -1116,6 +1129,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) struct btrfs_path *path; struct btrfs_delayed_node *delayed_node = NULL; struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; unsigned long nr = 0; int need_requeue = 0; int ret; @@ -1134,6 +1148,9 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) if (IS_ERR(trans)) goto free_path; + block_rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->global_block_rsv; + ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); if (!ret) ret = btrfs_delete_delayed_items(trans, path, root, @@ -1176,6 +1193,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work) nr = trans->blocks_used; + trans->block_rsv = block_rsv; btrfs_end_transaction_dmeta(trans, root); __btrfs_btree_balance_dirty(root, nr); free_path: @@ -1222,6 +1240,13 @@ again: return 0; } +void btrfs_assert_delayed_root_empty(struct btrfs_root *root) +{ + struct btrfs_delayed_root *delayed_root; + delayed_root = btrfs_get_delayed_root(root); + WARN_ON(btrfs_first_delayed_node(delayed_root)); +} + void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; @@ -1382,8 +1407,7 @@ end: int btrfs_inode_delayed_dir_index_count(struct inode *inode) { - struct btrfs_delayed_node *delayed_node = BTRFS_I(inode)->delayed_node; - int ret = 0; + struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) return -ENOENT; @@ -1393,11 +1417,14 @@ int btrfs_inode_delayed_dir_index_count(struct inode *inode) * a new directory index is added into the delayed node and index_cnt * is updated now. So we needn't lock the delayed node. */ - if (!delayed_node->index_cnt) + if (!delayed_node->index_cnt) { + btrfs_release_delayed_node(delayed_node); return -EINVAL; + } BTRFS_I(inode)->index_cnt = delayed_node->index_cnt; - return ret; + btrfs_release_delayed_node(delayed_node); + return 0; } void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list, @@ -1591,6 +1618,57 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, inode->i_ctime.tv_nsec); } +int btrfs_fill_inode(struct inode *inode, u32 *rdev) +{ + struct btrfs_delayed_node *delayed_node; + struct btrfs_inode_item *inode_item; + struct btrfs_timespec *tspec; + + delayed_node = btrfs_get_delayed_node(inode); + if (!delayed_node) + return -ENOENT; + + mutex_lock(&delayed_node->mutex); + if (!delayed_node->inode_dirty) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return -ENOENT; + } + + inode_item = &delayed_node->inode_item; + + inode->i_uid = btrfs_stack_inode_uid(inode_item); + inode->i_gid = btrfs_stack_inode_gid(inode_item); + btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + inode->i_mode = btrfs_stack_inode_mode(inode_item); + inode->i_nlink = btrfs_stack_inode_nlink(inode_item); + inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); + BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); + BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); + inode->i_rdev = 0; + *rdev = btrfs_stack_inode_rdev(inode_item); + BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec); + inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec); + + inode->i_generation = BTRFS_I(inode)->generation; + BTRFS_I(inode)->index_cnt = (u64)-1; + + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_node(delayed_node); + return 0; +} + int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index eb7d240aa648..8d27af4bd8b9 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -75,7 +75,6 @@ struct btrfs_delayed_item { struct list_head tree_list; /* used for batch insert/delete items */ struct list_head readdir_list; /* used for readdir items */ u64 bytes_reserved; - struct btrfs_block_rsv *block_rsv; struct btrfs_delayed_node *delayed_node; atomic_t refs; int ins_or_del; @@ -120,6 +119,7 @@ void btrfs_kill_delayed_inode_items(struct inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); +int btrfs_fill_inode(struct inode *inode, u32 *rdev); /* Used for drop dead root */ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); @@ -138,4 +138,8 @@ int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent, /* for init */ int __init btrfs_delayed_inode_init(void); void btrfs_delayed_inode_exit(void); + +/* for debugging */ +void btrfs_assert_delayed_root_empty(struct btrfs_root *root); + #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9f68c6898653..1ac8db5dc0a3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1044,7 +1044,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->last_trans = 0; root->highest_objectid = 0; root->name = NULL; - root->in_sysfs = 0; root->inode_tree = RB_ROOT; INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); root->block_rsv = NULL; @@ -1300,19 +1299,21 @@ again: return root; root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS); - if (!root->free_ino_ctl) - goto fail; root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned), GFP_NOFS); - if (!root->free_ino_pinned) + if (!root->free_ino_pinned || !root->free_ino_ctl) { + ret = -ENOMEM; goto fail; + } btrfs_init_free_ino_ctl(root); mutex_init(&root->fs_commit_mutex); spin_lock_init(&root->cache_lock); init_waitqueue_head(&root->cache_wait); - set_anon_super(&root->anon_super, NULL); + ret = set_anon_super(&root->anon_super, NULL); + if (ret) + goto fail; if (btrfs_root_refs(&root->root_item) == 0) { ret = -ENOENT; @@ -1618,6 +1619,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); + mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); fs_info->tree_root = tree_root; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b42efc2ded51..71cd456fdb60 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3314,10 +3314,6 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, if (reserved == 0) return 0; - /* nothing to shrink - nothing to reclaim */ - if (root->fs_info->delalloc_bytes == 0) - return 0; - max_reclaim = min(reserved, to_reclaim); while (loops < 1024) { @@ -4846,7 +4842,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, u64 num_bytes, u64 empty_size, u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, - int data) + u64 data) { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; @@ -4873,7 +4869,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, space_info = __find_space_info(root->fs_info, data); if (!space_info) { - printk(KERN_ERR "No space info for %d\n", data); + printk(KERN_ERR "No space info for %llu\n", data); return -ENOSPC; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 9f985a429877..bf0d61567f3d 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1893,9 +1893,12 @@ void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl) while ((node = rb_last(&ctl->free_space_offset)) != NULL) { info = rb_entry(node, struct btrfs_free_space, offset_index); - unlink_free_space(ctl, info); - kfree(info->bitmap); - kmem_cache_free(btrfs_free_space_cachep, info); + if (!info->bitmap) { + unlink_free_space(ctl, info); + kmem_cache_free(btrfs_free_space_cachep, info); + } else { + free_bitmap(ctl, info); + } if (need_resched()) { spin_unlock(&ctl->tree_lock); cond_resched(); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 751ddf8fc58a..3601f0aebddf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2509,6 +2509,11 @@ static void btrfs_read_locked_inode(struct inode *inode) int maybe_acls; u32 rdev; int ret; + bool filled = false; + + ret = btrfs_fill_inode(inode, &rdev); + if (!ret) + filled = true; path = btrfs_alloc_path(); BUG_ON(!path); @@ -2520,6 +2525,10 @@ static void btrfs_read_locked_inode(struct inode *inode) goto make_bad; leaf = path->nodes[0]; + + if (filled) + goto cache_acl; + inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); if (!leaf->map_token) @@ -2556,7 +2565,7 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->index_cnt = (u64)-1; BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); - +cache_acl: /* * try to precache a NULL acl entry for files that don't have * any xattrs or acls @@ -2572,7 +2581,6 @@ static void btrfs_read_locked_inode(struct inode *inode) } btrfs_free_path(path); - inode_item = NULL; switch (inode->i_mode & S_IFMT) { case S_IFREG: @@ -2670,12 +2678,14 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, int ret; /* - * If root is tree root, it means this inode is used to - * store free space information. And these inodes are updated - * when committing the transaction, so they needn't delaye to - * be updated, or deadlock will occured. + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. + * + * The data relocation inode should also be directly updated + * without delay */ - if (!is_free_space_inode(root, inode)) { + if (!is_free_space_inode(root, inode) + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_delayed_update_inode(trans, root, inode); if (!ret) btrfs_set_inode_last_trans(trans, inode); @@ -3076,6 +3086,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, dir); BUG_ON(ret); + btrfs_free_path(path); return 0; } @@ -4519,6 +4530,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, inode_tree_add(inode); trace_btrfs_inode_new(inode); + btrfs_set_inode_last_trans(trans, inode); return inode; fail: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b793d112d1f6..a3c4751e07db 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -482,8 +482,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); BUG_ON(ret); + spin_lock(&root->fs_info->trans_lock); list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); + spin_unlock(&root->fs_info->trans_lock); if (async_transid) { *async_transid = trans->transid; ret = btrfs_commit_transaction_async(trans, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b1ef27cc673b..5e0a3dc79a45 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1368,7 +1368,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, int ret; if (!root->reloc_root) - return 0; + goto out; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -1390,6 +1390,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, root->fs_info->tree_root, &reloc_root->root_key, root_item); BUG_ON(ret); + +out: return 0; } @@ -2142,10 +2144,11 @@ int prepare_to_merge(struct reloc_control *rc, int err) u64 num_bytes = 0; int ret; - spin_lock(&root->fs_info->trans_lock); + mutex_lock(&root->fs_info->reloc_mutex); rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; rc->merging_rsv_size += rc->nodes_relocated * 2; - spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); + again: if (!err) { num_bytes = rc->merging_rsv_size; @@ -2214,9 +2217,16 @@ int merge_reloc_roots(struct reloc_control *rc) int ret; again: root = rc->extent_root; - spin_lock(&root->fs_info->trans_lock); + + /* + * this serializes us with btrfs_record_root_in_transaction, + * we have to make sure nobody is in the middle of + * adding their roots to the list while we are + * doing this splice + */ + mutex_lock(&root->fs_info->reloc_mutex); list_splice_init(&rc->reloc_roots, &reloc_roots); - spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); while (!list_empty(&reloc_roots)) { found = 1; @@ -3590,17 +3600,19 @@ next: static void set_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - spin_lock(&fs_info->trans_lock); + + mutex_lock(&fs_info->reloc_mutex); fs_info->reloc_ctl = rc; - spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->reloc_mutex); } static void unset_reloc_control(struct reloc_control *rc) { struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - spin_lock(&fs_info->trans_lock); + + mutex_lock(&fs_info->reloc_mutex); fs_info->reloc_ctl = NULL; - spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->reloc_mutex); } static int check_extent_flags(u64 flags) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0bb4ebbb71b7..15634d4648d7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -723,6 +723,12 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) seq_puts(seq, ",user_subvol_rm_allowed"); + if (btrfs_test_opt(root, ENOSPC_DEBUG)) + seq_puts(seq, ",enospc_debug"); + if (btrfs_test_opt(root, AUTO_DEFRAG)) + seq_puts(seq, ",autodefrag"); + if (btrfs_test_opt(root, INODE_MAP_CACHE)) + seq_puts(seq, ",inode_cache"); return 0; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c3c223ae6691..daac9ae6d731 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -28,152 +28,6 @@ #include "disk-io.h" #include "transaction.h" -static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_root_used(&root->root_item)); -} - -static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_root_limit(&root->root_item)); -} - -static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf) -{ - - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_bytes_used(&fs->super_copy)); -} - -static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_total_bytes(&fs->super_copy)); -} - -static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)btrfs_super_sectorsize(&fs->super_copy)); -} - -/* this is for root attrs (subvols/snapshots) */ -struct btrfs_root_attr { - struct attribute attr; - ssize_t (*show)(struct btrfs_root *, char *); - ssize_t (*store)(struct btrfs_root *, const char *, size_t); -}; - -#define ROOT_ATTR(name, mode, show, store) \ -static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \ - show, store) - -ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL); -ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL); - -static struct attribute *btrfs_root_attrs[] = { - &btrfs_root_attr_blocks_used.attr, - &btrfs_root_attr_block_limit.attr, - NULL, -}; - -/* this is for super attrs (actual full fs) */ -struct btrfs_super_attr { - struct attribute attr; - ssize_t (*show)(struct btrfs_fs_info *, char *); - ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t); -}; - -#define SUPER_ATTR(name, mode, show, store) \ -static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \ - show, store) - -SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL); -SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL); -SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL); - -static struct attribute *btrfs_super_attrs[] = { - &btrfs_super_attr_blocks_used.attr, - &btrfs_super_attr_total_blocks.attr, - &btrfs_super_attr_blocksize.attr, - NULL, -}; - -static ssize_t btrfs_super_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - struct btrfs_super_attr *a = container_of(attr, - struct btrfs_super_attr, - attr); - - return a->show ? a->show(fs, buf) : 0; -} - -static ssize_t btrfs_super_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - struct btrfs_super_attr *a = container_of(attr, - struct btrfs_super_attr, - attr); - - return a->store ? a->store(fs, buf, len) : 0; -} - -static ssize_t btrfs_root_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - struct btrfs_root_attr *a = container_of(attr, - struct btrfs_root_attr, - attr); - - return a->show ? a->show(root, buf) : 0; -} - -static ssize_t btrfs_root_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - struct btrfs_root_attr *a = container_of(attr, - struct btrfs_root_attr, - attr); - return a->store ? a->store(root, buf, len) : 0; -} - -static void btrfs_super_release(struct kobject *kobj) -{ - struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, - super_kobj); - complete(&fs->kobj_unregister); -} - -static void btrfs_root_release(struct kobject *kobj) -{ - struct btrfs_root *root = container_of(kobj, struct btrfs_root, - root_kobj); - complete(&root->kobj_unregister); -} - -static const struct sysfs_ops btrfs_super_attr_ops = { - .show = btrfs_super_attr_show, - .store = btrfs_super_attr_store, -}; - -static const struct sysfs_ops btrfs_root_attr_ops = { - .show = btrfs_root_attr_show, - .store = btrfs_root_attr_store, -}; - /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2b3590b9fe98..51dcec86757f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -126,28 +126,85 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail) * to make sure the old root from before we joined the transaction is deleted * when the transaction commits */ -int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, +static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (root->ref_cows && root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); WARN_ON(root->commit_root != root->node); + /* + * see below for in_trans_setup usage rules + * we have the reloc mutex held now, so there + * is only one writer in this function + */ + root->in_trans_setup = 1; + + /* make sure readers find in_trans_setup before + * they find our root->last_trans update + */ + smp_wmb(); + spin_lock(&root->fs_info->fs_roots_radix_lock); if (root->last_trans == trans->transid) { spin_unlock(&root->fs_info->fs_roots_radix_lock); return 0; } - root->last_trans = trans->transid; radix_tree_tag_set(&root->fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); spin_unlock(&root->fs_info->fs_roots_radix_lock); + root->last_trans = trans->transid; + + /* this is pretty tricky. We don't want to + * take the relocation lock in btrfs_record_root_in_trans + * unless we're really doing the first setup for this root in + * this transaction. + * + * Normally we'd use root->last_trans as a flag to decide + * if we want to take the expensive mutex. + * + * But, we have to set root->last_trans before we + * init the relocation root, otherwise, we trip over warnings + * in ctree.c. The solution used here is to flag ourselves + * with root->in_trans_setup. When this is 1, we're still + * fixing up the reloc trees and everyone must wait. + * + * When this is zero, they can trust root->last_trans and fly + * through btrfs_record_root_in_trans without having to take the + * lock. smp_wmb() makes sure that all the writes above are + * done before we pop in the zero below + */ btrfs_init_reloc_root(trans, root); + smp_wmb(); + root->in_trans_setup = 0; } return 0; } + +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!root->ref_cows) + return 0; + + /* + * see record_root_in_trans for comments about in_trans_setup usage + * and barriers + */ + smp_rmb(); + if (root->last_trans == trans->transid && + !root->in_trans_setup) + return 0; + + mutex_lock(&root->fs_info->reloc_mutex); + record_root_in_trans(trans, root); + mutex_unlock(&root->fs_info->reloc_mutex); + + return 0; +} + /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. @@ -882,7 +939,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent = dget_parent(dentry); parent_inode = parent->d_inode; parent_root = BTRFS_I(parent_inode)->root; - btrfs_record_root_in_trans(trans, parent_root); + record_root_in_trans(trans, parent_root); /* * insert the directory item @@ -900,7 +957,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); - btrfs_record_root_in_trans(trans, root); + /* + * pull in the delayed directory update + * and the delayed inode item + * otherwise we corrupt the FS during + * snapshot + */ + ret = btrfs_run_delayed_items(trans, root); + BUG_ON(ret); + + record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); @@ -961,14 +1027,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, int ret; list_for_each_entry(pending, head, list) { - /* - * We must deal with the delayed items before creating - * snapshots, or we will create a snapthot with inconsistent - * information. - */ - ret = btrfs_run_delayed_items(trans, fs_info->fs_root); - BUG_ON(ret); - ret = create_pending_snapshot(trans, fs_info, pending); BUG_ON(ret); } @@ -1241,21 +1299,42 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, schedule_timeout(1); finish_wait(&cur_trans->writer_wait, &wait); - spin_lock(&root->fs_info->trans_lock); - root->fs_info->trans_no_join = 1; - spin_unlock(&root->fs_info->trans_lock); } while (atomic_read(&cur_trans->num_writers) > 1 || (should_grow && cur_trans->num_joined != joined)); - ret = create_pending_snapshots(trans, root->fs_info); - BUG_ON(ret); + /* + * Ok now we need to make sure to block out any other joins while we + * commit the transaction. We could have started a join before setting + * no_join so make sure to wait for num_writers to == 1 again. + */ + spin_lock(&root->fs_info->trans_lock); + root->fs_info->trans_no_join = 1; + spin_unlock(&root->fs_info->trans_lock); + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + + /* + * the reloc mutex makes sure that we stop + * the balancing code from coming in and moving + * extents around in the middle of the commit + */ + mutex_lock(&root->fs_info->reloc_mutex); ret = btrfs_run_delayed_items(trans, root); BUG_ON(ret); + ret = create_pending_snapshots(trans, root->fs_info); + BUG_ON(ret); + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); BUG_ON(ret); + /* + * make sure none of the code above managed to slip in a + * delayed item + */ + btrfs_assert_delayed_root_empty(root); + WARN_ON(cur_trans != trans->transaction); btrfs_scrub_pause(root); @@ -1312,6 +1391,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, root->fs_info->running_transaction = NULL; root->fs_info->trans_no_join = 0; spin_unlock(&root->fs_info->trans_lock); + mutex_unlock(&root->fs_info->reloc_mutex); wake_up(&root->fs_info->transaction_wait); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 592396c6dc47..4ce8a9f41d1e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3177,7 +3177,7 @@ again: tmp_key.offset = (u64)-1; wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); - BUG_ON(!wc.replay_dest); + BUG_ON(IS_ERR_OR_NULL(wc.replay_dest)); wc.replay_dest->log_root = log; btrfs_record_root_in_trans(trans, wc.replay_dest); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1efa56e18f9b..19450bc53632 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2098,7 +2098,8 @@ int btrfs_balance(struct btrfs_root *dev_root) chunk_root->root_key.objectid, found_key.objectid, found_key.offset); - BUG_ON(ret && ret != -ENOSPC); + if (ret && ret != -ENOSPC) + goto error; key.offset = found_key.offset - 1; } ret = 0; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 9542f07d0b93..4698a5c553dc 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -290,7 +290,6 @@ static int striped_read(struct inode *inode, struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len; int io_align, page_align; - int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ int left, pages_left; int read; struct page **page_pos; @@ -326,12 +325,11 @@ more: ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); if (ret > 0) { - int didpages = - ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT; + int didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; if (read < pos - off) { dout(" zero gap %llu to %llu\n", off + read, pos); - ceph_zero_page_vector_range(page_off + read, + ceph_zero_page_vector_range(page_align + read, pos - off - read, pages); } pos += ret; @@ -356,7 +354,7 @@ more: left = inode->i_size - pos; dout("zero tail %d\n", left); - ceph_zero_page_vector_range(page_off + read, left, + ceph_zero_page_vector_range(page_align + read, left, pages); read += left; } @@ -478,9 +476,6 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, else pos = *offset; - io_align = pos & ~PAGE_MASK; - buf_align = (unsigned long)data & ~PAGE_MASK; - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); if (ret < 0) return ret; @@ -504,6 +499,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, * boundary. this isn't atomic, unfortunately. :( */ more: + io_align = pos & ~PAGE_MASK; + buf_align = (unsigned long)data & ~PAGE_MASK; len = left; if (file->f_flags & O_DIRECT) { /* write from beginning of first page, regardless of @@ -593,6 +590,7 @@ out: pos += len; written += len; left -= len; + data += written; if (left) goto more; diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 53ed1ad2c112..f66cc1625150 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -156,6 +156,6 @@ config CIFS_ACL config CIFS_NFSD_EXPORT bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)" - depends on CIFS && EXPERIMENTAL + depends on CIFS && EXPERIMENTAL && BROKEN help Allows NFS server to export a CIFS mounted share (nfsd over cifs) diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index ffb1459dc6ec..7260e11e21f8 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -42,6 +42,7 @@ #define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ #define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ #define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ +#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e9def996e383..3e2989976297 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -104,8 +104,7 @@ cifs_sb_deactive(struct super_block *sb) } static int -cifs_read_super(struct super_block *sb, struct smb_vol *volume_info, - const char *devname, int silent) +cifs_read_super(struct super_block *sb) { struct inode *inode; struct cifs_sb_info *cifs_sb; @@ -113,22 +112,16 @@ cifs_read_super(struct super_block *sb, struct smb_vol *volume_info, cifs_sb = CIFS_SB(sb); - spin_lock_init(&cifs_sb->tlink_tree_lock); - cifs_sb->tlink_tree = RB_ROOT; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) + sb->s_flags |= MS_POSIXACL; - rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); - if (rc) - return rc; - - cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; + if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES) + sb->s_maxbytes = MAX_LFS_FILESIZE; + else + sb->s_maxbytes = MAX_NON_LFS; - rc = cifs_mount(sb, cifs_sb, volume_info, devname); - - if (rc) { - if (!silent) - cERROR(1, "cifs_mount failed w/return code = %d", rc); - goto out_mount_failed; - } + /* BB FIXME fix time_gran to be larger for LANMAN sessions */ + sb->s_time_gran = 100; sb->s_magic = CIFS_MAGIC_NUMBER; sb->s_op = &cifs_super_ops; @@ -170,37 +163,14 @@ out_no_root: if (inode) iput(inode); - cifs_umount(sb, cifs_sb); - -out_mount_failed: - bdi_destroy(&cifs_sb->bdi); return rc; } -static void -cifs_put_super(struct super_block *sb) +static void cifs_kill_sb(struct super_block *sb) { - int rc = 0; - struct cifs_sb_info *cifs_sb; - - cFYI(1, "In cifs_put_super"); - cifs_sb = CIFS_SB(sb); - if (cifs_sb == NULL) { - cFYI(1, "Empty cifs superblock info passed to unmount"); - return; - } - - rc = cifs_umount(sb, cifs_sb); - if (rc) - cERROR(1, "cifs_umount failed with return code %d", rc); - if (cifs_sb->mountdata) { - kfree(cifs_sb->mountdata); - cifs_sb->mountdata = NULL; - } - - unload_nls(cifs_sb->local_nls); - bdi_destroy(&cifs_sb->bdi); - kfree(cifs_sb); + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + kill_anon_super(sb); + cifs_umount(cifs_sb); } static int @@ -257,9 +227,6 @@ static int cifs_permission(struct inode *inode, int mask, unsigned int flags) { struct cifs_sb_info *cifs_sb; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - cifs_sb = CIFS_SB(inode->i_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) { @@ -551,7 +518,6 @@ static int cifs_drop_inode(struct inode *inode) } static const struct super_operations cifs_super_ops = { - .put_super = cifs_put_super, .statfs = cifs_statfs, .alloc_inode = cifs_alloc_inode, .destroy_inode = cifs_destroy_inode, @@ -588,7 +554,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) full_path = cifs_build_path_to_root(vol, cifs_sb, cifs_sb_master_tcon(cifs_sb)); if (full_path == NULL) - return NULL; + return ERR_PTR(-ENOMEM); cFYI(1, "Get root dentry for %s", full_path); @@ -617,7 +583,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) dchild = d_alloc(dparent, &name); if (dchild == NULL) { dput(dparent); - dparent = NULL; + dparent = ERR_PTR(-ENOMEM); goto out; } } @@ -635,7 +601,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) if (rc) { dput(dchild); dput(dparent); - dparent = NULL; + dparent = ERR_PTR(rc); goto out; } alias = d_materialise_unique(dchild, inode); @@ -643,7 +609,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) dput(dchild); if (IS_ERR(alias)) { dput(dparent); - dparent = NULL; + dparent = ERR_PTR(-EINVAL); /* XXX */ goto out; } dchild = alias; @@ -663,6 +629,13 @@ out: return dparent; } +static int cifs_set_super(struct super_block *sb, void *data) +{ + struct cifs_mnt_data *mnt_data = data; + sb->s_fs_info = mnt_data->cifs_sb; + return set_anon_super(sb, NULL); +} + static struct dentry * cifs_do_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) @@ -676,82 +649,80 @@ cifs_do_mount(struct file_system_type *fs_type, cFYI(1, "Devname: %s flags: %d ", dev_name, flags); - rc = cifs_setup_volume_info(&volume_info, (char *)data, dev_name); - if (rc) - return ERR_PTR(rc); + volume_info = cifs_get_volume_info((char *)data, dev_name); + if (IS_ERR(volume_info)) + return ERR_CAST(volume_info); cifs_sb = kzalloc(sizeof(struct cifs_sb_info), GFP_KERNEL); if (cifs_sb == NULL) { root = ERR_PTR(-ENOMEM); - goto out; + goto out_nls; + } + + cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); + if (cifs_sb->mountdata == NULL) { + root = ERR_PTR(-ENOMEM); + goto out_cifs_sb; } cifs_setup_cifs_sb(volume_info, cifs_sb); + rc = cifs_mount(cifs_sb, volume_info); + if (rc) { + if (!(flags & MS_SILENT)) + cERROR(1, "cifs_mount failed w/return code = %d", rc); + root = ERR_PTR(rc); + goto out_mountdata; + } + mnt_data.vol = volume_info; mnt_data.cifs_sb = cifs_sb; mnt_data.flags = flags; - sb = sget(fs_type, cifs_match_super, set_anon_super, &mnt_data); + sb = sget(fs_type, cifs_match_super, cifs_set_super, &mnt_data); if (IS_ERR(sb)) { root = ERR_CAST(sb); - goto out_cifs_sb; + cifs_umount(cifs_sb); + goto out; } - if (sb->s_fs_info) { + if (sb->s_root) { cFYI(1, "Use existing superblock"); - goto out_shared; - } - - /* - * Copy mount params for use in submounts. Better to do - * the copy here and deal with the error before cleanup gets - * complicated post-mount. - */ - cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); - if (cifs_sb->mountdata == NULL) { - root = ERR_PTR(-ENOMEM); - goto out_super; - } - - sb->s_flags = flags; - /* BB should we make this contingent on mount parm? */ - sb->s_flags |= MS_NODIRATIME | MS_NOATIME; - sb->s_fs_info = cifs_sb; + cifs_umount(cifs_sb); + } else { + sb->s_flags = flags; + /* BB should we make this contingent on mount parm? */ + sb->s_flags |= MS_NODIRATIME | MS_NOATIME; + + rc = cifs_read_super(sb); + if (rc) { + root = ERR_PTR(rc); + goto out_super; + } - rc = cifs_read_super(sb, volume_info, dev_name, - flags & MS_SILENT ? 1 : 0); - if (rc) { - root = ERR_PTR(rc); - goto out_super; + sb->s_flags |= MS_ACTIVE; } - sb->s_flags |= MS_ACTIVE; - root = cifs_get_root(volume_info, sb); - if (root == NULL) + if (IS_ERR(root)) goto out_super; cFYI(1, "dentry root is: %p", root); goto out; -out_shared: - root = cifs_get_root(volume_info, sb); - if (root) - cFYI(1, "dentry root is: %p", root); - goto out; - out_super: - kfree(cifs_sb->mountdata); deactivate_locked_super(sb); +out: + cifs_cleanup_volume_info(volume_info); + return root; +out_mountdata: + kfree(cifs_sb->mountdata); out_cifs_sb: - unload_nls(cifs_sb->local_nls); kfree(cifs_sb); - -out: - cifs_cleanup_volume_info(&volume_info); - return root; +out_nls: + unload_nls(volume_info->local_nls); + goto out; } static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, @@ -840,7 +811,7 @@ struct file_system_type cifs_fs_type = { .owner = THIS_MODULE, .name = "cifs", .mount = cifs_do_mount, - .kill_sb = kill_anon_super, + .kill_sb = cifs_kill_sb, /* .fs_flags */ }; const struct inode_operations cifs_dir_inode_ops = { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 953f84413c77..8df28e925e5b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -154,12 +154,11 @@ extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, struct cifs_sb_info *cifs_sb); extern int cifs_match_super(struct super_block *, void *); -extern void cifs_cleanup_volume_info(struct smb_vol **pvolume_info); -extern int cifs_setup_volume_info(struct smb_vol **pvolume_info, - char *mount_data, const char *devname); -extern int cifs_mount(struct super_block *, struct cifs_sb_info *, - struct smb_vol *, const char *); -extern int cifs_umount(struct super_block *, struct cifs_sb_info *); +extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info); +extern struct smb_vol *cifs_get_volume_info(char *mount_data, + const char *devname); +extern int cifs_mount(struct cifs_sb_info *, struct smb_vol *); +extern void cifs_umount(struct cifs_sb_info *); extern void cifs_dfs_release_automount_timer(void); void cifs_proc_init(void); void cifs_proc_clean(void); @@ -218,7 +217,8 @@ extern int get_dfs_path(int xid, struct cifs_ses *pSesInfo, struct dfs_info3_param **preferrals, int remap); extern void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, - struct super_block *sb, struct smb_vol *vol); + struct cifs_sb_info *cifs_sb, + struct smb_vol *vol); extern int CIFSSMBQFSInfo(const int xid, struct cifs_tcon *tcon, struct kstatfs *FSData); extern int SMBOldQFSInfo(const int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 12cf72dd0c42..dbd669cc5bc7 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -65,6 +65,8 @@ static int ip_connect(struct TCP_Server_Info *server); static int generic_ip_connect(struct TCP_Server_Info *server); static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); static void cifs_prune_tlinks(struct work_struct *work); +static int cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, + const char *devname); /* * cifs tcp session reconnection @@ -2240,8 +2242,8 @@ cifs_match_super(struct super_block *sb, void *data) rc = compare_mount_options(sb, mnt_data); out: - cifs_put_tlink(tlink); spin_unlock(&cifs_tcp_ses_lock); + cifs_put_tlink(tlink); return rc; } @@ -2474,14 +2476,6 @@ generic_ip_connect(struct TCP_Server_Info *server) if (rc < 0) return rc; - rc = socket->ops->connect(socket, saddr, slen, 0); - if (rc < 0) { - cFYI(1, "Error %d connecting to server", rc); - sock_release(socket); - server->ssocket = NULL; - return rc; - } - /* * Eventually check for other socket options to change from * the default. sock_setsockopt not used because it expects @@ -2510,6 +2504,14 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); + rc = socket->ops->connect(socket, saddr, slen, 0); + if (rc < 0) { + cFYI(1, "Error %d connecting to server", rc); + sock_release(socket); + server->ssocket = NULL; + return rc; + } + if (sport == htons(RFC1001_PORT)) rc = ip_rfc1001_connect(server); @@ -2546,7 +2548,7 @@ ip_connect(struct TCP_Server_Info *server) } void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, - struct super_block *sb, struct smb_vol *vol_info) + struct cifs_sb_info *cifs_sb, struct smb_vol *vol_info) { /* if we are reconnecting then should we check to see if * any requested capabilities changed locally e.g. via @@ -2600,22 +2602,23 @@ void reset_cifs_unix_caps(int xid, struct cifs_tcon *tcon, cap &= ~CIFS_UNIX_POSIX_ACL_CAP; else if (CIFS_UNIX_POSIX_ACL_CAP & cap) { cFYI(1, "negotiated posix acl support"); - if (sb) - sb->s_flags |= MS_POSIXACL; + if (cifs_sb) + cifs_sb->mnt_cifs_flags |= + CIFS_MOUNT_POSIXACL; } if (vol_info && vol_info->posix_paths == 0) cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; else if (cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { cFYI(1, "negotiate posix pathnames"); - if (sb) - CIFS_SB(sb)->mnt_cifs_flags |= + if (cifs_sb) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; } - if (sb && (CIFS_SB(sb)->rsize > 127 * 1024)) { + if (cifs_sb && (cifs_sb->rsize > 127 * 1024)) { if ((cap & CIFS_UNIX_LARGE_READ_CAP) == 0) { - CIFS_SB(sb)->rsize = 127 * 1024; + cifs_sb->rsize = 127 * 1024; cFYI(DBG2, "larger reads not supported by srv"); } } @@ -2662,6 +2665,9 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, { INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); + spin_lock_init(&cifs_sb->tlink_tree_lock); + cifs_sb->tlink_tree = RB_ROOT; + if (pvolume_info->rsize > CIFSMaxBufSize) { cERROR(1, "rsize %d too large, using MaxBufSize", pvolume_info->rsize); @@ -2750,21 +2756,21 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, /* * When the server supports very large writes via POSIX extensions, we can - * allow up to 2^24 - PAGE_CACHE_SIZE. + * allow up to 2^24-1, minus the size of a WRITE_AND_X header, not including + * the RFC1001 length. * * Note that this might make for "interesting" allocation problems during - * writeback however (as we have to allocate an array of pointers for the - * pages). A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. + * writeback however as we have to allocate an array of pointers for the + * pages. A 16M write means ~32kb page array with PAGE_CACHE_SIZE == 4096. */ -#define CIFS_MAX_WSIZE ((1<<24) - PAGE_CACHE_SIZE) +#define CIFS_MAX_WSIZE ((1<<24) - 1 - sizeof(WRITE_REQ) + 4) /* - * When the server doesn't allow large posix writes, default to a wsize of - * 128k - PAGE_CACHE_SIZE -- one page less than the largest frame size - * described in RFC1001. This allows space for the header without going over - * that by default. + * When the server doesn't allow large posix writes, only allow a wsize of + * 128k minus the size of the WRITE_AND_X header. That allows for a write up + * to the maximum size described by RFC1002. */ -#define CIFS_MAX_RFC1001_WSIZE (128 * 1024 - PAGE_CACHE_SIZE) +#define CIFS_MAX_RFC1002_WSIZE (128 * 1024 - sizeof(WRITE_REQ) + 4) /* * The default wsize is 1M. find_get_pages seems to return a maximum of 256 @@ -2783,11 +2789,18 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info) /* can server support 24-bit write sizes? (via UNIX extensions) */ if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) - wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1001_WSIZE); + wsize = min_t(unsigned int, wsize, CIFS_MAX_RFC1002_WSIZE); - /* no CAP_LARGE_WRITE_X? Limit it to 16 bits */ - if (!(server->capabilities & CAP_LARGE_WRITE_X)) - wsize = min_t(unsigned int, wsize, USHRT_MAX); + /* + * no CAP_LARGE_WRITE_X or is signing enabled without CAP_UNIX set? + * Limit it to max buffer offered by the server, minus the size of the + * WRITEX header, not including the 4 byte RFC1001 length. + */ + if (!(server->capabilities & CAP_LARGE_WRITE_X) || + (!(server->capabilities & CAP_UNIX) && + (server->sec_mode & (SECMODE_SIGN_ENABLED|SECMODE_SIGN_REQUIRED)))) + wsize = min_t(unsigned int, wsize, + server->maxBuf - sizeof(WRITE_REQ) + 4); /* hard limit of CIFS_MAX_WSIZE */ wsize = min_t(unsigned int, wsize, CIFS_MAX_WSIZE); @@ -2819,15 +2832,9 @@ is_path_accessible(int xid, struct cifs_tcon *tcon, return rc; } -void -cifs_cleanup_volume_info(struct smb_vol **pvolume_info) +static void +cleanup_volume_info_contents(struct smb_vol *volume_info) { - struct smb_vol *volume_info; - - if (!pvolume_info || !*pvolume_info) - return; - - volume_info = *pvolume_info; kfree(volume_info->username); kzfree(volume_info->password); kfree(volume_info->UNC); @@ -2835,28 +2842,44 @@ cifs_cleanup_volume_info(struct smb_vol **pvolume_info) kfree(volume_info->domainname); kfree(volume_info->iocharset); kfree(volume_info->prepath); +} + +void +cifs_cleanup_volume_info(struct smb_vol *volume_info) +{ + if (!volume_info) + return; + cleanup_volume_info_contents(volume_info); kfree(volume_info); - *pvolume_info = NULL; - return; } + #ifdef CONFIG_CIFS_DFS_UPCALL /* build_path_to_root returns full path to root when * we do not have an exiting connection (tcon) */ static char * -build_unc_path_to_root(const struct smb_vol *volume_info, +build_unc_path_to_root(const struct smb_vol *vol, const struct cifs_sb_info *cifs_sb) { - char *full_path; + char *full_path, *pos; + unsigned int pplen = vol->prepath ? strlen(vol->prepath) : 0; + unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); - int unc_len = strnlen(volume_info->UNC, MAX_TREE_SIZE + 1); - full_path = kmalloc(unc_len + 1, GFP_KERNEL); + full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); if (full_path == NULL) return ERR_PTR(-ENOMEM); - strncpy(full_path, volume_info->UNC, unc_len); - full_path[unc_len] = 0; /* add trailing null */ + strncpy(full_path, vol->UNC, unc_len); + pos = full_path + unc_len; + + if (pplen) { + strncpy(pos, vol->prepath, pplen); + pos += pplen; + } + + *pos = '\0'; /* add trailing null */ convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); + cFYI(1, "%s: full_path=%s", __func__, full_path); return full_path; } @@ -2899,15 +2922,18 @@ expand_dfs_referral(int xid, struct cifs_ses *pSesInfo, &fake_devname); free_dfs_info_array(referrals, num_referrals); - kfree(fake_devname); - - if (cifs_sb->mountdata != NULL) - kfree(cifs_sb->mountdata); if (IS_ERR(mdata)) { rc = PTR_ERR(mdata); mdata = NULL; + } else { + cleanup_volume_info_contents(volume_info); + memset(volume_info, '\0', sizeof(*volume_info)); + rc = cifs_setup_volume_info(volume_info, mdata, + fake_devname); } + kfree(fake_devname); + kfree(cifs_sb->mountdata); cifs_sb->mountdata = mdata; } kfree(full_path); @@ -2915,29 +2941,20 @@ expand_dfs_referral(int xid, struct cifs_ses *pSesInfo, } #endif -int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data, - const char *devname) +static int +cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, + const char *devname) { - struct smb_vol *volume_info; int rc = 0; - *pvolume_info = NULL; - - volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); - if (!volume_info) { - rc = -ENOMEM; - goto out; - } - - if (cifs_parse_mount_options(mount_data, devname, - volume_info)) { - rc = -EINVAL; - goto out; - } + if (cifs_parse_mount_options(mount_data, devname, volume_info)) + return -EINVAL; if (volume_info->nullauth) { cFYI(1, "null user"); - volume_info->username = ""; + volume_info->username = kzalloc(1, GFP_KERNEL); + if (volume_info->username == NULL) + return -ENOMEM; } else if (volume_info->username) { /* BB fixme parse for domain name here */ cFYI(1, "Username: %s", volume_info->username); @@ -2945,8 +2962,7 @@ int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data, cifserror("No username specified"); /* In userspace mount helper we can get user name from alternate locations such as env variables and files on disk */ - rc = -EINVAL; - goto out; + return -EINVAL; } /* this is needed for ASCII cp to Unicode converts */ @@ -2958,21 +2974,34 @@ int cifs_setup_volume_info(struct smb_vol **pvolume_info, char *mount_data, if (volume_info->local_nls == NULL) { cERROR(1, "CIFS mount error: iocharset %s not found", volume_info->iocharset); - rc = -ELIBACC; - goto out; + return -ELIBACC; } } - *pvolume_info = volume_info; - return rc; -out: - cifs_cleanup_volume_info(&volume_info); return rc; } +struct smb_vol * +cifs_get_volume_info(char *mount_data, const char *devname) +{ + int rc; + struct smb_vol *volume_info; + + volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); + if (!volume_info) + return ERR_PTR(-ENOMEM); + + rc = cifs_setup_volume_info(volume_info, mount_data, devname); + if (rc) { + cifs_cleanup_volume_info(volume_info); + volume_info = ERR_PTR(rc); + } + + return volume_info; +} + int -cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, - struct smb_vol *volume_info, const char *devname) +cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) { int rc = 0; int xid; @@ -2983,6 +3012,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, struct tcon_link *tlink; #ifdef CONFIG_CIFS_DFS_UPCALL int referral_walks_count = 0; +#endif + + rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); + if (rc) + return rc; + + cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; + +#ifdef CONFIG_CIFS_DFS_UPCALL try_mount_again: /* cleanup activities if we're chasing a referral */ if (referral_walks_count) { @@ -2991,7 +3029,6 @@ try_mount_again: else if (pSesInfo) cifs_put_smb_ses(pSesInfo); - cifs_cleanup_volume_info(&volume_info); FreeXid(xid); } #endif @@ -3007,6 +3044,7 @@ try_mount_again: srvTcp = cifs_get_tcp_session(volume_info); if (IS_ERR(srvTcp)) { rc = PTR_ERR(srvTcp); + bdi_destroy(&cifs_sb->bdi); goto out; } @@ -3018,14 +3056,6 @@ try_mount_again: goto mount_fail_check; } - if (pSesInfo->capabilities & CAP_LARGE_FILES) - sb->s_maxbytes = MAX_LFS_FILESIZE; - else - sb->s_maxbytes = MAX_NON_LFS; - - /* BB FIXME fix time_gran to be larger for LANMAN sessions */ - sb->s_time_gran = 100; - /* search for existing tcon to this server share */ tcon = cifs_get_tcon(pSesInfo, volume_info); if (IS_ERR(tcon)) { @@ -3038,7 +3068,7 @@ try_mount_again: if (tcon->ses->capabilities & CAP_UNIX) { /* reset of caps checks mount to see if unix extensions disabled for just this mount */ - reset_cifs_unix_caps(xid, tcon, sb, volume_info); + reset_cifs_unix_caps(xid, tcon, cifs_sb, volume_info); if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) && (le64_to_cpu(tcon->fsUnixInfo.Capability) & CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP)) { @@ -3161,6 +3191,7 @@ mount_fail_check: cifs_put_smb_ses(pSesInfo); else cifs_put_tcp_session(srvTcp); + bdi_destroy(&cifs_sb->bdi); goto out; } @@ -3335,8 +3366,8 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses, return rc; } -int -cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) +void +cifs_umount(struct cifs_sb_info *cifs_sb) { struct rb_root *root = &cifs_sb->tlink_tree; struct rb_node *node; @@ -3357,7 +3388,10 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) } spin_unlock(&cifs_sb->tlink_tree_lock); - return 0; + bdi_destroy(&cifs_sb->bdi); + kfree(cifs_sb->mountdata); + unload_nls(cifs_sb->local_nls); + kfree(cifs_sb); } int cifs_negotiate_protocol(unsigned int xid, struct cifs_ses *ses) diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index 816696621ec9..42e5363b4102 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -92,6 +92,7 @@ static void cifs_fscache_disable_inode_cookie(struct inode *inode) if (cifsi->fscache) { cFYI(1, "%s: (0x%p)", __func__, cifsi->fscache); + fscache_uncache_all_inode_pages(cifsi->fscache, inode); fscache_relinquish_cookie(cifsi->fscache, 1); cifsi->fscache = NULL; } diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 1525d5e662b6..1c5b770c3141 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -90,12 +90,10 @@ smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) sg_init_one(&sgout, out, 8); rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8); - if (rc) { + if (rc) cERROR(1, "could not encrypt crypt key rc: %d\n", rc); - crypto_free_blkcipher(tfm_des); - goto smbhash_err; - } + crypto_free_blkcipher(tfm_des); smbhash_err: return rc; } diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 6cbb3afb36dc..cb140ef293e4 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -43,8 +43,6 @@ const struct file_operations coda_ioctl_operations = { /* the coda pioctl inode ops */ static int coda_ioctl_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; return (mask & MAY_EXEC) ? -EACCES : 0; } diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 2e29abb30f76..095c36f3b612 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -125,7 +125,7 @@ struct ext4_ext_path { * positive retcode - signal for ext4_ext_walk_space(), see below * callback must return valid extent (passed or newly created) */ -typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, +typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t, struct ext4_ext_cache *, struct ext4_extent *, void *); @@ -133,8 +133,11 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, #define EXT_BREAK 1 #define EXT_REPEAT 2 -/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */ -#define EXT_MAX_BLOCK 0xffffffff +/* + * Maximum number of logical blocks in a file; ext4_extent's ee_block is + * __le32. + */ +#define EXT_MAX_BLOCKS 0xffffffff /* * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5199bac7fc62..f815cc81e7a2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1408,7 +1408,7 @@ got_index: /* * ext4_ext_next_allocated_block: - * returns allocated block in subsequent extent or EXT_MAX_BLOCK. + * returns allocated block in subsequent extent or EXT_MAX_BLOCKS. * NOTE: it considers block number from index entry as * allocated block. Thus, index entries have to be consistent * with leaves. @@ -1422,7 +1422,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) depth = path->p_depth; if (depth == 0 && path->p_ext == NULL) - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; while (depth >= 0) { if (depth == path->p_depth) { @@ -1439,12 +1439,12 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) depth--; } - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; } /* * ext4_ext_next_leaf_block: - * returns first allocated block from next leaf or EXT_MAX_BLOCK + * returns first allocated block from next leaf or EXT_MAX_BLOCKS */ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, struct ext4_ext_path *path) @@ -1456,7 +1456,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, /* zero-tree has no leaf blocks at all */ if (depth == 0) - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; /* go to index block */ depth--; @@ -1469,7 +1469,7 @@ static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, depth--; } - return EXT_MAX_BLOCK; + return EXT_MAX_BLOCKS; } /* @@ -1677,13 +1677,13 @@ static unsigned int ext4_ext_check_overlap(struct inode *inode, */ if (b2 < b1) { b2 = ext4_ext_next_allocated_block(path); - if (b2 == EXT_MAX_BLOCK) + if (b2 == EXT_MAX_BLOCKS) goto out; } /* check for wrap through zero on extent logical start block*/ if (b1 + len1 < b1) { - len1 = EXT_MAX_BLOCK - b1; + len1 = EXT_MAX_BLOCKS - b1; newext->ee_len = cpu_to_le16(len1); ret = 1; } @@ -1767,7 +1767,7 @@ repeat: fex = EXT_LAST_EXTENT(eh); next = ext4_ext_next_leaf_block(inode, path); if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) - && next != EXT_MAX_BLOCK) { + && next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %d\n", next); BUG_ON(npath != NULL); npath = ext4_ext_find_extent(inode, next, NULL); @@ -1887,7 +1887,7 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, BUG_ON(func == NULL); BUG_ON(inode == NULL); - while (block < last && block != EXT_MAX_BLOCK) { + while (block < last && block != EXT_MAX_BLOCKS) { num = last - block; /* find extent for this block */ down_read(&EXT4_I(inode)->i_data_sem); @@ -1958,7 +1958,7 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, err = -EIO; break; } - err = func(inode, path, &cbex, ex, cbdata); + err = func(inode, next, &cbex, ex, cbdata); ext4_ext_drop_refs(path); if (err < 0) @@ -2020,7 +2020,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, if (ex == NULL) { /* there is no extent yet, so gap is [0;-] */ lblock = 0; - len = EXT_MAX_BLOCK; + len = EXT_MAX_BLOCKS; ext_debug("cache gap(whole file):"); } else if (block < le32_to_cpu(ex->ee_block)) { lblock = block; @@ -2350,7 +2350,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * never happen because at least one of the end points * needs to be on the edge of the extent. */ - if (end == EXT_MAX_BLOCK) { + if (end == EXT_MAX_BLOCKS - 1) { ext_debug(" bad truncate %u:%u\n", start, end); block = 0; @@ -2398,7 +2398,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * If this is a truncate, this condition * should never happen */ - if (end == EXT_MAX_BLOCK) { + if (end == EXT_MAX_BLOCKS - 1) { ext_debug(" bad truncate %u:%u\n", start, end); err = -EIO; @@ -2478,7 +2478,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, * we need to remove it from the leaf */ if (num == 0) { - if (end != EXT_MAX_BLOCK) { + if (end != EXT_MAX_BLOCKS - 1) { /* * For hole punching, we need to scoot all the * extents up when an extent is removed so that @@ -3699,7 +3699,7 @@ void ext4_ext_truncate(struct inode *inode) last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCK); + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); /* In a multi-transaction truncate, we only make the final * transaction synchronous. @@ -3914,14 +3914,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, /* * Callback function called for each extent to gather FIEMAP information. */ -static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, +static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, struct ext4_ext_cache *newex, struct ext4_extent *ex, void *data) { __u64 logical; __u64 physical; __u64 length; - loff_t size; __u32 flags = 0; int ret = 0; struct fiemap_extent_info *fieinfo = data; @@ -4103,8 +4102,7 @@ found_delayed_extent: if (ex && ext4_ext_is_uninitialized(ex)) flags |= FIEMAP_EXTENT_UNWRITTEN; - size = i_size_read(inode); - if (logical + length >= size) + if (next == EXT_MAX_BLOCKS) flags |= FIEMAP_EXTENT_LAST; ret = fiemap_fill_next_extent(fieinfo, logical, physical, @@ -4347,8 +4345,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, start_blk = start >> inode->i_sb->s_blocksize_bits; last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; - if (last_blk >= EXT_MAX_BLOCK) - last_blk = EXT_MAX_BLOCK-1; + if (last_blk >= EXT_MAX_BLOCKS) + last_blk = EXT_MAX_BLOCKS-1; len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; /* diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a5763e3505ba..e3126c051006 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2634,7 +2634,7 @@ static int ext4_writepage(struct page *page, struct buffer_head *page_bufs = NULL; struct inode *inode = page->mapping->host; - trace_ext4_writepage(inode, page); + trace_ext4_writepage(page); size = i_size_read(inode); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 859f2ae8864e..6ed859d56850 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3578,8 +3578,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, free += next - bit; trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); - trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa, - grp_blk_start + bit, next - bit); + trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit, + next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } @@ -3608,7 +3608,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(sb, pa); + trace_ext4_mb_release_group_pa(pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); @@ -4448,7 +4448,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * @inode: inode * @block: start physical block to free * @count: number of blocks to count - * @metadata: Are these metadata blocks + * @flags: flags used by ext4_free_blocks */ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2b8304bf3c50..f57455a1b1b2 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -1002,12 +1002,12 @@ mext_check_arguments(struct inode *orig_inode, return -EINVAL; } - if ((orig_start > EXT_MAX_BLOCK) || - (donor_start > EXT_MAX_BLOCK) || - (*len > EXT_MAX_BLOCK) || - (orig_start + *len > EXT_MAX_BLOCK)) { + if ((orig_start >= EXT_MAX_BLOCKS) || + (donor_start >= EXT_MAX_BLOCKS) || + (*len > EXT_MAX_BLOCKS) || + (orig_start + *len >= EXT_MAX_BLOCKS)) { ext4_debug("ext4 move extent: Can't handle over [%u] blocks " - "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK, + "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cc5c157aa11d..9ea71aa864b3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2243,6 +2243,12 @@ static void ext4_orphan_cleanup(struct super_block *sb, * in the vfs. ext4 inode has 48 bits of i_block in fsblock units, * so that won't be a limiting factor. * + * However there is other limiting factor. We do store extents in the form + * of starting block and length, hence the resulting length of the extent + * covering maximum file size must fit into on-disk format containers as + * well. Given that length is always by 1 unit bigger than max unit (because + * we count 0 as well) we have to lower the s_maxbytes by one fs block. + * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ static loff_t ext4_max_size(int blkbits, int has_huge_files) @@ -2264,10 +2270,13 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files) upper_limit <<= blkbits; } - /* 32-bit extent-start container, ee_block */ - res = 1LL << 32; + /* + * 32-bit extent-start container, ee_block. We lower the maxbytes + * by one fs block, so ee_len can cover the extent of maximum file + * size + */ + res = (1LL << 32) - 1; res <<= blkbits; - res -= 1; /* Sanity check against vm- & vfs- imposed limits */ if (res > upper_limit) diff --git a/fs/fscache/page.c b/fs/fscache/page.c index a2a5d19ece6a..2f343b4d7a7d 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -954,3 +954,47 @@ void fscache_mark_pages_cached(struct fscache_retrieval *op, pagevec_reinit(pagevec); } EXPORT_SYMBOL(fscache_mark_pages_cached); + +/* + * Uncache all the pages in an inode that are marked PG_fscache, assuming them + * to be associated with the given cookie. + */ +void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, + struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + pgoff_t next; + int i; + + _enter("%p,%p", cookie, inode); + + if (!mapping || mapping->nrpages == 0) { + _leave(" [no pages]"); + return; + } + + pagevec_init(&pvec, 0); + next = 0; + while (next <= (loff_t)-1 && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE) + ) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index = page->index; + + ASSERTCMP(page_index, >=, next); + next = page_index + 1; + + if (PageFsCache(page)) { + __fscache_wait_on_page_write(cookie, page); + __fscache_uncache_page(cookie, page); + } + } + pagevec_release(&pvec); + cond_resched(); + } + + _leave(""); +} +EXPORT_SYMBOL(__fscache_uncache_all_inode_pages); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index b49b55584c84..84a47b709f51 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -500,7 +500,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) out_put_hidden_dir: iput(sbi->hidden_dir); out_put_root: - iput(sbi->alloc_file); + iput(root); out_put_alloc_file: iput(sbi->alloc_file); out_close_cat_tree: diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 3031d81f5f0f..4ac88ff79aa6 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -36,6 +36,7 @@ int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, { DECLARE_COMPLETION_ONSTACK(wait); struct bio *bio; + int ret = 0; bio = bio_alloc(GFP_NOIO, 1); bio->bi_sector = sector; @@ -54,8 +55,10 @@ int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, wait_for_completion(&wait); if (!bio_flagged(bio, BIO_UPTODATE)) - return -EIO; - return 0; + ret = -EIO; + + bio_put(bio); + return ret; } static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) diff --git a/fs/inode.c b/fs/inode.c index 0f7e88a7803f..43566d17d1b8 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -423,7 +423,14 @@ EXPORT_SYMBOL(remove_inode_hash); void end_writeback(struct inode *inode) { might_sleep(); + /* + * We have to cycle tree_lock here because reclaim can be still in the + * process of removing the last page (in __delete_from_page_cache()) + * and we must not free mapping under it. + */ + spin_lock_irq(&inode->i_data.tree_lock); BUG_ON(inode->i_data.nrpages); + spin_unlock_irq(&inode->i_data.tree_lock); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(inode->i_state & I_CLEAR); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 6a79fd0a1a32..2c62c5aae82f 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -97,10 +97,14 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { + /* + * Get our reference so that bh cannot be freed before + * we unlock it + */ + get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __jbd2_journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); } else { @@ -223,8 +227,8 @@ restart: spin_lock(&journal->j_list_lock); goto restart; } + get_bh(bh); if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); @@ -243,7 +247,6 @@ restart: */ released = __jbd2_journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); __brelse(bh); } @@ -284,7 +287,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, int ret = 0; if (buffer_locked(bh)) { - atomic_inc(&bh->b_count); + get_bh(bh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); wait_on_buffer(bh); @@ -316,12 +319,12 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, ret = 1; if (unlikely(buffer_write_io_error(bh))) ret = -EIO; + get_bh(bh); J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); __brelse(bh); } else { /* @@ -554,7 +557,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* * journal_clean_one_cp_list * - * Find all the written-back checkpoint buffers in the given list and release them. + * Find all the written-back checkpoint buffers in the given list and + * release them. * * Called with the journal locked. * Called with j_list_lock held. @@ -663,8 +667,8 @@ out: * checkpoint lists. * * The function returns 1 if it frees the transaction, 0 otherwise. + * The function can free jh and bh. * - * This function is called with the journal locked. * This function is called with j_list_lock held. * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ @@ -684,13 +688,14 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) } journal = transaction->t_journal; + JBUFFER_TRACE(jh, "removing from transaction"); __buffer_unlink(jh); jh->b_cp_transaction = NULL; + jbd2_journal_put_journal_head(jh); if (transaction->t_checkpoint_list != NULL || transaction->t_checkpoint_io_list != NULL) goto out; - JBUFFER_TRACE(jh, "transaction has no more buffers"); /* * There is one special case to worry about: if we have just pulled the @@ -701,10 +706,8 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) * The locking here around t_state is a bit sleazy. * See the comment at the end of jbd2_journal_commit_transaction(). */ - if (transaction->t_state != T_FINISHED) { - JBUFFER_TRACE(jh, "belongs to running/committing transaction"); + if (transaction->t_state != T_FINISHED) goto out; - } /* OK, that was the last buffer for the transaction: we can now safely remove this transaction from the log */ @@ -723,7 +726,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) wake_up(&journal->j_wait_logspace); ret = 1; out: - JBUFFER_TRACE(jh, "exit"); return ret; } @@ -742,6 +744,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jbddirty(jh2bh(jh))); J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + /* Get reference for checkpointing transaction */ + jbd2_journal_grab_journal_head(jh2bh(jh)); jh->b_cp_transaction = transaction; if (!transaction->t_checkpoint_list) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7f21cf3aaf92..eef6979821a4 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -848,10 +848,16 @@ restart_loop: while (commit_transaction->t_forget) { transaction_t *cp_transaction; struct buffer_head *bh; + int try_to_free = 0; jh = commit_transaction->t_forget; spin_unlock(&journal->j_list_lock); bh = jh2bh(jh); + /* + * Get a reference so that bh cannot be freed before we are + * done with it. + */ + get_bh(bh); jbd_lock_bh_state(bh); J_ASSERT_JH(jh, jh->b_transaction == commit_transaction); @@ -914,28 +920,27 @@ restart_loop: __jbd2_journal_insert_checkpoint(jh, commit_transaction); if (is_journal_aborted(journal)) clear_buffer_jbddirty(bh); - JBUFFER_TRACE(jh, "refile for checkpoint writeback"); - __jbd2_journal_refile_buffer(jh); - jbd_unlock_bh_state(bh); } else { J_ASSERT_BH(bh, !buffer_dirty(bh)); - /* The buffer on BJ_Forget list and not jbddirty means + /* + * The buffer on BJ_Forget list and not jbddirty means * it has been freed by this transaction and hence it * could not have been reallocated until this * transaction has committed. *BUT* it could be * reallocated once we have written all the data to * disk and before we process the buffer on BJ_Forget - * list. */ - JBUFFER_TRACE(jh, "refile or unfile freed buffer"); - __jbd2_journal_refile_buffer(jh); - if (!jh->b_transaction) { - jbd_unlock_bh_state(bh); - /* needs a brelse */ - jbd2_journal_remove_journal_head(bh); - release_buffer_page(bh); - } else - jbd_unlock_bh_state(bh); + * list. + */ + if (!jh->b_next_transaction) + try_to_free = 1; } + JBUFFER_TRACE(jh, "refile or unfile buffer"); + __jbd2_journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + if (try_to_free) + release_buffer_page(bh); /* Drops bh reference */ + else + __brelse(bh); cond_resched_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 9a7826990304..0dfa5b598e68 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2078,10 +2078,9 @@ static void journal_free_journal_head(struct journal_head *jh) * When a buffer has its BH_JBD bit set it is immune from being released by * core kernel code, mainly via ->b_count. * - * A journal_head may be detached from its buffer_head when the journal_head's - * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. - * Various places in JBD call jbd2_journal_remove_journal_head() to indicate that the - * journal_head can be dropped if needed. + * A journal_head is detached from its buffer_head when the journal_head's + * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint + * transaction (b_cp_transaction) hold their references to b_jcount. * * Various places in the kernel want to attach a journal_head to a buffer_head * _before_ attaching the journal_head to a transaction. To protect the @@ -2094,17 +2093,16 @@ static void journal_free_journal_head(struct journal_head *jh) * (Attach a journal_head if needed. Increments b_jcount) * struct journal_head *jh = jbd2_journal_add_journal_head(bh); * ... + * (Get another reference for transaction) + * jbd2_journal_grab_journal_head(bh); * jh->b_transaction = xxx; + * (Put original reference) * jbd2_journal_put_journal_head(jh); - * - * Now, the journal_head's b_jcount is zero, but it is safe from being released - * because it has a non-zero b_transaction. */ /* * Give a buffer_head a journal_head. * - * Doesn't need the journal lock. * May sleep. */ struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh) @@ -2168,61 +2166,29 @@ static void __journal_remove_journal_head(struct buffer_head *bh) struct journal_head *jh = bh2jh(bh); J_ASSERT_JH(jh, jh->b_jcount >= 0); - - get_bh(bh); - if (jh->b_jcount == 0) { - if (jh->b_transaction == NULL && - jh->b_next_transaction == NULL && - jh->b_cp_transaction == NULL) { - J_ASSERT_JH(jh, jh->b_jlist == BJ_None); - J_ASSERT_BH(bh, buffer_jbd(bh)); - J_ASSERT_BH(bh, jh2bh(jh) == bh); - BUFFER_TRACE(bh, "remove journal_head"); - if (jh->b_frozen_data) { - printk(KERN_WARNING "%s: freeing " - "b_frozen_data\n", - __func__); - jbd2_free(jh->b_frozen_data, bh->b_size); - } - if (jh->b_committed_data) { - printk(KERN_WARNING "%s: freeing " - "b_committed_data\n", - __func__); - jbd2_free(jh->b_committed_data, bh->b_size); - } - bh->b_private = NULL; - jh->b_bh = NULL; /* debug, really */ - clear_buffer_jbd(bh); - __brelse(bh); - journal_free_journal_head(jh); - } else { - BUFFER_TRACE(bh, "journal_head was locked"); - } + J_ASSERT_JH(jh, jh->b_transaction == NULL); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + J_ASSERT_JH(jh, jh->b_jlist == BJ_None); + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + if (jh->b_frozen_data) { + printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__); + jbd2_free(jh->b_frozen_data, bh->b_size); } + if (jh->b_committed_data) { + printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__); + jbd2_free(jh->b_committed_data, bh->b_size); + } + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_buffer_jbd(bh); + journal_free_journal_head(jh); } /* - * jbd2_journal_remove_journal_head(): if the buffer isn't attached to a transaction - * and has a zero b_jcount then remove and release its journal_head. If we did - * see that the buffer is not used by any transaction we also "logically" - * decrement ->b_count. - * - * We in fact take an additional increment on ->b_count as a convenience, - * because the caller usually wants to do additional things with the bh - * after calling here. - * The caller of jbd2_journal_remove_journal_head() *must* run __brelse(bh) at some - * time. Once the caller has run __brelse(), the buffer is eligible for - * reaping by try_to_free_buffers(). - */ -void jbd2_journal_remove_journal_head(struct buffer_head *bh) -{ - jbd_lock_bh_journal_head(bh); - __journal_remove_journal_head(bh); - jbd_unlock_bh_journal_head(bh); -} - -/* - * Drop a reference on the passed journal_head. If it fell to zero then try to + * Drop a reference on the passed journal_head. If it fell to zero then * release the journal_head from the buffer_head. */ void jbd2_journal_put_journal_head(struct journal_head *jh) @@ -2232,11 +2198,12 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) jbd_lock_bh_journal_head(bh); J_ASSERT_JH(jh, jh->b_jcount > 0); --jh->b_jcount; - if (!jh->b_jcount && !jh->b_transaction) { + if (!jh->b_jcount) { __journal_remove_journal_head(bh); + jbd_unlock_bh_journal_head(bh); __brelse(bh); - } - jbd_unlock_bh_journal_head(bh); + } else + jbd_unlock_bh_journal_head(bh); } /* diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 3eec82d32fd4..2d7109414cdd 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -30,6 +30,7 @@ #include <linux/module.h> static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); +static void __jbd2_journal_unfile_buffer(struct journal_head *jh); /* * jbd2_get_transaction: obtain a new transaction_t object. @@ -764,7 +765,6 @@ repeat: if (!jh->b_transaction) { JBUFFER_TRACE(jh, "no transaction"); J_ASSERT_JH(jh, !jh->b_next_transaction); - jh->b_transaction = transaction; JBUFFER_TRACE(jh, "file as BJ_Reserved"); spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); @@ -814,7 +814,6 @@ out: * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. * @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes - * @credits: variable that will receive credits for the buffer * * Returns an error code or 0 on success. * @@ -896,8 +895,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) * committed and so it's safe to clear the dirty bit. */ clear_buffer_dirty(jh2bh(jh)); - jh->b_transaction = transaction; - /* first access by this transaction */ jh->b_modified = 0; @@ -932,7 +929,6 @@ out: * non-rewindable consequences * @handle: transaction * @bh: buffer to undo - * @credits: store the number of taken credits here (if not NULL) * * Sometimes there is a need to distinguish between metadata which has * been committed to disk and that which has not. The ext3fs code uses @@ -1232,8 +1228,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); } else { __jbd2_journal_unfile_buffer(jh); - jbd2_journal_remove_journal_head(bh); - __brelse(bh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); @@ -1556,19 +1550,32 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) mark_buffer_dirty(bh); /* Expose it to the VM */ } -void __jbd2_journal_unfile_buffer(struct journal_head *jh) +/* + * Remove buffer from all transactions. + * + * Called with bh_state lock and j_list_lock + * + * jh and bh may be already freed when this function returns. + */ +static void __jbd2_journal_unfile_buffer(struct journal_head *jh) { __jbd2_journal_temp_unlink_buffer(jh); jh->b_transaction = NULL; + jbd2_journal_put_journal_head(jh); } void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) { - jbd_lock_bh_state(jh2bh(jh)); + struct buffer_head *bh = jh2bh(jh); + + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); + jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); __jbd2_journal_unfile_buffer(jh); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(jh2bh(jh)); + jbd_unlock_bh_state(bh); + __brelse(bh); } /* @@ -1595,8 +1602,6 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); __jbd2_journal_remove_checkpoint(jh); - jbd2_journal_remove_journal_head(bh); - __brelse(bh); } } spin_unlock(&journal->j_list_lock); @@ -1659,7 +1664,6 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, /* * We take our own ref against the journal_head here to avoid * having to add tons of locking around each instance of - * jbd2_journal_remove_journal_head() and * jbd2_journal_put_journal_head(). */ jh = jbd2_journal_grab_journal_head(bh); @@ -1697,10 +1701,9 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) int may_free = 1; struct buffer_head *bh = jh2bh(jh); - __jbd2_journal_unfile_buffer(jh); - if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + __jbd2_journal_temp_unlink_buffer(jh); /* * We don't want to write the buffer anymore, clear the * bit so that we don't confuse checks in @@ -1711,8 +1714,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); - jbd2_journal_remove_journal_head(bh); - __brelse(bh); + __jbd2_journal_unfile_buffer(jh); } return may_free; } @@ -1990,6 +1992,8 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, if (jh->b_transaction) __jbd2_journal_temp_unlink_buffer(jh); + else + jbd2_journal_grab_journal_head(bh); jh->b_transaction = transaction; switch (jlist) { @@ -2041,9 +2045,10 @@ void jbd2_journal_file_buffer(struct journal_head *jh, * already started to be used by a subsequent transaction, refile the * buffer on that transaction's metadata list. * - * Called under journal->j_list_lock - * + * Called under j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)) + * + * jh and bh may be already free when this function returns */ void __jbd2_journal_refile_buffer(struct journal_head *jh) { @@ -2067,6 +2072,11 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) was_dirty = test_clear_buffer_jbddirty(bh); __jbd2_journal_temp_unlink_buffer(jh); + /* + * We set b_transaction here because b_next_transaction will inherit + * our jh reference and thus __jbd2_journal_file_buffer() must not + * take a new one. + */ jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; if (buffer_freed(bh)) @@ -2083,30 +2093,21 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) } /* - * For the unlocked version of this call, also make sure that any - * hanging journal_head is cleaned up if necessary. - * - * __jbd2_journal_refile_buffer is usually called as part of a single locked - * operation on a buffer_head, in which the caller is probably going to - * be hooking the journal_head onto other lists. In that case it is up - * to the caller to remove the journal_head if necessary. For the - * unlocked jbd2_journal_refile_buffer call, the caller isn't going to be - * doing anything else to the buffer so we need to do the cleanup - * ourselves to avoid a jh leak. - * - * *** The journal_head may be freed by this call! *** + * __jbd2_journal_refile_buffer() with necessary locking added. We take our + * bh reference so that we can safely unlock bh. + * + * The jh and bh may be freed by this call. */ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) { struct buffer_head *bh = jh2bh(jh); + /* Get reference so that buffer cannot be freed before we unlock it */ + get_bh(bh); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); - __jbd2_journal_refile_buffer(jh); jbd_unlock_bh_state(bh); - jbd2_journal_remove_journal_head(bh); - spin_unlock(&journal->j_list_lock); __brelse(bh); } diff --git a/fs/jfs/file.c b/fs/jfs/file.c index c5ce6c1d1ff4..2f3f531f3606 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -66,9 +66,9 @@ static int jfs_open(struct inode *inode, struct file *file) struct jfs_inode_info *ji = JFS_IP(inode); spin_lock_irq(&ji->ag_lock); if (ji->active_ag == -1) { - ji->active_ag = ji->agno; - atomic_inc( - &JFS_SBI(inode->i_sb)->bmap->db_active[ji->agno]); + struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); + ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); + atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]); } spin_unlock_irq(&ji->ag_lock); } diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index ed53a4740168..b78b2f978f04 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -397,7 +397,7 @@ int diRead(struct inode *ip) release_metapage(mp); /* set the ag for the inode */ - JFS_IP(ip)->agno = BLKTOAG(agstart, sbi); + JFS_IP(ip)->agstart = agstart; JFS_IP(ip)->active_ag = -1; return (rc); @@ -901,7 +901,7 @@ int diFree(struct inode *ip) /* get the allocation group for this ino. */ - agno = JFS_IP(ip)->agno; + agno = BLKTOAG(JFS_IP(ip)->agstart, JFS_SBI(ip->i_sb)); /* Lock the AG specific inode map information */ @@ -1315,12 +1315,11 @@ int diFree(struct inode *ip) static inline void diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) { - struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); struct jfs_inode_info *jfs_ip = JFS_IP(ip); ip->i_ino = (iagno << L2INOSPERIAG) + ino; jfs_ip->ixpxd = iagp->inoext[extno]; - jfs_ip->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); + jfs_ip->agstart = le64_to_cpu(iagp->agstart); jfs_ip->active_ag = -1; } @@ -1379,7 +1378,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip) */ /* get the ag number of this iag */ - agno = JFS_IP(pip)->agno; + agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb)); if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) { /* @@ -2921,10 +2920,9 @@ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) continue; } - /* agstart that computes to the same ag is treated as same; */ agstart = le64_to_cpu(iagp->agstart); - /* iagp->agstart = agstart & ~(mp->db_agsize - 1); */ n = agstart >> mp->db_agl2size; + iagp->agstart = cpu_to_le64((s64)n << mp->db_agl2size); /* compute backed inodes */ numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts)) diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index 1439f119ec83..584a4a1a6e81 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -50,8 +50,9 @@ struct jfs_inode_info { short btindex; /* btpage entry index*/ struct inode *ipimap; /* inode map */ unsigned long cflag; /* commit flags */ + u64 agstart; /* agstart of the containing IAG */ u16 bxflag; /* xflag of pseudo buffer? */ - unchar agno; /* ag number */ + unchar pad; signed char active_ag; /* ag currently allocating from */ lid_t blid; /* lid of pseudo buffer? */ lid_t atlhead; /* anonymous tlock list head */ diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index 8ea5efb5a34e..8d0c1c7c0820 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -80,7 +80,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) int log_formatted = 0; struct inode *iplist[1]; struct jfs_superblock *j_sb, *j_sb2; - uint old_agsize; + s64 old_agsize; int agsizechanged = 0; struct buffer_head *bh, *bh2; diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index adb45ec9038c..e374050a911c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -708,7 +708,13 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) if (task->tk_status < 0) { dprintk("lockd: unlock failed (err = %d)\n", -task->tk_status); - goto retry_rebind; + switch (task->tk_status) { + case -EACCES: + case -EIO: + goto die; + default: + goto retry_rebind; + } } if (status == NLM_LCK_DENIED_GRACE_PERIOD) { rpc_delay(task, NLMCLNT_GRACE_WAIT); diff --git a/fs/locks.c b/fs/locks.c index 0a4f50dfadfb..b286539d547a 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -160,10 +160,28 @@ EXPORT_SYMBOL_GPL(unlock_flocks); static struct kmem_cache *filelock_cache __read_mostly; +static void locks_init_lock_always(struct file_lock *fl) +{ + fl->fl_next = NULL; + fl->fl_fasync = NULL; + fl->fl_owner = NULL; + fl->fl_pid = 0; + fl->fl_nspid = NULL; + fl->fl_file = NULL; + fl->fl_flags = 0; + fl->fl_type = 0; + fl->fl_start = fl->fl_end = 0; +} + /* Allocate an empty lock structure. */ struct file_lock *locks_alloc_lock(void) { - return kmem_cache_alloc(filelock_cache, GFP_KERNEL); + struct file_lock *fl = kmem_cache_alloc(filelock_cache, GFP_KERNEL); + + if (fl) + locks_init_lock_always(fl); + + return fl; } EXPORT_SYMBOL_GPL(locks_alloc_lock); @@ -200,17 +218,9 @@ void locks_init_lock(struct file_lock *fl) INIT_LIST_HEAD(&fl->fl_link); INIT_LIST_HEAD(&fl->fl_block); init_waitqueue_head(&fl->fl_wait); - fl->fl_next = NULL; - fl->fl_fasync = NULL; - fl->fl_owner = NULL; - fl->fl_pid = 0; - fl->fl_nspid = NULL; - fl->fl_file = NULL; - fl->fl_flags = 0; - fl->fl_type = 0; - fl->fl_start = fl->fl_end = 0; fl->fl_ops = NULL; fl->fl_lmops = NULL; + locks_init_lock_always(fl); } EXPORT_SYMBOL(locks_init_lock); diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 9ed89d1663f8..1afae26cf236 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -555,13 +555,6 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry, return __logfs_create(dir, dentry, inode, target, destlen); } -static int logfs_permission(struct inode *inode, int mask, unsigned int flags) -{ - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - return generic_permission(inode, mask, flags, NULL); -} - static int logfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { @@ -820,7 +813,6 @@ const struct inode_operations logfs_dir_iops = { .mknod = logfs_mknod, .rename = logfs_rename, .rmdir = logfs_rmdir, - .permission = logfs_permission, .symlink = logfs_symlink, .unlink = logfs_unlink, }; diff --git a/fs/namei.c b/fs/namei.c index 9e425e7e6c8f..0223c41fb114 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -238,7 +238,8 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags, /* * Read/write DACs are always overridable. - * Executable DACs are overridable if at least one exec bit is set. + * Executable DACs are overridable for all directories and + * for non-directories that have least one exec bit set. */ if (!(mask & MAY_EXEC) || execute_ok(inode)) if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE)) @@ -1011,9 +1012,6 @@ failed: * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. - * - * Care must be taken as namespace_sem may be held (indicated by mounting_here - * being true). */ int follow_down(struct path *path) { diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index ce153a6b3aec..419119c371bf 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -259,12 +259,10 @@ static void nfs_fscache_disable_inode_cookie(struct inode *inode) dfprintk(FSCACHE, "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode)); - /* Need to invalidate any mapped pages that were read in before - * turning off the cache. + /* Need to uncache any pages attached to this inode that + * fscache knows about before turning off the cache. */ - if (inode->i_mapping && inode->i_mapping->nrpages) - invalidate_inode_pages2(inode->i_mapping); - + fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode); nfs_fscache_zap_inode_cookie(inode); } } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 144f2a3c7185..6f4850deb272 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -256,7 +256,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) nfs_attr_check_mountpoint(sb, fattr); - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0 && (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) + if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) && + !nfs_attr_use_mounted_on_fileid(fattr)) goto out_no_inode; if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; @@ -1294,7 +1295,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ - if (nfsi->npages == 0 || new_isize > cur_isize) { + if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) || + new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index b9056cbe68d6..2a55347a2daa 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -45,6 +45,17 @@ static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; } +static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr) +{ + if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) || + (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) && + ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0))) + return 0; + + fattr->fileid = fattr->mounted_on_fileid; + return 1; +} + struct nfs_clone_mount { const struct super_block *sb; const struct dentry *dentry; diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 426908809c97..0bafcc91c27f 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -30,6 +30,7 @@ */ #include <linux/nfs_fs.h> +#include <linux/nfs_page.h> #include "internal.h" #include "nfs4filelayout.h" @@ -552,13 +553,18 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, __func__, nfl_util, fl->num_fh, fl->first_stripe_index, fl->pattern_offset); - if (!fl->num_fh) + /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. + * Futher checking is done in filelayout_check_layout */ + if (fl->num_fh < 0 || fl->num_fh > + max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) goto out_err; - fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), - gfp_flags); - if (!fl->fh_array) - goto out_err; + if (fl->num_fh > 0) { + fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), + gfp_flags); + if (!fl->fh_array) + goto out_err; + } for (i = 0; i < fl->num_fh; i++) { /* Do we want to use a mempool here? */ @@ -661,8 +667,9 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, u64 p_stripe, r_stripe; u32 stripe_unit; - if (!pnfs_generic_pg_test(pgio, prev, req)) - return 0; + if (!pnfs_generic_pg_test(pgio, prev, req) || + !nfs_generic_pg_test(pgio, prev, req)) + return false; if (!pgio->pg_lseg) return 1; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d2c4b59c896d..5879b23e0c99 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2265,12 +2265,14 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, return nfs4_map_errors(status); } +static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); /* * Get locations and (maybe) other attributes of a referral. * Note that we'll actually follow the referral later when * we detect fsid mismatch in inode revalidation */ -static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct nfs_fattr *fattr, struct nfs_fh *fhandle) +static int nfs4_get_referral(struct inode *dir, const struct qstr *name, + struct nfs_fattr *fattr, struct nfs_fh *fhandle) { int status = -ENOMEM; struct page *page = NULL; @@ -2288,15 +2290,16 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct goto out; /* Make sure server returned a different fsid for the referral */ if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) { - dprintk("%s: server did not return a different fsid for a referral at %s\n", __func__, name->name); + dprintk("%s: server did not return a different fsid for" + " a referral at %s\n", __func__, name->name); status = -EIO; goto out; } + /* Fixup attributes for the nfs_lookup() call to nfs_fhget() */ + nfs_fixup_referral_attributes(&locations->fattr); + /* replace the lookup nfs_fattr with the locations nfs_fattr */ memcpy(fattr, &locations->fattr, sizeof(struct nfs_fattr)); - fattr->valid |= NFS_ATTR_FATTR_V4_REFERRAL; - if (!fattr->mode) - fattr->mode = S_IFDIR; memset(fhandle, 0, sizeof(struct nfs_fh)); out: if (page) @@ -4667,11 +4670,15 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, return len; } +/* + * nfs_fhget will use either the mounted_on_fileid or the fileid + */ static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr) { - if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) && - (fattr->valid & NFS_ATTR_FATTR_FSID) && - (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) + if (!(((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) || + (fattr->valid & NFS_ATTR_FATTR_FILEID)) && + (fattr->valid & NFS_ATTR_FATTR_FSID) && + (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL))) return; fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE | @@ -4686,7 +4693,6 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs_server *server = NFS_SERVER(dir); u32 bitmask[2] = { [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, - [1] = FATTR4_WORD1_MOUNTED_ON_FILEID, }; struct nfs4_fs_locations_arg args = { .dir_fh = NFS_FH(dir), @@ -4705,11 +4711,18 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, int status; dprintk("%s: start\n", __func__); + + /* Ask for the fileid of the absent filesystem if mounted_on_fileid + * is not supported */ + if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) + bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + else + bitmask[0] |= FATTR4_WORD0_FILEID; + nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; fs_locations->nlocations = 0; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); - nfs_fixup_referral_attributes(&fs_locations->fattr); dprintk("%s: returned status = %d\n", __func__, status); return status; } @@ -5098,7 +5111,6 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) if (mxresp_sz == 0) mxresp_sz = NFS_MAX_FILE_IO_SIZE; /* Fore channel attributes */ - args->fc_attrs.headerpadsz = 0; args->fc_attrs.max_rqst_sz = mxrqst_sz; args->fc_attrs.max_resp_sz = mxresp_sz; args->fc_attrs.max_ops = NFS4_MAX_OPS; @@ -5111,7 +5123,6 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args) args->fc_attrs.max_ops, args->fc_attrs.max_reqs); /* Back channel attributes */ - args->bc_attrs.headerpadsz = 0; args->bc_attrs.max_rqst_sz = PAGE_SIZE; args->bc_attrs.max_resp_sz = PAGE_SIZE; args->bc_attrs.max_resp_sz_cached = 0; @@ -5131,8 +5142,6 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args struct nfs4_channel_attrs *sent = &args->fc_attrs; struct nfs4_channel_attrs *rcvd = &session->fc_attrs; - if (rcvd->headerpadsz > sent->headerpadsz) - return -EINVAL; if (rcvd->max_resp_sz > sent->max_resp_sz) return -EINVAL; /* @@ -5697,6 +5706,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct nfs_server *server; + struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; dprintk("--> %s\n", __func__); @@ -5708,16 +5718,15 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) nfs_restart_rpc(task, lrp->clp); return; } + spin_lock(&lo->plh_inode->i_lock); if (task->tk_status == 0) { - struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout; - if (lrp->res.lrs_present) { - spin_lock(&lo->plh_inode->i_lock); pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); - spin_unlock(&lo->plh_inode->i_lock); } else BUG_ON(!list_empty(&lo->plh_segs)); } + lo->plh_block_lgets--; + spin_unlock(&lo->plh_inode->i_lock); dprintk("<-- %s\n", __func__); } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index d869a5e5464b..6870bc61ceec 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -255,7 +255,7 @@ static int nfs4_stat_to_errno(int); #define decode_fs_locations_maxsz \ (0) #define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) -#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 4 + (NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN))) +#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4)) #if defined(CONFIG_NFS_V4_1) #define NFS4_MAX_MACHINE_NAME_LEN (64) @@ -1725,7 +1725,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(args->flags); /*flags */ /* Fore Channel */ - *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(0); /* header padding size */ *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ *p++ = cpu_to_be32(max_resp_sz_cached); /* Max resp sz cached */ @@ -1734,7 +1734,7 @@ static void encode_create_session(struct xdr_stream *xdr, *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ /* Back Channel */ - *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(0); /* header padding size */ *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */ *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */ *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ @@ -3098,7 +3098,7 @@ out_overflow: return -EIO; } -static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap) +static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res) { __be32 *p; @@ -3109,7 +3109,7 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap) if (unlikely(!p)) goto out_overflow; bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; - return -be32_to_cpup(p); + *res = -be32_to_cpup(p); } return 0; out_overflow: @@ -4070,6 +4070,7 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, int status; umode_t fmode = 0; uint32_t type; + int32_t err; status = decode_attr_type(xdr, bitmap, &type); if (status < 0) @@ -4095,13 +4096,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; - status = decode_attr_error(xdr, bitmap); - if (status == -NFS4ERR_WRONGSEC) { - nfs_fixup_secinfo_attributes(fattr, fh); - status = 0; - } + err = 0; + status = decode_attr_error(xdr, bitmap, &err); if (status < 0) goto xdr_error; + if (err == -NFS4ERR_WRONGSEC) + nfs_fixup_secinfo_attributes(fattr, fh); status = decode_attr_filehandle(xdr, bitmap, fh); if (status < 0) @@ -4997,12 +4997,14 @@ static int decode_chan_attrs(struct xdr_stream *xdr, struct nfs4_channel_attrs *attrs) { __be32 *p; - u32 nr_attrs; + u32 nr_attrs, val; p = xdr_inline_decode(xdr, 28); if (unlikely(!p)) goto out_overflow; - attrs->headerpadsz = be32_to_cpup(p++); + val = be32_to_cpup(p++); /* headerpadsz */ + if (val) + return -EINVAL; /* no support for header padding yet */ attrs->max_rqst_sz = be32_to_cpup(p++); attrs->max_resp_sz = be32_to_cpup(p++); attrs->max_resp_sz_cached = be32_to_cpup(p++); diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 9cf208df1f25..8ff2ea3f10ef 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -108,7 +108,6 @@ _dev_list_add(const struct nfs_server *nfss, de = n; } - atomic_inc(&de->id_node.ref); return de; } @@ -1001,6 +1000,9 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, if (!pnfs_generic_pg_test(pgio, prev, req)) return false; + if (pgio->pg_lseg == NULL) + return true; + return pgio->pg_count + req->wb_bytes <= OBJIO_LSEG(pgio->pg_lseg)->max_io_size; } diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index dc3956c0de80..1d06f8e2adea 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -291,7 +291,7 @@ objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) struct nfs_read_data *rdata; state->status = status; - dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof); + dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); rdata = state->rpcdata; rdata->task.tk_status = status; if (status >= 0) { diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 7913961aff22..009855716286 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -204,7 +204,7 @@ nfs_wait_on_request(struct nfs_page *req) TASK_UNINTERRUPTIBLE); } -static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) +bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req) { /* * FIXME: ideally we should be able to coalesce all requests @@ -218,6 +218,7 @@ static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_p return desc->pg_count + req->wb_bytes <= desc->pg_bsize; } +EXPORT_SYMBOL_GPL(nfs_generic_pg_test); /** * nfs_pageio_init - initialise a page io descriptor diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 8c1309d852a6..29c0ca7fc347 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -634,14 +634,16 @@ _pnfs_return_layout(struct inode *ino) spin_lock(&ino->i_lock); lo = nfsi->layout; - if (!lo || !mark_matching_lsegs_invalid(lo, &tmp_list, NULL)) { + if (!lo) { spin_unlock(&ino->i_lock); - dprintk("%s: no layout segments to return\n", __func__); - goto out; + dprintk("%s: no layout to return\n", __func__); + return status; } stateid = nfsi->layout->plh_stateid; /* Reference matched in nfs4_layoutreturn_release */ get_layout_hdr(lo); + mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + lo->plh_block_lgets++; spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); @@ -650,6 +652,9 @@ _pnfs_return_layout(struct inode *ino) lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); if (unlikely(lrp == NULL)) { status = -ENOMEM; + set_bit(NFS_LAYOUT_RW_FAILED, &lo->plh_flags); + set_bit(NFS_LAYOUT_RO_FAILED, &lo->plh_flags); + put_layout_hdr(lo); goto out; } @@ -887,7 +892,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, ret = get_lseg(lseg); break; } - if (cmp_layout(range, &lseg->pls_range) > 0) + if (lseg->pls_range.offset > range->offset) break; } @@ -1059,23 +1064,36 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, gfp_flags = GFP_NOFS; } - if (pgio->pg_count == prev->wb_bytes) { + if (pgio->pg_lseg == NULL) { + if (pgio->pg_count != prev->wb_bytes) + return true; /* This is first coelesce call for a series of nfs_pages */ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, prev->wb_context, - req_offset(req), + req_offset(prev), pgio->pg_count, access_type, gfp_flags); - return true; + if (pgio->pg_lseg == NULL) + return true; } - if (pgio->pg_lseg && - req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset, - pgio->pg_lseg->pls_range.length)) - return false; - - return true; + /* + * Test if a nfs_page is fully contained in the pnfs_layout_range. + * Note that this test makes several assumptions: + * - that the previous nfs_page in the struct nfs_pageio_descriptor + * is known to lie within the range. + * - that the nfs_page being tested is known to be contiguous with the + * previous nfs_page. + * - Layout ranges are page aligned, so we only have to test the + * start offset of the request. + * + * Please also note that 'end_offset' is actually the offset of the + * first byte that lies outside the pnfs_layout_range. FIXME? + * + */ + return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset, + pgio->pg_lseg->pls_range.length); } EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 48d0a8e4d062..96bf4e6f45be 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -186,6 +186,7 @@ int pnfs_ld_read_done(struct nfs_read_data *); /* pnfs_dev.c */ struct nfs4_deviceid_node { struct hlist_node node; + struct hlist_node tmpnode; const struct pnfs_layoutdriver_type *ld; const struct nfs_client *nfs_client; struct nfs4_deviceid deviceid; diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index c65e133ce9c0..f0f8e1e22f6c 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -174,6 +174,7 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, const struct nfs4_deviceid *id) { INIT_HLIST_NODE(&d->node); + INIT_HLIST_NODE(&d->tmpnode); d->ld = ld; d->nfs_client = nfs_client; d->deviceid = *id; @@ -208,6 +209,7 @@ nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new) hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); spin_unlock(&nfs4_deviceid_lock); + atomic_inc(&new->ref); return new; } @@ -238,24 +240,29 @@ static void _deviceid_purge_client(const struct nfs_client *clp, long hash) { struct nfs4_deviceid_node *d; - struct hlist_node *n, *next; + struct hlist_node *n; HLIST_HEAD(tmp); + spin_lock(&nfs4_deviceid_lock); rcu_read_lock(); hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node) if (d->nfs_client == clp && atomic_read(&d->ref)) { hlist_del_init_rcu(&d->node); - hlist_add_head(&d->node, &tmp); + hlist_add_head(&d->tmpnode, &tmp); } rcu_read_unlock(); + spin_unlock(&nfs4_deviceid_lock); if (hlist_empty(&tmp)) return; synchronize_rcu(); - hlist_for_each_entry_safe(d, n, next, &tmp, node) + while (!hlist_empty(&tmp)) { + d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); + hlist_del(&d->tmpnode); if (atomic_dec_and_test(&d->ref)) d->ld->free_deviceid_node(d); + } } void @@ -263,8 +270,8 @@ nfs4_deviceid_purge_client(const struct nfs_client *clp) { long h; - spin_lock(&nfs4_deviceid_lock); + if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_MDS)) + return; for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++) _deviceid_purge_client(clp, h); - spin_unlock(&nfs4_deviceid_lock); } diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 18b3e8975fe0..fbb2a5ef5817 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -82,6 +82,7 @@ config NFSD_V4 select NFSD_V3 select FS_POSIX_ACL select SUNRPC_GSS + select CRYPTO help This option enables support in your system's NFS server for version 4 of the NFS protocol (RFC 3530). diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1f5eae40f34e..2b1449dd2f49 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -13,6 +13,7 @@ #include <linux/lockd/lockd.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/gss_api.h> +#include <linux/sunrpc/gss_krb5_enctypes.h> #include "idmap.h" #include "nfsd.h" @@ -189,18 +190,10 @@ static struct file_operations export_features_operations = { .release = single_release, }; -#ifdef CONFIG_SUNRPC_GSS +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) static int supported_enctypes_show(struct seq_file *m, void *v) { - struct gss_api_mech *k5mech; - - k5mech = gss_mech_get_by_name("krb5"); - if (k5mech == NULL) - goto out; - if (k5mech->gm_upcall_enctypes != NULL) - seq_printf(m, k5mech->gm_upcall_enctypes); - gss_mech_put(k5mech); -out: + seq_printf(m, KRB5_SUPPORTED_ENCTYPES); return 0; } @@ -215,7 +208,7 @@ static struct file_operations supported_enctypes_ops = { .llseek = seq_lseek, .release = single_release, }; -#endif /* CONFIG_SUNRPC_GSS */ +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); @@ -1427,9 +1420,9 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, -#ifdef CONFIG_SUNRPC_GSS +#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, -#endif /* CONFIG_SUNRPC_GSS */ +#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d5718273bb32..fd0acca5370a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -696,7 +696,15 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor } #endif /* CONFIG_NFSD_V3 */ +static int nfsd_open_break_lease(struct inode *inode, int access) +{ + unsigned int mode; + if (access & NFSD_MAY_NOT_BREAK_LEASE) + return 0; + mode = (access & NFSD_MAY_WRITE) ? O_WRONLY : O_RDONLY; + return break_lease(inode, mode | O_NONBLOCK); +} /* * Open an existing file or directory. @@ -744,12 +752,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!inode->i_fop) goto out; - /* - * Check to see if there are any leases on this file. - * This may block while leases are broken. - */ - if (!(access & NFSD_MAY_NOT_BREAK_LEASE)) - host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? O_WRONLY : 0)); + host_err = nfsd_open_break_lease(inode, access); if (host_err) /* NOMEM or WOULDBLOCK */ goto out_nfserr; @@ -1660,8 +1663,10 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (!dold->d_inode) goto out_drop_write; host_err = nfsd_break_lease(dold->d_inode); - if (host_err) + if (host_err) { + err = nfserrno(host_err); goto out_drop_write; + } host_err = vfs_link(dold, dirp, dnew); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index b954878ad6ce..b9b45fc2903e 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -801,12 +801,7 @@ out_err: int nilfs_permission(struct inode *inode, int mask, unsigned int flags) { - struct nilfs_root *root; - - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - - root = NILFS_I(inode)->i_root; + struct nilfs_root *root = NILFS_I(inode)->i_root; if ((mask & MAY_WRITE) && root && root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ diff --git a/fs/omfs/file.c b/fs/omfs/file.c index d738a7e493dd..2c6d95257a4d 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -4,7 +4,6 @@ * Released under GPL v2. */ -#include <linux/version.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/buffer_head.h> diff --git a/fs/proc/base.c b/fs/proc/base.c index 14def991d9dd..fc5bc2767692 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2169,11 +2169,7 @@ static const struct file_operations proc_fd_operations = { */ static int proc_fd_permission(struct inode *inode, int mask, unsigned int flags) { - int rv; - - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - rv = generic_permission(inode, mask, flags, NULL); + int rv = generic_permission(inode, mask, flags, NULL); if (rv == 0) return 0; if (task_pid(current) == proc_pid(inode)) @@ -2712,6 +2708,9 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole) struct task_io_accounting acct = task->ioac; unsigned long flags; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + return -EACCES; + if (whole && lock_task_sighand(task, &flags)) { struct task_struct *t = task; @@ -2843,7 +2842,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, proc_tgid_io_accounting), + INF("io", S_IRUSR, proc_tgid_io_accounting), #endif #ifdef CONFIG_HARDWALL INF("hardwall", S_IRUGO, proc_pid_hardwall), @@ -3185,7 +3184,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, proc_tid_io_accounting), + INF("io", S_IRUSR, proc_tid_io_accounting), #endif #ifdef CONFIG_HARDWALL INF("hardwall", S_IRUGO, proc_pid_hardwall), diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index f50133c11c24..d167de365a8d 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -304,9 +304,6 @@ static int proc_sys_permission(struct inode *inode, int mask,unsigned int flags) struct ctl_table *table; int error; - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - /* Executable files are not allowed under /proc/sys/ */ if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) return -EACCES; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e8a62f41b458..d78089690965 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -954,8 +954,6 @@ static int xattr_mount_check(struct super_block *s) int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; /* * We don't do permission checks on the internal objects. * Permissions are determined by the "owning" object. diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index f0511e816967..eed99428f104 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -27,14 +27,18 @@ static unsigned long romfs_get_unmapped_area(struct file *file, { struct inode *inode = file->f_mapping->host; struct mtd_info *mtd = inode->i_sb->s_mtd; - unsigned long isize, offset; + unsigned long isize, offset, maxpages, lpages; if (!mtd) goto cant_map_directly; + /* the mapping mustn't extend beyond the EOF */ + lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; isize = i_size_read(inode); offset = pgoff << PAGE_SHIFT; - if (offset > isize || len > isize || offset > isize - len) + + maxpages = (isize + PAGE_SIZE - 1) >> PAGE_SHIFT; + if ((pgoff >= maxpages) || (maxpages - pgoff < lpages)) return (unsigned long) -EINVAL; /* we need to call down to the MTD layer to do the actual mapping */ diff --git a/fs/timerfd.c b/fs/timerfd.c index f67acbdda5e8..dffeb3795af1 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -61,7 +61,9 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) /* * Called when the clock was set to cancel the timers in the cancel - * list. + * list. This will wake up processes waiting on these timers. The + * wake-up requires ctx->ticks to be non zero, therefore we increment + * it before calling wake_up_locked(). */ void timerfd_clock_was_set(void) { @@ -76,6 +78,7 @@ void timerfd_clock_was_set(void) spin_lock_irqsave(&ctx->wqh.lock, flags); if (ctx->moffs.tv64 != moffs.tv64) { ctx->moffs.tv64 = KTIME_MAX; + ctx->ticks++; wake_up_locked(&ctx->wqh); } spin_unlock_irqrestore(&ctx->wqh.lock, flags); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 8c892c2d5300..529be0582029 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2146,6 +2146,7 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, if (IS_ERR(sb)) { err = PTR_ERR(sb); kfree(c); + goto out_close; } if (sb->s_root) { diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index c86375378810..01d2072fb6d4 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -490,6 +490,13 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) args.whichfork = XFS_ATTR_FORK; /* + * we have no control over the attribute names that userspace passes us + * to remove, so we have to allow the name lookup prior to attribute + * removal to fail. + */ + args.op_flags = XFS_DA_OP_OKNOENT; + + /* * Attach the dquots to the inode. */ error = xfs_qm_dqattach(dp, 0); diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index cb9b6d1469f7..3631783b2b53 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -253,16 +253,21 @@ xfs_iget_cache_hit( rcu_read_lock(); spin_lock(&ip->i_flags_lock); - ip->i_flags &= ~XFS_INEW; - ip->i_flags |= XFS_IRECLAIMABLE; - __xfs_inode_set_reclaim_tag(pag, ip); + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); trace_xfs_iget_reclaim_fail(ip); goto out_error; } spin_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); - ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); + + /* + * Clear the per-lifetime state in the inode as we are now + * effectively a new inode and need to return to the initial + * state before reuse occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; ip->i_flags |= XFS_INEW; __xfs_inode_clear_reclaim_tag(mp, pag, ip); inode->i_state = I_NEW; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3ae6d58e5473..964cfea77686 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -384,6 +384,16 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) #define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ /* + * Per-lifetime flags need to be reset when re-using a reclaimable inode during + * inode lookup. Thi prevents unintended behaviour on the new inode from + * ocurring. + */ +#define XFS_IRECLAIM_RESET_FLAGS \ + (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ + XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \ + XFS_IFILESTREAM); + +/* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) * 1<<16 - 1<<32-1 -- lockdep annotation (integers) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 09983a3344a5..b1e88d56069c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -681,15 +681,15 @@ xfs_inode_item_unlock( * where the cluster buffer may be unpinned before the inode is inserted into * the AIL during transaction committed processing. If the buffer is unpinned * before the inode item has been committed and inserted, then it is possible - * for the buffer to be written and IO completions before the inode is inserted + * for the buffer to be written and IO completes before the inode is inserted * into the AIL. In that case, we'd be inserting a clean, stale inode into the * AIL which will never get removed. It will, however, get reclaimed which * triggers an assert in xfs_inode_free() complaining about freein an inode * still in the AIL. * - * To avoid this, return a lower LSN than the one passed in so that the - * transaction committed code will not move the inode forward in the AIL but - * will still unpin it properly. + * To avoid this, just unpin the inode directly and return a LSN of -1 so the + * transaction committed code knows that it does not need to do any further + * processing on the item. */ STATIC xfs_lsn_t xfs_inode_item_committed( @@ -699,8 +699,10 @@ xfs_inode_item_committed( struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - if (xfs_iflags_test(ip, XFS_ISTALE)) - return lsn - 1; + if (xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_inode_item_unpin(lip, 0); + return -1; + } return lsn; } diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 7c7bc2b786bd..c83f63b33aae 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1361,7 +1361,7 @@ xfs_trans_item_committed( lip->li_flags |= XFS_LI_ABORTED; item_lsn = IOP_COMMITTED(lip, commit_lsn); - /* If the committed routine returns -1, item has been freed. */ + /* item_lsn of -1 means the item needs no further processing */ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) return; @@ -1474,7 +1474,7 @@ xfs_trans_committed_bulk( lip->li_flags |= XFS_LI_ABORTED; item_lsn = IOP_COMMITTED(lip, commit_lsn); - /* item_lsn of -1 means the item was freed */ + /* item_lsn of -1 means the item needs no further processing */ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) continue; diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index b7a5fe7c52c8..619720705bc6 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -960,8 +960,11 @@ xfs_release( * be exposed to that problem. */ truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); - if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) - xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); + if (truncated) { + xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); + if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) + xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); + } } if (ip->i_d.di_nlink == 0) |