diff options
author | Ingo Molnar <mingo@kernel.org> | 2019-12-25 10:41:37 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2019-12-25 10:41:37 +0100 |
commit | 1e5f8a308551b9816588e12bb795aeadebe37c4a (patch) | |
tree | bd71fc796fed24a3b7cc99df4a1d1bdaecc2b387 /fs | |
parent | a5e37de90e67ac1072a9a44bd0cec9f5e98ded08 (diff) | |
parent | 46cf053efec6a3a5f343fead837777efe8252a46 (diff) | |
download | linux-1e5f8a308551b9816588e12bb795aeadebe37c4a.tar.bz2 |
Merge tag 'v5.5-rc3' into sched/core, to pick up fixes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'fs')
179 files changed, 4323 insertions, 2049 deletions
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 4150280509ff..7503899c0a1b 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -136,6 +136,9 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr ASSERTCMP(d_inode(dentry), ==, NULL); + if (flags & LOOKUP_CREATE) + return ERR_PTR(-EOPNOTSUPP); + if (dentry->d_name.len >= AFSNAMEMAX) { _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index f532d6d3bd28..79bc5f1338ed 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -126,7 +126,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (src_as->cell) ctx->cell = afs_get_cell(src_as->cell); - if (size > PAGE_SIZE - 1) + if (size < 2 || size > PAGE_SIZE - 1) return -EINVAL; page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); @@ -140,7 +140,9 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) } buf = kmap(page); - ret = vfs_parse_fs_string(fc, "source", buf, size); + ret = -EINVAL; + if (buf[size - 1] == '.') + ret = vfs_parse_fs_string(fc, "source", buf, size - 1); kunmap(page); put_page(page); if (ret < 0) diff --git a/fs/afs/proc.c b/fs/afs/proc.c index fba2ec3a3a9c..468e1713bce1 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -213,13 +213,14 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) /* Display header on line 1 */ if (v == &cell->proc_volumes) { - seq_puts(m, "USE VID TY\n"); + seq_puts(m, "USE VID TY NAME\n"); return 0; } - seq_printf(m, "%3d %08llx %s\n", + seq_printf(m, "%3d %08llx %s %s\n", atomic_read(&vol->usage), vol->vid, - afs_vol_types[vol->type]); + afs_vol_types[vol->type], + vol->name); return 0; } diff --git a/fs/afs/server.c b/fs/afs/server.c index 1686bf188ccd..b7f3cb2130ca 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -32,18 +32,11 @@ static void afs_dec_servers_outstanding(struct afs_net *net) struct afs_server *afs_find_server(struct afs_net *net, const struct sockaddr_rxrpc *srx) { - const struct sockaddr_in6 *a = &srx->transport.sin6, *b; const struct afs_addr_list *alist; struct afs_server *server = NULL; unsigned int i; - bool ipv6 = true; int seq = 0, diff; - if (srx->transport.sin6.sin6_addr.s6_addr32[0] == 0 || - srx->transport.sin6.sin6_addr.s6_addr32[1] == 0 || - srx->transport.sin6.sin6_addr.s6_addr32[2] == htonl(0xffff)) - ipv6 = false; - rcu_read_lock(); do { @@ -52,7 +45,8 @@ struct afs_server *afs_find_server(struct afs_net *net, server = NULL; read_seqbegin_or_lock(&net->fs_addr_lock, &seq); - if (ipv6) { + if (srx->transport.family == AF_INET6) { + const struct sockaddr_in6 *a = &srx->transport.sin6, *b; hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { alist = rcu_dereference(server->addresses); for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { @@ -68,15 +62,16 @@ struct afs_server *afs_find_server(struct afs_net *net, } } } else { + const struct sockaddr_in *a = &srx->transport.sin, *b; hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) { alist = rcu_dereference(server->addresses); for (i = 0; i < alist->nr_ipv4; i++) { - b = &alist->addrs[i].transport.sin6; - diff = ((u16 __force)a->sin6_port - - (u16 __force)b->sin6_port); + b = &alist->addrs[i].transport.sin; + diff = ((u16 __force)a->sin_port - + (u16 __force)b->sin_port); if (diff == 0) - diff = ((u32 __force)a->sin6_addr.s6_addr32[3] - - (u32 __force)b->sin6_addr.s6_addr32[3]); + diff = ((u32 __force)a->sin_addr.s_addr - + (u32 __force)b->sin_addr.s_addr); if (diff == 0) goto found; } diff --git a/fs/afs/super.c b/fs/afs/super.c index 488641b1a418..7f8a9b3137bf 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -404,6 +404,7 @@ static int afs_test_super(struct super_block *sb, struct fs_context *fc) return (as->net_ns == fc->net_ns && as->volume && as->volume->vid == ctx->volume->vid && + as->cell == ctx->cell && !as->dyn_root); } @@ -448,7 +449,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) /* allocate the root inode and dentry */ if (as->dyn_root) { inode = afs_iget_pseudo_dir(sb, true); - sb->s_flags |= SB_RDONLY; } else { sprintf(sb->s_id, "%llu", as->volume->vid); afs_activate_volume(as->volume); diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 8bcec8dcabb6..054f97b07754 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -63,7 +63,7 @@ struct autofs_info { struct autofs_sb_info *sbi; unsigned long last_used; - atomic_t count; + int count; kuid_t uid; kgid_t gid; diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 91f5787dae7c..a1c7701007e7 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -211,7 +211,7 @@ static int autofs_tree_busy(struct vfsmount *mnt, } } else { struct autofs_info *ino = autofs_dentry_ino(p); - unsigned int ino_count = atomic_read(&ino->count); + unsigned int ino_count = READ_ONCE(ino->count); /* allow for dget above and top is already dgot */ if (p == top) @@ -379,7 +379,7 @@ static struct dentry *should_expire(struct dentry *dentry, /* Not a forced expire? */ if (!(how & AUTOFS_EXP_FORCED)) { /* ref-walk currently on this dentry? */ - ino_count = atomic_read(&ino->count) + 1; + ino_count = READ_ONCE(ino->count) + 1; if (d_count(dentry) > ino_count) return NULL; } @@ -396,7 +396,7 @@ static struct dentry *should_expire(struct dentry *dentry, /* Not a forced expire? */ if (!(how & AUTOFS_EXP_FORCED)) { /* ref-walk currently on this dentry? */ - ino_count = atomic_read(&ino->count) + 1; + ino_count = READ_ONCE(ino->count) + 1; if (d_count(dentry) > ino_count) return NULL; } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 29abafc0ce31..5aaa1732bf1e 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -569,10 +569,9 @@ static int autofs_dir_symlink(struct inode *dir, d_add(dentry, inode); dget(dentry); - atomic_inc(&ino->count); + ino->count++; p_ino = autofs_dentry_ino(dentry->d_parent); - if (p_ino && !IS_ROOT(dentry)) - atomic_inc(&p_ino->count); + p_ino->count++; dir->i_mtime = current_time(dir); @@ -610,11 +609,9 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) if (sbi->flags & AUTOFS_SBI_CATATONIC) return -EACCES; - if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs_dentry_ino(dentry->d_parent); - if (p_ino && !IS_ROOT(dentry)) - atomic_dec(&p_ino->count); - } + ino->count--; + p_ino = autofs_dentry_ino(dentry->d_parent); + p_ino->count--; dput(ino->dentry); d_inode(dentry)->i_size = 0; @@ -660,7 +657,6 @@ static void autofs_set_leaf_automount_flags(struct dentry *dentry) static void autofs_clear_leaf_automount_flags(struct dentry *dentry) { - struct list_head *d_child; struct dentry *parent; /* flags for dentrys in the root are handled elsewhere */ @@ -673,10 +669,7 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry) /* only consider parents below dentrys in the root */ if (IS_ROOT(parent->d_parent)) return; - d_child = &dentry->d_child; - /* Set parent managed if it's becoming empty */ - if (d_child->next == &parent->d_subdirs && - d_child->prev == &parent->d_subdirs) + if (autofs_dentry_ino(parent)->count == 2) managed_dentry_set_managed(parent); } @@ -698,11 +691,10 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) if (sbi->flags & AUTOFS_SBI_CATATONIC) return -EACCES; - spin_lock(&sbi->lookup_lock); - if (!simple_empty(dentry)) { - spin_unlock(&sbi->lookup_lock); + if (ino->count != 1) return -ENOTEMPTY; - } + + spin_lock(&sbi->lookup_lock); __autofs_add_expiring(dentry); d_drop(dentry); spin_unlock(&sbi->lookup_lock); @@ -710,11 +702,9 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) if (sbi->version < 5) autofs_clear_leaf_automount_flags(dentry); - if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) - atomic_dec(&p_ino->count); - } + ino->count--; + p_ino = autofs_dentry_ino(dentry->d_parent); + p_ino->count--; dput(ino->dentry); d_inode(dentry)->i_size = 0; clear_nlink(d_inode(dentry)); @@ -760,10 +750,9 @@ static int autofs_dir_mkdir(struct inode *dir, autofs_set_leaf_automount_flags(dentry); dget(dentry); - atomic_inc(&ino->count); + ino->count++; p_ino = autofs_dentry_ino(dentry->d_parent); - if (p_ino && !IS_ROOT(dentry)) - atomic_inc(&p_ino->count); + p_ino->count++; inc_nlink(dir); dir->i_mtime = current_time(dir); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5372eabd276a..ecd8d2698515 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -404,6 +404,17 @@ static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr) ELF_PAGESTART(cmds[first_idx].p_vaddr); } +static int elf_read(struct file *file, void *buf, size_t len, loff_t pos) +{ + ssize_t rv; + + rv = kernel_read(file, buf, len, &pos); + if (unlikely(rv != len)) { + return (rv < 0) ? rv : -EIO; + } + return 0; +} + /** * load_elf_phdrs() - load ELF program headers * @elf_ex: ELF header of the binary whose program headers should be loaded @@ -418,7 +429,6 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, { struct elf_phdr *elf_phdata = NULL; int retval, err = -1; - loff_t pos = elf_ex->e_phoff; unsigned int size; /* @@ -439,9 +449,9 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, goto out; /* Read in the program headers */ - retval = kernel_read(elf_file, elf_phdata, size, &pos); - if (retval != size) { - err = (retval < 0) ? retval : -EIO; + retval = elf_read(elf_file, elf_phdata, size, elf_ex->e_phoff); + if (retval < 0) { + err = retval; goto out; } @@ -544,7 +554,7 @@ static inline int make_prot(u32 p_flags) an ELF header */ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, - struct file *interpreter, unsigned long *interp_map_addr, + struct file *interpreter, unsigned long no_base, struct elf_phdr *interp_elf_phdata) { struct elf_phdr *eppnt; @@ -590,8 +600,6 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, map_addr = elf_map(interpreter, load_addr + vaddr, eppnt, elf_prot, elf_type, total_size); total_size = 0; - if (!*interp_map_addr) - *interp_map_addr = map_addr; error = map_addr; if (BAD_ADDR(map_addr)) goto out; @@ -722,7 +730,6 @@ static int load_elf_binary(struct linux_binprm *bprm) elf_ppnt = elf_phdata; for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { char *elf_interpreter; - loff_t pos; if (elf_ppnt->p_type != PT_INTERP) continue; @@ -740,14 +747,10 @@ static int load_elf_binary(struct linux_binprm *bprm) if (!elf_interpreter) goto out_free_ph; - pos = elf_ppnt->p_offset; - retval = kernel_read(bprm->file, elf_interpreter, - elf_ppnt->p_filesz, &pos); - if (retval != elf_ppnt->p_filesz) { - if (retval >= 0) - retval = -EIO; + retval = elf_read(bprm->file, elf_interpreter, elf_ppnt->p_filesz, + elf_ppnt->p_offset); + if (retval < 0) goto out_free_interp; - } /* make sure path is NULL terminated */ retval = -ENOEXEC; if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') @@ -766,14 +769,10 @@ static int load_elf_binary(struct linux_binprm *bprm) would_dump(bprm, interpreter); /* Get the exec headers */ - pos = 0; - retval = kernel_read(interpreter, &loc->interp_elf_ex, - sizeof(loc->interp_elf_ex), &pos); - if (retval != sizeof(loc->interp_elf_ex)) { - if (retval >= 0) - retval = -EIO; + retval = elf_read(interpreter, &loc->interp_elf_ex, + sizeof(loc->interp_elf_ex), 0); + if (retval < 0) goto out_free_dentry; - } break; @@ -1054,11 +1053,8 @@ out_free_interp: } if (interpreter) { - unsigned long interp_map_addr = 0; - elf_entry = load_elf_interp(&loc->interp_elf_ex, interpreter, - &interp_map_addr, load_bias, interp_elf_phdata); if (!IS_ERR((void *)elf_entry)) { /* @@ -1179,11 +1175,10 @@ static int load_elf_library(struct file *file) unsigned long elf_bss, bss, len; int retval, error, i, j; struct elfhdr elf_ex; - loff_t pos = 0; error = -ENOEXEC; - retval = kernel_read(file, &elf_ex, sizeof(elf_ex), &pos); - if (retval != sizeof(elf_ex)) + retval = elf_read(file, &elf_ex, sizeof(elf_ex), 0); + if (retval < 0) goto out; if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0) @@ -1208,9 +1203,8 @@ static int load_elf_library(struct file *file) eppnt = elf_phdata; error = -ENOEXEC; - pos = elf_ex.e_phoff; - retval = kernel_read(file, eppnt, j, &pos); - if (retval != j) + retval = elf_read(file, eppnt, j, elf_ex.e_phoff); + if (retval < 0) goto out_free_ph; for (j = 0, i = 0; i<elf_ex.e_phnum; i++) diff --git a/fs/block_dev.c b/fs/block_dev.c index ee63c2732fa2..69bf2fb6f7cd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1531,7 +1531,7 @@ rescan: ret = blk_add_partitions(disk, bdev); if (ret == -EAGAIN) goto rescan; - } else { + } else if (invalidate) { /* * Tell userspace that the media / partition table may have * changed. diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 75b6d10c9845..575636f6491e 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -7,6 +7,7 @@ config BTRFS_FS select LIBCRC32C select CRYPTO_XXHASH select CRYPTO_SHA256 + select CRYPTO_BLAKE2B select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5b6e86aaf2e1..24658b5a5787 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -379,7 +379,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); tm = rb_entry(node, struct tree_mod_elem, node); - if (tm->seq > min_seq) + if (tm->seq >= min_seq) continue; rb_erase(node, tm_root); kfree(tm); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b2e8fd8a8e59..54efb21c2727 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2787,7 +2787,7 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( /* file-item.c */ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); + struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 153f71a5bba9..274318e9114e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1869,8 +1869,8 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, btrfs_pin_extent(fs_info, head->bytenr, head->num_bytes, 1); if (head->is_data) { - ret = btrfs_del_csums(trans, fs_info, head->bytenr, - head->num_bytes); + ret = btrfs_del_csums(trans, fs_info->csum_root, + head->bytenr, head->num_bytes); } } @@ -3175,7 +3175,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (is_data) { - ret = btrfs_del_csums(trans, info, bytenr, num_bytes); + ret = btrfs_del_csums(trans, info->csum_root, bytenr, + num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3799,6 +3800,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, u64 flags, int delalloc) { int ret = 0; + int cache_block_group_error = 0; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group *block_group = NULL; struct find_free_extent_ctl ffe_ctl = {0}; @@ -3958,7 +3960,20 @@ have_block_group: if (unlikely(!ffe_ctl.cached)) { ffe_ctl.have_caching_bg = true; ret = btrfs_cache_block_group(block_group, 0); - BUG_ON(ret < 0); + + /* + * If we get ENOMEM here or something else we want to + * try other block groups, because it may not be fatal. + * However if we can't find anything else we need to + * save our return here so that we return the actual + * error that caused problems, not ENOSPC. + */ + if (ret < 0) { + if (!cache_block_group_error) + cache_block_group_error = ret; + ret = 0; + goto loop; + } ret = 0; } @@ -4045,7 +4060,7 @@ loop: if (ret > 0) goto search; - if (ret == -ENOSPC) { + if (ret == -ENOSPC && !cache_block_group_error) { /* * Use ffe_ctl->total_free_space as fallback if we can't find * any contiguous hole. @@ -4056,6 +4071,8 @@ loop: space_info->max_extent_size = ffe_ctl.max_extent_size; spin_unlock(&space_info->lock); ins->offset = ffe_ctl.max_extent_size; + } else if (ret == -ENOSPC) { + ret = cache_block_group_error; } return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eb8bd0258360..2f4802f405a2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5074,12 +5074,14 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return eb; eb = alloc_dummy_extent_buffer(fs_info, start); if (!eb) - return NULL; + return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; again: ret = radix_tree_preload(GFP_NOFS); - if (ret) + if (ret) { + exists = ERR_PTR(ret); goto free_eb; + } spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, start >> PAGE_SHIFT, eb); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 3270a40b0777..b1bfdc5c1387 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -590,9 +590,9 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, * range of bytes. */ int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, u64 len) + struct btrfs_root *root, u64 bytenr, u64 len) { - struct btrfs_root *root = fs_info->csum_root; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_path *path; struct btrfs_key key; u64 end_byte = bytenr + len; @@ -602,6 +602,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int blocksize_bits = fs_info->sb->s_blocksize_bits; + ASSERT(root == fs_info->csum_root || + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0cb43b682789..8d47c76b7bd1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2599,8 +2599,8 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info) { - u64 clone_len = drop_end - cur_offset; + if (clone_info && drop_end > clone_info->file_offset) { + u64 clone_len = drop_end - clone_info->file_offset; ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, clone_len); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 56032c518b26..e3c76645cad7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5728,7 +5728,6 @@ static void inode_tree_add(struct inode *inode) static void inode_tree_del(struct inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; int empty = 0; @@ -5741,7 +5740,6 @@ static void inode_tree_del(struct inode *inode) spin_unlock(&root->inode_lock); if (empty && btrfs_root_refs(&root->root_item) == 0) { - synchronize_srcu(&fs_info->subvol_srcu); spin_lock(&root->inode_lock); empty = RB_EMPTY_ROOT(&root->inode_tree); spin_unlock(&root->inode_lock); @@ -9556,9 +9554,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, btrfs_init_log_ctx(&ctx_dest, new_inode); /* close the race window with snapshot create/destroy ioctl */ - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) - down_read(&fs_info->subvol_sem); - if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + if (old_ino == BTRFS_FIRST_FREE_OBJECTID || + new_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&fs_info->subvol_sem); /* @@ -9792,9 +9789,8 @@ out_fail: ret = ret ? ret : ret2; } out_notrans: - if (new_ino == BTRFS_FIRST_FREE_OBJECTID) - up_read(&fs_info->subvol_sem); - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + if (new_ino == BTRFS_FIRST_FREE_OBJECTID || + old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); ASSERT(list_empty(&ctx_root.list)); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a1ee0b775e65..18e328ce4b54 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -704,11 +704,17 @@ static noinline int create_subvol(struct inode *dir, btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); ret = btrfs_update_inode(trans, root, dir); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, btrfs_ino(BTRFS_I(dir)), index, name, namelen); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); @@ -3720,24 +3726,18 @@ process_slot: ret = 0; if (last_dest_end < destoff + len) { - struct btrfs_clone_extent_info clone_info = { 0 }; /* - * We have an implicit hole (NO_HOLES feature is enabled) that - * fully or partially overlaps our cloning range at its end. + * We have an implicit hole that fully or partially overlaps our + * cloning range at its end. This means that we either have the + * NO_HOLES feature enabled or the implicit hole happened due to + * mixing buffered and direct IO writes against this file. */ btrfs_release_path(path); path->leave_spinning = 0; - /* - * We are dealing with a hole and our clone_info already has a - * disk_offset of 0, we only need to fill the data length and - * file offset. - */ - clone_info.data_len = destoff + len - last_dest_end; - clone_info.file_offset = last_dest_end; ret = btrfs_punch_hole_range(inode, path, last_dest_end, destoff + len - 1, - &clone_info, &trans); + NULL, &trans); if (ret) goto out; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 93aeb2e539a4..d4282e12f2a6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3232,12 +3232,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { btrfs_warn(fs_info, - "qgroup rescan init failed, qgroup is not enabled"); + "qgroup rescan init failed, qgroup rescan is not queued"); ret = -EINVAL; } else if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) { btrfs_warn(fs_info, - "qgroup rescan init failed, qgroup rescan is not queued"); + "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d897a8e5e430..c58245797f30 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4552,6 +4552,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root = read_fs_root(fs_info, reloc_root->root_key.offset); if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); + list_add_tail(&reloc_root->root_list, &reloc_roots); goto out_free; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ae2db5eb1549..091e5bc8c7ea 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7084,12 +7084,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) spin_unlock(&send_root->root_item_lock); /* - * This is done when we lookup the root, it should already be complete - * by the time we get here. - */ - WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE); - - /* * Userspace tools do the checks and warn the user if it's * not RO. */ diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 1a846bf6e197..914eea5ba6a7 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -452,9 +452,9 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, root->fs_info->tree_root = root; root->node = alloc_test_extent_buffer(root->fs_info, nodesize); - if (!root->node) { + if (IS_ERR(root->node)) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); - ret = -ENOMEM; + ret = PTR_ERR(root->node); goto out; } btrfs_set_header_level(root->node, 0); diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 09aaca1efd62..ac035a6fa003 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -484,9 +484,9 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) * *cough*backref walking code*cough* */ root->node = alloc_test_extent_buffer(root->fs_info, nodesize); - if (!root->node) { + if (IS_ERR(root->node)) { test_err("couldn't allocate dummy buffer"); - ret = -ENOMEM; + ret = PTR_ERR(root->node); goto out; } btrfs_set_header_level(root->node, 0); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 493d4d9e0f79..97f3520b8d98 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -227,7 +227,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, */ if (item_size < BTRFS_FILE_EXTENT_INLINE_DATA_START) { file_extent_err(leaf, slot, - "invalid item size, have %u expect [%lu, %u)", + "invalid item size, have %u expect [%zu, %u)", item_size, BTRFS_FILE_EXTENT_INLINE_DATA_START, SZ_4K); return -EUCLEAN; @@ -332,7 +332,7 @@ static int check_extent_data_item(struct extent_buffer *leaf, } static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, - int slot) + int slot, struct btrfs_key *prev_key) { struct btrfs_fs_info *fs_info = leaf->fs_info; u32 sectorsize = fs_info->sectorsize; @@ -356,6 +356,20 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, btrfs_item_size_nr(leaf, slot), csumsize); return -EUCLEAN; } + if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) { + u64 prev_csum_end; + u32 prev_item_size; + + prev_item_size = btrfs_item_size_nr(leaf, slot - 1); + prev_csum_end = (prev_item_size / csumsize) * sectorsize; + prev_csum_end += prev_key->offset; + if (prev_csum_end > key->offset) { + generic_err(leaf, slot - 1, +"csum end range (%llu) goes beyond the start range (%llu) of the next csum item", + prev_csum_end, key->offset); + return -EUCLEAN; + } + } return 0; } @@ -1355,7 +1369,7 @@ static int check_leaf_item(struct extent_buffer *leaf, ret = check_extent_data_item(leaf, key, slot, prev_key); break; case BTRFS_EXTENT_CSUM_KEY: - ret = check_csum_item(leaf, key, slot); + ret = check_csum_item(leaf, key, slot, prev_key); break; case BTRFS_DIR_ITEM_KEY: case BTRFS_DIR_INDEX_KEY: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6f757361db53..d3f115909ff0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -808,7 +808,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_del_csums(trans, fs_info, + ret = btrfs_del_csums(trans, + fs_info->csum_root, sums->bytenr, sums->len); if (!ret) @@ -3909,6 +3910,28 @@ static int log_inode_item(struct btrfs_trans_handle *trans, return 0; } +static int log_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *log_root, + struct btrfs_ordered_sum *sums) +{ + int ret; + + /* + * Due to extent cloning, we might have logged a csum item that covers a + * subrange of a cloned extent, and later we can end up logging a csum + * item for a larger subrange of the same extent or the entire range. + * This would leave csum items in the log tree that cover the same range + * and break the searches for checksums in the log tree, resulting in + * some checksums missing in the fs/subvolume tree. So just delete (or + * trim and adjust) any existing csum items in the log for this range. + */ + ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len); + if (ret) + return ret; + + return btrfs_csum_file_blocks(trans, log_root, sums); +} + static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *dst_path, @@ -4054,7 +4077,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_csum_file_blocks(trans, log, sums); + ret = log_csums(trans, log, sums); list_del(&sums->list); kfree(sums); } @@ -4274,7 +4297,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_csum_file_blocks(trans, log_root, sums); + ret = log_csums(trans, log_root, sums); list_del(&sums->list); kfree(sums); } @@ -6294,9 +6317,28 @@ again: wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); if (IS_ERR(wc.replay_dest)) { ret = PTR_ERR(wc.replay_dest); + + /* + * We didn't find the subvol, likely because it was + * deleted. This is ok, simply skip this log and go to + * the next one. + * + * We need to exclude the root because we can't have + * other log replays overwriting this log as we'll read + * it back in a few more times. This will keep our + * block from being modified, and we'll just bail for + * each subsequent pass. + */ + if (ret == -ENOENT) + ret = btrfs_pin_extent_for_log_replay(fs_info, + log->node->start, + log->node->len); free_extent_buffer(log->node); free_extent_buffer(log->commit_root); kfree(log); + + if (!ret) + goto next; btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root for tree log recovery."); goto error; @@ -6328,7 +6370,6 @@ again: &root->highest_objectid); } - key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; free_extent_buffer(log->node); free_extent_buffer(log->commit_root); @@ -6336,9 +6377,10 @@ again: if (ret) goto error; - +next: if (found_key.offset == 0) break; + key.offset = found_key.offset - 1; } btrfs_release_path(path); diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 91caab63bdf5..76b84f2397b1 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -324,6 +324,8 @@ again_search_slot: } if (ret < 0 && ret != -ENOENT) goto out; + key.offset++; + goto again_search_slot; } item_size -= sizeof(subid_le); offset += sizeof(subid_le); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d8e5560db285..a6d3f08bfff3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -61,7 +61,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID1C3] = { .sub_stripes = 1, .dev_stripes = 1, - .devs_max = 0, + .devs_max = 3, .devs_min = 3, .tolerated_failures = 2, .devs_increment = 3, @@ -73,7 +73,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID1C4] = { .sub_stripes = 1, .dev_stripes = 1, - .devs_max = 0, + .devs_max = 4, .devs_min = 4, .tolerated_failures = 3, .devs_increment = 4, diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index b2ec29eeb4c4..73f24f307a4a 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -8,6 +8,7 @@ #include <linux/ceph/ceph_debug.h> +#include <linux/fs_context.h> #include "super.h" #include "cache.h" @@ -49,7 +50,7 @@ void ceph_fscache_unregister(void) fscache_unregister_netfs(&ceph_cache_netfs); } -int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc) { const struct ceph_fsid *fsid = &fsc->client->fsid; const char *fscache_uniq = fsc->mount_options->fscache_uniq; @@ -66,8 +67,8 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len)) continue; - pr_err("fscache cookie already registered for fsid %pU\n", fsid); - pr_err(" use fsc=%%s mount option to specify a uniquifier\n"); + errorf(fc, "ceph: fscache cookie already registered for fsid %pU, use fsc=<uniquifier> option", + fsid); err = -EBUSY; goto out_unlock; } @@ -95,7 +96,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc) list_add_tail(&ent->list, &ceph_fscache_list); } else { kfree(ent); - pr_err("unable to register fscache cookie for fsid %pU\n", + errorf(fc, "ceph: unable to register fscache cookie for fsid %pU", fsid); /* all other fs ignore this error */ } diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index e486fac3434d..89dbdd1eb14a 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -16,7 +16,7 @@ extern struct fscache_netfs ceph_cache_netfs; int ceph_fscache_register(void); void ceph_fscache_unregister(void); -int ceph_fscache_register_fs(struct ceph_fs_client* fsc); +int ceph_fscache_register_fs(struct ceph_fs_client* fsc, struct fs_context *fc); void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc); void ceph_fscache_register_inode_cookie(struct inode *inode); @@ -88,7 +88,8 @@ static inline void ceph_fscache_unregister(void) { } -static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc) +static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc, + struct fs_context *fc) { return 0; } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f5a38910a82b..9d09bb53c1ab 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1011,18 +1011,13 @@ static int __ceph_is_single_caps(struct ceph_inode_info *ci) return rb_first(&ci->i_caps) == rb_last(&ci->i_caps); } -static int __ceph_is_any_caps(struct ceph_inode_info *ci) -{ - return !RB_EMPTY_ROOT(&ci->i_caps); -} - int ceph_is_any_caps(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); int ret; spin_lock(&ci->i_ceph_lock); - ret = __ceph_is_any_caps(ci); + ret = __ceph_is_any_real_caps(ci); spin_unlock(&ci->i_ceph_lock); return ret; @@ -1099,15 +1094,16 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) if (removed) ceph_put_cap(mdsc, cap); - /* when reconnect denied, we remove session caps forcibly, - * i_wr_ref can be non-zero. If there are ongoing write, - * keep i_snap_realm. - */ - if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm) - drop_inode_snap_realm(ci); + if (!__ceph_is_any_real_caps(ci)) { + /* when reconnect denied, we remove session caps forcibly, + * i_wr_ref can be non-zero. If there are ongoing write, + * keep i_snap_realm. + */ + if (ci->i_wr_ref == 0 && ci->i_snap_realm) + drop_inode_snap_realm(ci); - if (!__ceph_is_any_real_caps(ci)) __cap_delay_cancel(mdsc, ci); + } } struct cap_msg_args { @@ -2764,7 +2760,19 @@ int ceph_get_caps(struct file *filp, int need, int want, if (ret == -EAGAIN) continue; if (!ret) { + struct ceph_mds_client *mdsc = fsc->mdsc; + struct cap_wait cw; DEFINE_WAIT_FUNC(wait, woken_wake_function); + + cw.ino = inode->i_ino; + cw.tgid = current->tgid; + cw.need = need; + cw.want = want; + + spin_lock(&mdsc->caps_list_lock); + list_add(&cw.list, &mdsc->cap_wait_list); + spin_unlock(&mdsc->caps_list_lock); + add_wait_queue(&ci->i_cap_wq, &wait); flags |= NON_BLOCKING; @@ -2778,6 +2786,11 @@ int ceph_get_caps(struct file *filp, int need, int want, } remove_wait_queue(&ci->i_cap_wq, &wait); + + spin_lock(&mdsc->caps_list_lock); + list_del(&cw.list); + spin_unlock(&mdsc->caps_list_lock); + if (ret == -EAGAIN) continue; } @@ -2928,7 +2941,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) ci->i_head_snapc = NULL; } /* see comment in __ceph_remove_cap() */ - if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) + if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) drop_inode_snap_realm(ci); } spin_unlock(&ci->i_ceph_lock); diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index facb387c2735..c281f32b54f7 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -139,6 +139,7 @@ static int caps_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_mds_client *mdsc = fsc->mdsc; int total, avail, used, reserved, min, i; + struct cap_wait *cw; ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min); seq_printf(s, "total\t\t%d\n" @@ -166,6 +167,18 @@ static int caps_show(struct seq_file *s, void *p) } mutex_unlock(&mdsc->mutex); + seq_printf(s, "\n\nWaiters:\n--------\n"); + seq_printf(s, "tgid ino need want\n"); + seq_printf(s, "-----------------------------------------------------\n"); + + spin_lock(&mdsc->caps_list_lock); + list_for_each_entry(cw, &mdsc->cap_wait_list, list) { + seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino, + ceph_cap_string(cw->need), + ceph_cap_string(cw->want)); + } + spin_unlock(&mdsc->caps_list_lock); + return 0; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a5163296d9d9..374db1bd57d1 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2015,7 +2015,7 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr) if (!nr) return; val = atomic_add_return(nr, &mdsc->cap_reclaim_pending); - if (!(val % CEPH_CAPS_PER_RELEASE)) { + if ((val % CEPH_CAPS_PER_RELEASE) < nr) { atomic_set(&mdsc->cap_reclaim_pending, 0); ceph_queue_cap_reclaim_work(mdsc); } @@ -2032,12 +2032,13 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; size_t size = sizeof(struct ceph_mds_reply_dir_entry); - int order, num_entries; + unsigned int num_entries; + int order; spin_lock(&ci->i_ceph_lock); num_entries = ci->i_files + ci->i_subdirs; spin_unlock(&ci->i_ceph_lock); - num_entries = max(num_entries, 1); + num_entries = max(num_entries, 1U); num_entries = min(num_entries, opt->max_readdir); order = get_order(size * num_entries); @@ -2182,13 +2183,17 @@ retry: } base = ceph_ino(d_inode(temp)); rcu_read_unlock(); - if (pos < 0 || read_seqretry(&rename_lock, seq)) { - pr_err("build_path did not end path lookup where " - "expected, pos is %d\n", pos); - /* presumably this is only possible if racing with a - rename of one of the parent directories (we can not - lock the dentries above us to prevent this, but - retrying should be harmless) */ + + if (read_seqretry(&rename_lock, seq)) + goto retry; + + if (pos < 0) { + /* + * A rename didn't occur, but somehow we didn't end up where + * we thought we would. Throw a warning and try again. + */ + pr_warn("build_path did not end path lookup where " + "expected, pos is %d\n", pos); goto retry; } @@ -2345,6 +2350,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->op = cpu_to_le32(req->r_op); head->caller_uid = cpu_to_le32(from_kuid(&init_user_ns, req->r_uid)); head->caller_gid = cpu_to_le32(from_kgid(&init_user_ns, req->r_gid)); + head->ino = 0; head->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); @@ -4163,6 +4169,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; INIT_LIST_HEAD(&mdsc->cap_delay_list); + INIT_LIST_HEAD(&mdsc->cap_wait_list); spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 5cd131b41d84..14c7e8c49970 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -340,6 +340,14 @@ struct ceph_quotarealm_inode { struct inode *inode; }; +struct cap_wait { + struct list_head list; + unsigned long ino; + pid_t tgid; + int need; + int want; +}; + /* * mds client state */ @@ -416,6 +424,7 @@ struct ceph_mds_client { spinlock_t caps_list_lock; struct list_head caps_list; /* unused (reserved or unreserved) */ + struct list_head cap_wait_list; int caps_total_count; /* total caps allocated */ int caps_use_count; /* in use */ int caps_use_max; /* max used caps */ diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index ce2d00da5096..471bac335fae 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -20,7 +20,7 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) { int n = 0; - int i; + int i, j; /* special case for one mds */ if (1 == m->m_num_mds && m->m_info[0].state > 0) @@ -35,9 +35,12 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) /* pick */ n = prandom_u32() % n; - for (i = 0; n > 0; i++, n--) - while (m->m_info[i].state <= 0) - i++; + for (j = 0, i = 0; i < m->m_num_mds; i++) { + if (m->m_info[i].state > 0) + j++; + if (j > n) + break; + } return i; } @@ -155,6 +158,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) void *pexport_targets = NULL; struct ceph_timespec laggy_since; struct ceph_mds_info *info; + bool laggy; ceph_decode_need(p, end, sizeof(u64) + 1, bad); global_id = ceph_decode_64(p); @@ -187,6 +191,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) if (err) goto corrupt; ceph_decode_copy(p, &laggy_since, sizeof(laggy_since)); + laggy = laggy_since.tv_sec != 0 || laggy_since.tv_nsec != 0; *p += sizeof(u32); ceph_decode_32_safe(p, end, namelen, bad); *p += namelen; @@ -204,10 +209,11 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) *p = info_end; } - dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", + dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s%s\n", i+1, n, global_id, mds, inc, ceph_pr_addr(&addr), - ceph_mds_state_name(state)); + ceph_mds_state_name(state), + laggy ? "(laggy)" : ""); if (mds < 0 || state <= 0) continue; @@ -227,8 +233,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->global_id = global_id; info->state = state; info->addr = addr; - info->laggy = (laggy_since.tv_sec != 0 || - laggy_since.tv_nsec != 0); + info->laggy = laggy; info->num_export_targets = num_export_targets; if (num_export_targets) { info->export_targets = kcalloc(num_export_targets, @@ -352,6 +357,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_damaged = false; } bad_ext: + dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n", + !!m->m_enabled, !!m->m_damaged, m->m_num_laggy); *p = end; dout("mdsmap_decode success epoch %u\n", m->m_epoch); return m; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index b47f43fc2d68..29a795f975df 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -9,7 +9,8 @@ #include <linux/in6.h> #include <linux/module.h> #include <linux/mount.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -138,280 +139,310 @@ enum { Opt_readdir_max_entries, Opt_readdir_max_bytes, Opt_congestion_kb, - Opt_last_int, /* int args above */ Opt_snapdirname, Opt_mds_namespace, - Opt_fscache_uniq, Opt_recover_session, - Opt_last_string, + Opt_source, /* string args above */ Opt_dirstat, - Opt_nodirstat, Opt_rbytes, - Opt_norbytes, Opt_asyncreaddir, - Opt_noasyncreaddir, Opt_dcache, - Opt_nodcache, Opt_ino32, - Opt_noino32, Opt_fscache, - Opt_nofscache, Opt_poolperm, - Opt_nopoolperm, Opt_require_active_mds, - Opt_norequire_active_mds, -#ifdef CONFIG_CEPH_FS_POSIX_ACL Opt_acl, -#endif - Opt_noacl, Opt_quotadf, - Opt_noquotadf, Opt_copyfrom, - Opt_nocopyfrom, }; -static match_table_t fsopt_tokens = { - {Opt_wsize, "wsize=%d"}, - {Opt_rsize, "rsize=%d"}, - {Opt_rasize, "rasize=%d"}, - {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, - {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, - {Opt_caps_max, "caps_max=%d"}, - {Opt_readdir_max_entries, "readdir_max_entries=%d"}, - {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, - {Opt_congestion_kb, "write_congestion_kb=%d"}, - /* int args above */ - {Opt_snapdirname, "snapdirname=%s"}, - {Opt_mds_namespace, "mds_namespace=%s"}, - {Opt_recover_session, "recover_session=%s"}, - {Opt_fscache_uniq, "fsc=%s"}, - /* string args above */ - {Opt_dirstat, "dirstat"}, - {Opt_nodirstat, "nodirstat"}, - {Opt_rbytes, "rbytes"}, - {Opt_norbytes, "norbytes"}, - {Opt_asyncreaddir, "asyncreaddir"}, - {Opt_noasyncreaddir, "noasyncreaddir"}, - {Opt_dcache, "dcache"}, - {Opt_nodcache, "nodcache"}, - {Opt_ino32, "ino32"}, - {Opt_noino32, "noino32"}, - {Opt_fscache, "fsc"}, - {Opt_nofscache, "nofsc"}, - {Opt_poolperm, "poolperm"}, - {Opt_nopoolperm, "nopoolperm"}, - {Opt_require_active_mds, "require_active_mds"}, - {Opt_norequire_active_mds, "norequire_active_mds"}, -#ifdef CONFIG_CEPH_FS_POSIX_ACL - {Opt_acl, "acl"}, -#endif - {Opt_noacl, "noacl"}, - {Opt_quotadf, "quotadf"}, - {Opt_noquotadf, "noquotadf"}, - {Opt_copyfrom, "copyfrom"}, - {Opt_nocopyfrom, "nocopyfrom"}, - {-1, NULL} +enum ceph_recover_session_mode { + ceph_recover_session_no, + ceph_recover_session_clean +}; + +static const struct fs_parameter_enum ceph_mount_param_enums[] = { + { Opt_recover_session, "no", ceph_recover_session_no }, + { Opt_recover_session, "clean", ceph_recover_session_clean }, + {} +}; + +static const struct fs_parameter_spec ceph_mount_param_specs[] = { + fsparam_flag_no ("acl", Opt_acl), + fsparam_flag_no ("asyncreaddir", Opt_asyncreaddir), + fsparam_s32 ("caps_max", Opt_caps_max), + fsparam_u32 ("caps_wanted_delay_max", Opt_caps_wanted_delay_max), + fsparam_u32 ("caps_wanted_delay_min", Opt_caps_wanted_delay_min), + fsparam_u32 ("write_congestion_kb", Opt_congestion_kb), + fsparam_flag_no ("copyfrom", Opt_copyfrom), + fsparam_flag_no ("dcache", Opt_dcache), + fsparam_flag_no ("dirstat", Opt_dirstat), + __fsparam (fs_param_is_string, "fsc", Opt_fscache, + fs_param_neg_with_no | fs_param_v_optional), + fsparam_flag_no ("ino32", Opt_ino32), + fsparam_string ("mds_namespace", Opt_mds_namespace), + fsparam_flag_no ("poolperm", Opt_poolperm), + fsparam_flag_no ("quotadf", Opt_quotadf), + fsparam_u32 ("rasize", Opt_rasize), + fsparam_flag_no ("rbytes", Opt_rbytes), + fsparam_u32 ("readdir_max_bytes", Opt_readdir_max_bytes), + fsparam_u32 ("readdir_max_entries", Opt_readdir_max_entries), + fsparam_enum ("recover_session", Opt_recover_session), + fsparam_flag_no ("require_active_mds", Opt_require_active_mds), + fsparam_u32 ("rsize", Opt_rsize), + fsparam_string ("snapdirname", Opt_snapdirname), + fsparam_string ("source", Opt_source), + fsparam_u32 ("wsize", Opt_wsize), + {} +}; + +static const struct fs_parameter_description ceph_mount_parameters = { + .name = "ceph", + .specs = ceph_mount_param_specs, + .enums = ceph_mount_param_enums, }; -static int parse_fsopt_token(char *c, void *private) +struct ceph_parse_opts_ctx { + struct ceph_options *copts; + struct ceph_mount_options *opts; +}; + +/* + * Parse the source parameter. Distinguish the server list from the path. + * Internally we do not include the leading '/' in the path. + * + * The source will look like: + * <server_spec>[,<server_spec>...]:[<path>] + * where + * <server_spec> is <ip>[:<port>] + * <path> is optional, but if present must begin with '/' + */ +static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) { - struct ceph_mount_options *fsopt = private; - substring_t argstr[MAX_OPT_ARGS]; - int token, intval, ret; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + char *dev_name = param->string, *dev_name_end; + int ret; - token = match_token((char *)c, fsopt_tokens, argstr); - if (token < 0) - return -EINVAL; + dout("%s '%s'\n", __func__, dev_name); + if (!dev_name || !*dev_name) + return invalf(fc, "ceph: Empty source"); - if (token < Opt_last_int) { - ret = match_int(&argstr[0], &intval); - if (ret < 0) { - pr_err("bad option arg (not int) at '%s'\n", c); - return ret; + dev_name_end = strchr(dev_name, '/'); + if (dev_name_end) { + if (strlen(dev_name_end) > 1) { + kfree(fsopt->server_path); + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); + if (!fsopt->server_path) + return -ENOMEM; } - dout("got int token %d val %d\n", token, intval); - } else if (token > Opt_last_int && token < Opt_last_string) { - dout("got string token %d val %s\n", token, - argstr[0].from); } else { - dout("got token %d\n", token); + dev_name_end = dev_name + strlen(dev_name); } + dev_name_end--; /* back up to ':' separator */ + if (dev_name_end < dev_name || *dev_name_end != ':') + return invalf(fc, "ceph: No path or : separator in source"); + + dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); + if (fsopt->server_path) + dout("server path '%s'\n", fsopt->server_path); + + ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name, + pctx->copts, fc); + if (ret) + return ret; + + fc->source = param->string; + param->string = NULL; + return 0; +} + +static int ceph_parse_mount_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + struct fs_parse_result result; + unsigned int mode; + int token, ret; + + ret = ceph_parse_param(param, pctx->copts, fc); + if (ret != -ENOPARAM) + return ret; + + token = fs_parse(fc, &ceph_mount_parameters, param, &result); + dout("%s fs_parse '%s' token %d\n", __func__, param->key, token); + if (token < 0) + return token; + switch (token) { case Opt_snapdirname: kfree(fsopt->snapdir_name); - fsopt->snapdir_name = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - if (!fsopt->snapdir_name) - return -ENOMEM; + fsopt->snapdir_name = param->string; + param->string = NULL; break; case Opt_mds_namespace: kfree(fsopt->mds_namespace); - fsopt->mds_namespace = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - if (!fsopt->mds_namespace) - return -ENOMEM; + fsopt->mds_namespace = param->string; + param->string = NULL; break; case Opt_recover_session: - if (!strncmp(argstr[0].from, "no", - argstr[0].to - argstr[0].from)) { + mode = result.uint_32; + if (mode == ceph_recover_session_no) fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; - } else if (!strncmp(argstr[0].from, "clean", - argstr[0].to - argstr[0].from)) { + else if (mode == ceph_recover_session_clean) fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; - } else { - return -EINVAL; - } - break; - case Opt_fscache_uniq: -#ifdef CONFIG_CEPH_FSCACHE - kfree(fsopt->fscache_uniq); - fsopt->fscache_uniq = kstrndup(argstr[0].from, - argstr[0].to-argstr[0].from, - GFP_KERNEL); - if (!fsopt->fscache_uniq) - return -ENOMEM; - fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; + else + BUG(); break; -#else - pr_err("fscache support is disabled\n"); - return -EINVAL; -#endif + case Opt_source: + if (fc->source) + return invalf(fc, "ceph: Multiple sources specified"); + return ceph_parse_source(param, fc); case Opt_wsize: - if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE) - return -EINVAL; - fsopt->wsize = ALIGN(intval, PAGE_SIZE); + if (result.uint_32 < PAGE_SIZE || + result.uint_32 > CEPH_MAX_WRITE_SIZE) + goto out_of_range; + fsopt->wsize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_rsize: - if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_READ_SIZE) - return -EINVAL; - fsopt->rsize = ALIGN(intval, PAGE_SIZE); + if (result.uint_32 < PAGE_SIZE || + result.uint_32 > CEPH_MAX_READ_SIZE) + goto out_of_range; + fsopt->rsize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_rasize: - if (intval < 0) - return -EINVAL; - fsopt->rasize = ALIGN(intval, PAGE_SIZE); + fsopt->rasize = ALIGN(result.uint_32, PAGE_SIZE); break; case Opt_caps_wanted_delay_min: - if (intval < 1) - return -EINVAL; - fsopt->caps_wanted_delay_min = intval; + if (result.uint_32 < 1) + goto out_of_range; + fsopt->caps_wanted_delay_min = result.uint_32; break; case Opt_caps_wanted_delay_max: - if (intval < 1) - return -EINVAL; - fsopt->caps_wanted_delay_max = intval; + if (result.uint_32 < 1) + goto out_of_range; + fsopt->caps_wanted_delay_max = result.uint_32; break; case Opt_caps_max: - if (intval < 0) - return -EINVAL; - fsopt->caps_max = intval; + if (result.int_32 < 0) + goto out_of_range; + fsopt->caps_max = result.int_32; break; case Opt_readdir_max_entries: - if (intval < 1) - return -EINVAL; - fsopt->max_readdir = intval; + if (result.uint_32 < 1) + goto out_of_range; + fsopt->max_readdir = result.uint_32; break; case Opt_readdir_max_bytes: - if (intval < (int)PAGE_SIZE && intval != 0) - return -EINVAL; - fsopt->max_readdir_bytes = intval; + if (result.uint_32 < PAGE_SIZE && result.uint_32 != 0) + goto out_of_range; + fsopt->max_readdir_bytes = result.uint_32; break; case Opt_congestion_kb: - if (intval < 1024) /* at least 1M */ - return -EINVAL; - fsopt->congestion_kb = intval; + if (result.uint_32 < 1024) /* at least 1M */ + goto out_of_range; + fsopt->congestion_kb = result.uint_32; break; case Opt_dirstat: - fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; - break; - case Opt_nodirstat: - fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT; break; case Opt_rbytes: - fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; - break; - case Opt_norbytes: - fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_RBYTES; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES; break; case Opt_asyncreaddir: - fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; - break; - case Opt_noasyncreaddir: - fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOASYNCREADDIR; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; break; case Opt_dcache: - fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; - break; - case Opt_nodcache: - fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_DCACHE; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE; break; case Opt_ino32: - fsopt->flags |= CEPH_MOUNT_OPT_INO32; - break; - case Opt_noino32: - fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; + if (!result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_INO32; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_INO32; break; + case Opt_fscache: #ifdef CONFIG_CEPH_FSCACHE - fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; kfree(fsopt->fscache_uniq); fsopt->fscache_uniq = NULL; + if (result.negated) { + fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; + } else { + fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; + fsopt->fscache_uniq = param->string; + param->string = NULL; + } break; #else - pr_err("fscache support is disabled\n"); - return -EINVAL; + return invalf(fc, "ceph: fscache support is disabled"); #endif - case Opt_nofscache: - fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; - kfree(fsopt->fscache_uniq); - fsopt->fscache_uniq = NULL; - break; case Opt_poolperm: - fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; - break; - case Opt_nopoolperm: - fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; break; case Opt_require_active_mds: - fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT; - break; - case Opt_norequire_active_mds: - fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT; + else + fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT; break; case Opt_quotadf: - fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF; - break; - case Opt_noquotadf: - fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; break; case Opt_copyfrom: - fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM; - break; - case Opt_nocopyfrom: - fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM; + if (!result.negated) + fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM; + else + fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM; break; -#ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: - fsopt->sb_flags |= SB_POSIXACL; - break; + if (!result.negated) { +#ifdef CONFIG_CEPH_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#else + return invalf(fc, "ceph: POSIX ACL support is disabled"); #endif - case Opt_noacl: - fsopt->sb_flags &= ~SB_POSIXACL; + } else { + fc->sb_flags &= ~SB_POSIXACL; + } break; default: - BUG_ON(token); + BUG(); } return 0; + +out_of_range: + return invalf(fc, "ceph: %s out of range", param->key); } static void destroy_mount_options(struct ceph_mount_options *args) { dout("destroy_mount_options %p\n", args); + if (!args) + return; + kfree(args->snapdir_name); kfree(args->mds_namespace); kfree(args->server_path); @@ -459,91 +490,6 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, return ceph_compare_options(new_opt, fsc->client); } -static int parse_mount_options(struct ceph_mount_options **pfsopt, - struct ceph_options **popt, - int flags, char *options, - const char *dev_name) -{ - struct ceph_mount_options *fsopt; - const char *dev_name_end; - int err; - - if (!dev_name || !*dev_name) - return -EINVAL; - - fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL); - if (!fsopt) - return -ENOMEM; - - dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); - - fsopt->sb_flags = flags; - fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - - fsopt->wsize = CEPH_MAX_WRITE_SIZE; - fsopt->rsize = CEPH_MAX_READ_SIZE; - fsopt->rasize = CEPH_RASIZE_DEFAULT; - fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); - if (!fsopt->snapdir_name) { - err = -ENOMEM; - goto out; - } - - fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; - fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; - fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; - fsopt->congestion_kb = default_congestion_kb(); - - /* - * Distinguish the server list from the path in "dev_name". - * Internally we do not include the leading '/' in the path. - * - * "dev_name" will look like: - * <server_spec>[,<server_spec>...]:[<path>] - * where - * <server_spec> is <ip>[:<port>] - * <path> is optional, but if present must begin with '/' - */ - dev_name_end = strchr(dev_name, '/'); - if (dev_name_end) { - if (strlen(dev_name_end) > 1) { - fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); - if (!fsopt->server_path) { - err = -ENOMEM; - goto out; - } - } - } else { - dev_name_end = dev_name + strlen(dev_name); - } - err = -EINVAL; - dev_name_end--; /* back up to ':' separator */ - if (dev_name_end < dev_name || *dev_name_end != ':') { - pr_err("device name is missing path (no : separator in %s)\n", - dev_name); - goto out; - } - dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); - if (fsopt->server_path) - dout("server path '%s'\n", fsopt->server_path); - - *popt = ceph_parse_options(options, dev_name, dev_name_end, - parse_fsopt_token, (void *)fsopt); - if (IS_ERR(*popt)) { - err = PTR_ERR(*popt); - goto out; - } - - /* success */ - *pfsopt = fsopt; - return 0; - -out: - destroy_mount_options(fsopt); - return err; -} - /** * ceph_show_options - Show mount options in /proc/mounts * @m: seq_file to write to @@ -587,7 +533,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",noquotadf"); #ifdef CONFIG_CEPH_FS_POSIX_ACL - if (fsopt->sb_flags & SB_POSIXACL) + if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); else seq_puts(m, ",noacl"); @@ -603,25 +549,25 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_show_option(m, "recover_session", "clean"); if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) - seq_printf(m, ",wsize=%d", fsopt->wsize); + seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) - seq_printf(m, ",rsize=%d", fsopt->rsize); + seq_printf(m, ",rsize=%u", fsopt->rsize); if (fsopt->rasize != CEPH_RASIZE_DEFAULT) - seq_printf(m, ",rasize=%d", fsopt->rasize); + seq_printf(m, ",rasize=%u", fsopt->rasize); if (fsopt->congestion_kb != default_congestion_kb()) - seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + seq_printf(m, ",write_congestion_kb=%u", fsopt->congestion_kb); if (fsopt->caps_max) seq_printf(m, ",caps_max=%d", fsopt->caps_max); if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) - seq_printf(m, ",caps_wanted_delay_min=%d", + seq_printf(m, ",caps_wanted_delay_min=%u", fsopt->caps_wanted_delay_min); if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) - seq_printf(m, ",caps_wanted_delay_max=%d", + seq_printf(m, ",caps_wanted_delay_max=%u", fsopt->caps_wanted_delay_max); if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) - seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); + seq_printf(m, ",readdir_max_entries=%u", fsopt->max_readdir); if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) - seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); + seq_printf(m, ",readdir_max_bytes=%u", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) seq_show_option(m, "snapdirname", fsopt->snapdir_name); @@ -860,12 +806,6 @@ static void ceph_umount_begin(struct super_block *sb) fsc->filp_gen++; // invalidate open files } -static int ceph_remount(struct super_block *sb, int *flags, char *data) -{ - sync_filesystem(sb); - return 0; -} - static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .free_inode = ceph_free_inode, @@ -874,7 +814,6 @@ static const struct super_operations ceph_super_ops = { .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, - .remount_fs = ceph_remount, .show_options = ceph_show_options, .statfs = ceph_statfs, .umount_begin = ceph_umount_begin, @@ -935,7 +874,8 @@ out: /* * mount: join the ceph cluster, and open root directory. */ -static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, + struct fs_context *fc) { int err; unsigned long started = jiffies; /* note the start time */ @@ -952,7 +892,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) /* setup fscache */ if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) { - err = ceph_fscache_register_fs(fsc); + err = ceph_fscache_register_fs(fsc, fc); if (err < 0) goto out; } @@ -987,18 +927,16 @@ out: return ERR_PTR(err); } -static int ceph_set_super(struct super_block *s, void *data) +static int ceph_set_super(struct super_block *s, struct fs_context *fc) { - struct ceph_fs_client *fsc = data; + struct ceph_fs_client *fsc = s->s_fs_info; int ret; - dout("set_super %p data %p\n", s, data); + dout("set_super %p\n", s); - s->s_flags = fsc->mount_options->sb_flags; s->s_maxbytes = MAX_LFS_FILESIZE; s->s_xattr = ceph_xattr_handlers; - s->s_fs_info = fsc; fsc->sb = s; fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */ @@ -1010,24 +948,18 @@ static int ceph_set_super(struct super_block *s, void *data) s->s_time_min = 0; s->s_time_max = U32_MAX; - ret = set_anon_super(s, NULL); /* what is that second arg for? */ + ret = set_anon_super_fc(s, fc); if (ret != 0) - goto fail; - - return ret; - -fail: - s->s_fs_info = NULL; - fsc->sb = NULL; + fsc->sb = NULL; return ret; } /* * share superblock if same fs AND options */ -static int ceph_compare_super(struct super_block *sb, void *data) +static int ceph_compare_super(struct super_block *sb, struct fs_context *fc) { - struct ceph_fs_client *new = data; + struct ceph_fs_client *new = fc->s_fs_info; struct ceph_mount_options *fsopt = new->mount_options; struct ceph_options *opt = new->client->options; struct ceph_fs_client *other = ceph_sb_to_client(sb); @@ -1043,7 +975,7 @@ static int ceph_compare_super(struct super_block *sb, void *data) dout("fsid doesn't match\n"); return 0; } - if (fsopt->sb_flags != other->mount_options->sb_flags) { + if (fc->sb_flags != (sb->s_flags & ~SB_BORN)) { dout("flags differ\n"); return 0; } @@ -1073,46 +1005,46 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) return 0; } -static struct dentry *ceph_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ceph_get_tree(struct fs_context *fc) { + struct ceph_parse_opts_ctx *pctx = fc->fs_private; struct super_block *sb; struct ceph_fs_client *fsc; struct dentry *res; + int (*compare_super)(struct super_block *, struct fs_context *) = + ceph_compare_super; int err; - int (*compare_super)(struct super_block *, void *) = ceph_compare_super; - struct ceph_mount_options *fsopt = NULL; - struct ceph_options *opt = NULL; - dout("ceph_mount\n"); + dout("ceph_get_tree\n"); + + if (!fc->source) + return invalf(fc, "ceph: No source"); #ifdef CONFIG_CEPH_FS_POSIX_ACL - flags |= SB_POSIXACL; + fc->sb_flags |= SB_POSIXACL; #endif - err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); - if (err < 0) { - res = ERR_PTR(err); - goto out_final; - } /* create client (which we may/may not use) */ - fsc = create_fs_client(fsopt, opt); + fsc = create_fs_client(pctx->opts, pctx->copts); + pctx->opts = NULL; + pctx->copts = NULL; if (IS_ERR(fsc)) { - res = ERR_CAST(fsc); + err = PTR_ERR(fsc); goto out_final; } err = ceph_mdsc_init(fsc); - if (err < 0) { - res = ERR_PTR(err); + if (err < 0) goto out; - } if (ceph_test_opt(fsc->client, NOSHARE)) compare_super = NULL; - sb = sget(fs_type, compare_super, ceph_set_super, flags, fsc); + + fc->s_fs_info = fsc; + sb = sget_fc(fc, compare_super, ceph_set_super); + fc->s_fs_info = NULL; if (IS_ERR(sb)) { - res = ERR_CAST(sb); + err = PTR_ERR(sb); goto out; } @@ -1123,18 +1055,19 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } else { dout("get_sb using new client %p\n", fsc); err = ceph_setup_bdi(sb, fsc); - if (err < 0) { - res = ERR_PTR(err); + if (err < 0) goto out_splat; - } } - res = ceph_real_mount(fsc); - if (IS_ERR(res)) + res = ceph_real_mount(fsc, fc); + if (IS_ERR(res)) { + err = PTR_ERR(res); goto out_splat; + } dout("root %p inode %p ino %llx.%llx\n", res, d_inode(res), ceph_vinop(d_inode(res))); - return res; + fc->root = fsc->sb->s_root; + return 0; out_splat: ceph_mdsc_close_sessions(fsc->mdsc); @@ -1144,8 +1077,79 @@ out_splat: out: destroy_fs_client(fsc); out_final: - dout("ceph_mount fail %ld\n", PTR_ERR(res)); - return res; + dout("ceph_get_tree fail %d\n", err); + return err; +} + +static void ceph_free_fc(struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + + if (pctx) { + destroy_mount_options(pctx->opts); + ceph_destroy_options(pctx->copts); + kfree(pctx); + } +} + +static int ceph_reconfigure_fc(struct fs_context *fc) +{ + sync_filesystem(fc->root->d_sb); + return 0; +} + +static const struct fs_context_operations ceph_context_ops = { + .free = ceph_free_fc, + .parse_param = ceph_parse_mount_param, + .get_tree = ceph_get_tree, + .reconfigure = ceph_reconfigure_fc, +}; + +/* + * Set up the filesystem mount context. + */ +static int ceph_init_fs_context(struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx; + struct ceph_mount_options *fsopt; + + pctx = kzalloc(sizeof(*pctx), GFP_KERNEL); + if (!pctx) + return -ENOMEM; + + pctx->copts = ceph_alloc_options(); + if (!pctx->copts) + goto nomem; + + pctx->opts = kzalloc(sizeof(*pctx->opts), GFP_KERNEL); + if (!pctx->opts) + goto nomem; + + fsopt = pctx->opts; + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; + + fsopt->wsize = CEPH_MAX_WRITE_SIZE; + fsopt->rsize = CEPH_MAX_READ_SIZE; + fsopt->rasize = CEPH_RASIZE_DEFAULT; + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + if (!fsopt->snapdir_name) + goto nomem; + + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; + fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; + fsopt->congestion_kb = default_congestion_kb(); + + fc->fs_private = pctx; + fc->ops = &ceph_context_ops; + return 0; + +nomem: + destroy_mount_options(pctx->opts); + ceph_destroy_options(pctx->copts); + kfree(pctx); + return -ENOMEM; } static void ceph_kill_sb(struct super_block *s) @@ -1172,7 +1176,7 @@ static void ceph_kill_sb(struct super_block *s) static struct file_system_type ceph_fs_type = { .owner = THIS_MODULE, .name = "ceph", - .mount = ceph_mount, + .init_fs_context = ceph_init_fs_context, .kill_sb = ceph_kill_sb, .fs_flags = FS_RENAME_DOES_D_MOVE, }; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f98d9247f9cb..3bf1a01cd536 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -73,17 +73,16 @@ #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ struct ceph_mount_options { - int flags; - int sb_flags; - - int wsize; /* max write size */ - int rsize; /* max read size */ - int rasize; /* max readahead */ - int congestion_kb; /* max writeback in flight */ - int caps_wanted_delay_min, caps_wanted_delay_max; + unsigned int flags; + + unsigned int wsize; /* max write size */ + unsigned int rsize; /* max read size */ + unsigned int rasize; /* max readahead */ + unsigned int congestion_kb; /* max writeback in flight */ + unsigned int caps_wanted_delay_min, caps_wanted_delay_max; int caps_max; - int max_readdir; /* max readdir result (entires) */ - int max_readdir_bytes; /* max readdir result (bytes) */ + unsigned int max_readdir; /* max readdir result (entries) */ + unsigned int max_readdir_bytes; /* max readdir result (bytes) */ /* * everything above this point can be memcmp'd; everything below @@ -407,22 +406,26 @@ struct ceph_inode_info { struct inode vfs_inode; /* at end */ }; -static inline struct ceph_inode_info *ceph_inode(struct inode *inode) +static inline struct ceph_inode_info * +ceph_inode(const struct inode *inode) { return container_of(inode, struct ceph_inode_info, vfs_inode); } -static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode) +static inline struct ceph_fs_client * +ceph_inode_to_client(const struct inode *inode) { return (struct ceph_fs_client *)inode->i_sb->s_fs_info; } -static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb) +static inline struct ceph_fs_client * +ceph_sb_to_client(const struct super_block *sb) { return (struct ceph_fs_client *)sb->s_fs_info; } -static inline struct ceph_vino ceph_vino(struct inode *inode) +static inline struct ceph_vino +ceph_vino(const struct inode *inode) { return ceph_inode(inode)->i_vino; } diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 06ffe52bdcfa..96ae72b556ac 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -802,6 +802,31 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, return; } +/* + * Fill in the special SID based on the mode. See + * http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx + */ +unsigned int setup_special_mode_ACE(struct cifs_ace *pntace, __u64 nmode) +{ + int i; + unsigned int ace_size = 28; + + pntace->type = ACCESS_DENIED_ACE_TYPE; + pntace->flags = 0x0; + pntace->access_req = 0; + pntace->sid.num_subauth = 3; + pntace->sid.revision = 1; + for (i = 0; i < NUM_AUTHS; i++) + pntace->sid.authority[i] = sid_unix_NFS_mode.authority[i]; + + pntace->sid.sub_auth[0] = sid_unix_NFS_mode.sub_auth[0]; + pntace->sid.sub_auth[1] = sid_unix_NFS_mode.sub_auth[1]; + pntace->sid.sub_auth[2] = cpu_to_le32(nmode & 07777); + + /* size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth*4) */ + pntace->size = cpu_to_le16(ace_size); + return ace_size; +} static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, __u64 nmode, bool modefromsid) @@ -815,23 +840,8 @@ static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, if (modefromsid) { struct cifs_ace *pntace = (struct cifs_ace *)((char *)pnndacl + size); - int i; - pntace->type = ACCESS_ALLOWED; - pntace->flags = 0x0; - pntace->access_req = 0; - pntace->sid.num_subauth = 3; - pntace->sid.revision = 1; - for (i = 0; i < NUM_AUTHS; i++) - pntace->sid.authority[i] = - sid_unix_NFS_mode.authority[i]; - pntace->sid.sub_auth[0] = sid_unix_NFS_mode.sub_auth[0]; - pntace->sid.sub_auth[1] = sid_unix_NFS_mode.sub_auth[1]; - pntace->sid.sub_auth[2] = cpu_to_le32(nmode & 07777); - - /* size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth*4) */ - pntace->size = cpu_to_le16(28); - size += 28; + size += setup_special_mode_ACE(pntace, nmode); num_aces++; } diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 439b99cefeb0..21d7dee98d01 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -147,22 +147,22 @@ struct smb3_sd { } __packed; /* Meaning of 'Control' field flags */ -#define ACL_CONTROL_SR 0x0001 /* Self relative */ -#define ACL_CONTROL_RM 0x0002 /* Resource manager control bits */ -#define ACL_CONTROL_PS 0x0004 /* SACL protected from inherits */ -#define ACL_CONTROL_PD 0x0008 /* DACL protected from inherits */ -#define ACL_CONTROL_SI 0x0010 /* SACL Auto-Inherited */ -#define ACL_CONTROL_DI 0x0020 /* DACL Auto-Inherited */ -#define ACL_CONTROL_SC 0x0040 /* SACL computed through inheritance */ -#define ACL_CONTROL_DC 0x0080 /* DACL computed through inheritence */ -#define ACL_CONTROL_SS 0x0100 /* Create server ACL */ -#define ACL_CONTROL_DT 0x0200 /* DACL provided by trusteed source */ -#define ACL_CONTROL_SD 0x0400 /* SACL defaulted */ -#define ACL_CONTROL_SP 0x0800 /* SACL is present on object */ -#define ACL_CONTROL_DD 0x1000 /* DACL defaulted */ -#define ACL_CONTROL_DP 0x2000 /* DACL is present on object */ -#define ACL_CONTROL_GD 0x4000 /* Group was defaulted */ -#define ACL_CONTROL_OD 0x8000 /* User was defaulted */ +#define ACL_CONTROL_SR 0x8000 /* Self relative */ +#define ACL_CONTROL_RM 0x4000 /* Resource manager control bits */ +#define ACL_CONTROL_PS 0x2000 /* SACL protected from inherits */ +#define ACL_CONTROL_PD 0x1000 /* DACL protected from inherits */ +#define ACL_CONTROL_SI 0x0800 /* SACL Auto-Inherited */ +#define ACL_CONTROL_DI 0x0400 /* DACL Auto-Inherited */ +#define ACL_CONTROL_SC 0x0200 /* SACL computed through inheritance */ +#define ACL_CONTROL_DC 0x0100 /* DACL computed through inheritence */ +#define ACL_CONTROL_SS 0x0080 /* Create server ACL */ +#define ACL_CONTROL_DT 0x0040 /* DACL provided by trusted source */ +#define ACL_CONTROL_SD 0x0020 /* SACL defaulted */ +#define ACL_CONTROL_SP 0x0010 /* SACL is present on object */ +#define ACL_CONTROL_DD 0x0008 /* DACL defaulted */ +#define ACL_CONTROL_DP 0x0004 /* DACL is present on object */ +#define ACL_CONTROL_GD 0x0002 /* Group was defaulted */ +#define ACL_CONTROL_OD 0x0001 /* User was defaulted */ /* Meaning of AclRevision flags */ #define ACL_REVISION 0x02 /* See section 2.4.4.1 of MS-DTYP */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 1d1051d31513..5492b9860baa 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -730,11 +730,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) struct inode *dir = d_inode(dentry); struct dentry *child; - if (!dir) { - dput(dentry); - dentry = ERR_PTR(-ENOENT); - break; - } if (!S_ISDIR(dir->i_mode)) { dput(dentry); dentry = ERR_PTR(-ENOTDIR); @@ -751,7 +746,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) while (*s && *s != sep) s++; - child = lookup_one_len_unlocked(p, dentry, s - p); + child = lookup_positive_unlocked(p, dentry, s - p); dput(dentry); dentry = child; } while (!IS_ERR(dentry)); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index d34a4ed8c57d..ce9bac756c2a 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -368,6 +368,9 @@ struct smb_version_operations { /* close a file */ void (*close)(const unsigned int, struct cifs_tcon *, struct cifs_fid *); + /* close a file, returning file attributes and timestamps */ + void (*close_getattr)(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *pfile_info); /* send a flush request to the server */ int (*flush)(const unsigned int, struct cifs_tcon *, struct cifs_fid *); /* async read from the server */ @@ -774,6 +777,7 @@ struct TCP_Server_Info { */ int nr_targets; bool noblockcnt; /* use non-blocking connect() */ + bool is_channel; /* if a session channel */ }; struct cifs_credits { @@ -1057,7 +1061,7 @@ cap_unix(struct cifs_ses *ses) struct cached_fid { bool is_valid:1; /* Do we have a useable root fid */ bool file_all_info_is_valid:1; - + bool has_lease:1; struct kref refcount; struct cifs_fid *fid; struct mutex fid_mutex; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1ed695336f62..9c229408a251 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -213,6 +213,7 @@ extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, const struct cifs_fid *, u32 *); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); +extern unsigned int setup_special_mode_ACE(struct cifs_ace *pace, __u64 nmode); extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 4f554f019a98..cc86a67225d1 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -42,6 +42,7 @@ #include "cifsproto.h" #include "cifs_unicode.h" #include "cifs_debug.h" +#include "smb2proto.h" #include "fscache.h" #include "smbdirect.h" #ifdef CONFIG_CIFS_DFS_UPCALL @@ -112,6 +113,8 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) mutex_lock(&tcon->crfid.fid_mutex); tcon->crfid.is_valid = false; + /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ + close_shroot_lease_locked(&tcon->crfid); memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); mutex_unlock(&tcon->crfid.fid_mutex); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 86d1baedf21c..05ea0e2b7e0e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2712,7 +2712,11 @@ cifs_find_tcp_session(struct smb_vol *vol) spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { - if (!match_server(server, vol)) + /* + * Skip ses channels since they're only handled in lower layers + * (e.g. cifs_send_recv). + */ + if (server->is_channel || !match_server(server, vol)) continue; ++server->srv_count; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index f1fe9c44d298..043288b5c728 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -315,9 +315,6 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, INIT_LIST_HEAD(&fdlocks->locks); fdlocks->cfile = cfile; cfile->llist = fdlocks; - cifs_down_write(&cinode->lock_sem); - list_add(&fdlocks->llist, &cinode->llist); - up_write(&cinode->lock_sem); cfile->count = 1; cfile->pid = current->tgid; @@ -342,6 +339,10 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, oplock = 0; } + cifs_down_write(&cinode->lock_sem); + list_add(&fdlocks->llist, &cinode->llist); + up_write(&cinode->lock_sem); + spin_lock(&tcon->open_file_lock); if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock) oplock = fid->pending_open->oplock; @@ -495,7 +496,9 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, unsigned int xid; xid = get_xid(); - if (server->ops->close) + if (server->ops->close_getattr) + server->ops->close_getattr(xid, tcon, cifs_file); + else if (server->ops->close) server->ops->close(xid, tcon, &cifs_file->fid); _free_xid(xid); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 8a76195e8a69..ca76a9287456 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -163,7 +163,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) spin_lock(&inode->i_lock); /* we do not want atime to be less than mtime, it broke some apps */ - if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime)) + if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime) < 0) inode->i_atime = fattr->cf_mtime; else inode->i_atime = fattr->cf_atime; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index fb3bdc44775c..f0795c856d8f 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -77,6 +77,8 @@ int cifs_try_adding_channels(struct cifs_ses *ses) int i = 0; int rc = 0; int tries = 0; + struct cifs_server_iface *ifaces = NULL; + size_t iface_count; if (left <= 0) { cifs_dbg(FYI, @@ -91,6 +93,26 @@ int cifs_try_adding_channels(struct cifs_ses *ses) } /* + * Make a copy of the iface list at the time and use that + * instead so as to not hold the iface spinlock for opening + * channels + */ + spin_lock(&ses->iface_lock); + iface_count = ses->iface_count; + if (iface_count <= 0) { + spin_unlock(&ses->iface_lock); + cifs_dbg(FYI, "no iface list available to open channels\n"); + return 0; + } + ifaces = kmemdup(ses->iface_list, iface_count*sizeof(*ifaces), + GFP_ATOMIC); + if (!ifaces) { + spin_unlock(&ses->iface_lock); + return 0; + } + spin_unlock(&ses->iface_lock); + + /* * Keep connecting to same, fastest, iface for all channels as * long as its RSS. Try next fastest one if not RSS or channel * creation fails. @@ -105,9 +127,9 @@ int cifs_try_adding_channels(struct cifs_ses *ses) break; } - iface = &ses->iface_list[i]; + iface = &ifaces[i]; if (is_ses_using_iface(ses, iface) && !iface->rss_capable) { - i = (i+1) % ses->iface_count; + i = (i+1) % iface_count; continue; } @@ -115,7 +137,7 @@ int cifs_try_adding_channels(struct cifs_ses *ses) if (rc) { cifs_dbg(FYI, "failed to open extra channel on iface#%d rc=%d\n", i, rc); - i = (i+1) % ses->iface_count; + i = (i+1) % iface_count; continue; } @@ -124,6 +146,7 @@ int cifs_try_adding_channels(struct cifs_ses *ses) left--; } + kfree(ifaces); return ses->chan_count - old_chan_count; } @@ -213,6 +236,9 @@ cifs_ses_add_channel(struct cifs_ses *ses, struct cifs_server_iface *iface) chan->server = NULL; goto out; } + spin_lock(&cifs_tcp_ses_lock); + chan->server->is_channel = true; + spin_unlock(&cifs_tcp_ses_lock); /* * We need to allocate the server crypto now as we will need diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 4121ac1163ca..5ef5e97a6d13 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -95,6 +95,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, goto finished; } + memset(&oparms, 0, sizeof(struct cifs_open_parms)); oparms.tcon = tcon; oparms.desired_access = desired_access; oparms.disposition = create_disposition; @@ -313,7 +314,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, rqst[num_rqst].rq_iov = close_iov; rqst[num_rqst].rq_nvec = 1; rc = SMB2_close_init(tcon, &rqst[num_rqst], COMPOUND_FID, - COMPOUND_FID); + COMPOUND_FID, false); smb2_set_related(&rqst[num_rqst]); if (rc) goto finished; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index a7f328f79c6f..6250370c1170 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -616,6 +616,7 @@ smb2_close_cached_fid(struct kref *ref) cfid->fid->volatile_fid); cfid->is_valid = false; cfid->file_all_info_is_valid = false; + cfid->has_lease = false; } } @@ -626,13 +627,28 @@ void close_shroot(struct cached_fid *cfid) mutex_unlock(&cfid->fid_mutex); } +void close_shroot_lease_locked(struct cached_fid *cfid) +{ + if (cfid->has_lease) { + cfid->has_lease = false; + kref_put(&cfid->refcount, smb2_close_cached_fid); + } +} + +void close_shroot_lease(struct cached_fid *cfid) +{ + mutex_lock(&cfid->fid_mutex); + close_shroot_lease_locked(cfid); + mutex_unlock(&cfid->fid_mutex); +} + void smb2_cached_lease_break(struct work_struct *work) { struct cached_fid *cfid = container_of(work, struct cached_fid, lease_break); - close_shroot(cfid); + close_shroot_lease(cfid); } /* @@ -773,6 +789,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) /* BB TBD check to see if oplock level check can be removed below */ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { kref_get(&tcon->crfid.refcount); + tcon->crfid.has_lease = true; smb2_parse_contexts(server, o_rsp, &oparms.fid->epoch, oparms.fid->lease_key, &oplock, NULL); @@ -1178,7 +1195,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, memset(&close_iov, 0, sizeof(close_iov)); rqst[2].rq_iov = close_iov; rqst[2].rq_nvec = 1; - rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID); + rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); smb2_set_related(&rqst[2]); rc = compound_send_recv(xid, ses, flags, 3, rqst, @@ -1332,6 +1349,45 @@ smb2_close_file(const unsigned int xid, struct cifs_tcon *tcon, SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); } +static void +smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile) +{ + struct smb2_file_network_open_info file_inf; + struct inode *inode; + int rc; + + rc = __SMB2_close(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, &file_inf); + if (rc) + return; + + inode = d_inode(cfile->dentry); + + spin_lock(&inode->i_lock); + CIFS_I(inode)->time = jiffies; + + /* Creation time should not need to be updated on close */ + if (file_inf.LastWriteTime) + inode->i_mtime = cifs_NTtimeToUnix(file_inf.LastWriteTime); + if (file_inf.ChangeTime) + inode->i_ctime = cifs_NTtimeToUnix(file_inf.ChangeTime); + if (file_inf.LastAccessTime) + inode->i_atime = cifs_NTtimeToUnix(file_inf.LastAccessTime); + + /* + * i_blocks is not related to (i_size / i_blksize), + * but instead 512 byte (2**9) size is required for + * calculating num blocks. + */ + if (le64_to_cpu(file_inf.AllocationSize) > 4096) + inode->i_blocks = + (512 - 1 + le64_to_cpu(file_inf.AllocationSize)) >> 9; + + /* End of file and Attributes should not have to be updated on close */ + spin_unlock(&inode->i_lock); +} + static int SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, @@ -1512,7 +1568,7 @@ smb2_ioctl_query_info(const unsigned int xid, rqst[2].rq_iov = close_iov; rqst[2].rq_nvec = 1; - rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID); + rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); if (rc) goto iqinf_exit; smb2_set_related(&rqst[2]); @@ -2241,7 +2297,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, rqst[2].rq_iov = close_iov; rqst[2].rq_nvec = 1; - rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID); + rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); if (rc) goto qic_exit; smb2_set_related(&rqst[2]); @@ -2654,7 +2710,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rqst[2].rq_iov = close_iov; rqst[2].rq_nvec = 1; - rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID); + rc = SMB2_close_init(tcon, &rqst[2], COMPOUND_FID, COMPOUND_FID, false); if (rc) goto querty_exit; @@ -4707,6 +4763,7 @@ struct smb_version_operations smb30_operations = { .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, + .close_getattr = smb2_close_getattr, .flush = smb2_flush_file, .async_readv = smb2_async_readv, .async_writev = smb2_async_writev, @@ -4816,6 +4873,7 @@ struct smb_version_operations smb311_operations = { .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, + .close_getattr = smb2_close_getattr, .flush = smb2_flush_file, .async_readv = smb2_async_readv, .async_writev = smb2_async_writev, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index ed77f94dbf1d..9434f6dd8df3 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -554,7 +554,7 @@ static void assemble_neg_contexts(struct smb2_negotiate_req *req, struct TCP_Server_Info *server, unsigned int *total_len) { - char *pneg_ctxt = (char *)req; + char *pneg_ctxt; unsigned int ctxt_len; if (*total_len > 200) { @@ -1847,7 +1847,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) return 0; - close_shroot(&tcon->crfid); + close_shroot_lease(&tcon->crfid); rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, (void **) &req, &total_len); @@ -2191,6 +2191,72 @@ add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp) return 0; } +/* See MS-SMB2 2.2.13.2.2 and MS-DTYP 2.4.6 */ +static struct crt_sd_ctxt * +create_sd_buf(umode_t mode, unsigned int *len) +{ + struct crt_sd_ctxt *buf; + struct cifs_ace *pace; + unsigned int sdlen, acelen; + + *len = roundup(sizeof(struct crt_sd_ctxt) + sizeof(struct cifs_ace), 8); + buf = kzalloc(*len, GFP_KERNEL); + if (buf == NULL) + return buf; + + sdlen = sizeof(struct smb3_sd) + sizeof(struct smb3_acl) + + sizeof(struct cifs_ace); + + buf->ccontext.DataOffset = cpu_to_le16(offsetof + (struct crt_sd_ctxt, sd)); + buf->ccontext.DataLength = cpu_to_le32(sdlen); + buf->ccontext.NameOffset = cpu_to_le16(offsetof + (struct crt_sd_ctxt, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_SD_BUFFER_TOKEN is "SecD" */ + buf->Name[0] = 'S'; + buf->Name[1] = 'e'; + buf->Name[2] = 'c'; + buf->Name[3] = 'D'; + buf->sd.Revision = 1; /* Must be one see MS-DTYP 2.4.6 */ + /* + * ACL is "self relative" ie ACL is stored in contiguous block of memory + * and "DP" ie the DACL is present + */ + buf->sd.Control = cpu_to_le16(ACL_CONTROL_SR | ACL_CONTROL_DP); + + /* offset owner, group and Sbz1 and SACL are all zero */ + buf->sd.OffsetDacl = cpu_to_le32(sizeof(struct smb3_sd)); + buf->acl.AclRevision = ACL_REVISION; /* See 2.4.4.1 of MS-DTYP */ + + /* create one ACE to hold the mode embedded in reserved special SID */ + pace = (struct cifs_ace *)(sizeof(struct crt_sd_ctxt) + (char *)buf); + acelen = setup_special_mode_ACE(pace, (__u64)mode); + buf->acl.AclSize = cpu_to_le16(sizeof(struct cifs_acl) + acelen); + buf->acl.AceCount = cpu_to_le16(1); + return buf; +} + +static int +add_sd_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) +{ + struct smb2_create_req *req = iov[0].iov_base; + unsigned int num = *num_iovec; + unsigned int len = 0; + + iov[num].iov_base = create_sd_buf(mode, &len); + if (iov[num].iov_base == NULL) + return -ENOMEM; + iov[num].iov_len = len; + if (!req->CreateContextsOffset) + req->CreateContextsOffset = cpu_to_le32( + sizeof(struct smb2_create_req) + + iov[num - 1].iov_len); + le32_add_cpu(&req->CreateContextsLength, len); + *num_iovec = num + 1; + return 0; +} + static struct crt_query_id_ctxt * create_query_id_buf(void) { @@ -2563,7 +2629,9 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, return rc; } - if ((oparms->disposition == FILE_CREATE) && + if ((oparms->disposition != FILE_OPEN) && + (oparms->cifs_sb) && + (oparms->cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) && (oparms->mode != ACL_NO_MODE)) { if (n_iov > 2) { struct create_context *ccontext = @@ -2572,7 +2640,8 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, cpu_to_le32(iov[n_iov-1].iov_len); } - /* rc = add_sd_context(iov, &n_iov, oparms->mode); */ + cifs_dbg(FYI, "add sd with mode 0x%x\n", oparms->mode); + rc = add_sd_context(iov, &n_iov, oparms->mode); if (rc) return rc; } @@ -2932,7 +3001,7 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, int SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, - u64 persistent_fid, u64 volatile_fid) + u64 persistent_fid, u64 volatile_fid, bool query_attrs) { struct smb2_close_req *req; struct kvec *iov = rqst->rq_iov; @@ -2945,6 +3014,10 @@ SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; + if (query_attrs) + req->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB; + else + req->Flags = 0; iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; @@ -2959,8 +3032,9 @@ SMB2_close_free(struct smb_rqst *rqst) } int -SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, int flags) +__SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + struct smb2_file_network_open_info *pbuf) { struct smb_rqst rqst; struct smb2_close_rsp *rsp = NULL; @@ -2969,6 +3043,8 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, struct kvec rsp_iov; int resp_buftype = CIFS_NO_BUFFER; int rc = 0; + int flags = 0; + bool query_attrs = false; cifs_dbg(FYI, "Close\n"); @@ -2983,8 +3059,13 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, rqst.rq_iov = iov; rqst.rq_nvec = 1; + /* check if need to ask server to return timestamps in close response */ + if (pbuf) + query_attrs = true; + trace_smb3_close_enter(xid, persistent_fid, tcon->tid, ses->Suid); - rc = SMB2_close_init(tcon, &rqst, persistent_fid, volatile_fid); + rc = SMB2_close_init(tcon, &rqst, persistent_fid, volatile_fid, + query_attrs); if (rc) goto close_exit; @@ -2996,42 +3077,43 @@ SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, trace_smb3_close_err(xid, persistent_fid, tcon->tid, ses->Suid, rc); goto close_exit; - } else + } else { trace_smb3_close_done(xid, persistent_fid, tcon->tid, ses->Suid); + /* + * Note that have to subtract 4 since struct network_open_info + * has a final 4 byte pad that close response does not have + */ + if (pbuf) + memcpy(pbuf, (char *)&rsp->CreationTime, sizeof(*pbuf) - 4); + } atomic_dec(&tcon->num_remote_opens); - - /* BB FIXME - decode close response, update inode for caching */ - close_exit: SMB2_close_free(&rqst); free_rsp_buf(resp_buftype, rsp); - return rc; -} - -int -SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid) -{ - int rc; - int tmp_rc; - - rc = SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0); /* retry close in a worker thread if this one is interrupted */ if (rc == -EINTR) { + int tmp_rc; + tmp_rc = smb2_handle_cancelled_close(tcon, persistent_fid, volatile_fid); if (tmp_rc) cifs_dbg(VFS, "handle cancelled close fid 0x%llx returned error %d\n", persistent_fid, tmp_rc); } - return rc; } int +SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid) +{ + return __SMB2_close(xid, tcon, persistent_fid, volatile_fid, NULL); +} + +int smb2_validate_iov(unsigned int offset, unsigned int buffer_length, struct kvec *iov, unsigned int min_buf_size) { diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index f264e1d36fe1..7b1c379fdf7a 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -25,6 +25,7 @@ #define _SMB2PDU_H #include <net/sock.h> +#include <cifsacl.h> /* * Note that, due to trying to use names similar to the protocol specifications, @@ -855,6 +856,15 @@ struct crt_query_id_ctxt { __u8 Name[8]; } __packed; +struct crt_sd_ctxt { + struct create_context ccontext; + __u8 Name[8]; + struct smb3_sd sd; + struct smb3_acl acl; + /* Followed by at least 4 ACEs */ +} __packed; + + #define COPY_CHUNK_RES_KEY_SIZE 24 struct resume_key_req { char ResumeKey[COPY_CHUNK_RES_KEY_SIZE]; @@ -1570,6 +1580,17 @@ struct smb2_file_eof_info { /* encoding of request for level 10 */ __le64 EndOfFile; /* new end of file value */ } __packed; /* level 20 Set */ +struct smb2_file_network_open_info { + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 AllocationSize; + __le64 EndOfFile; + __le32 Attributes; + __le32 Reserved; +} __packed; /* level 34 Query also similar returned in close rsp and open rsp */ + extern char smb2_padding[7]; #endif /* _SMB2PDU_H */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index d21a5fcc8d06..27d29f2eb6c8 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -70,6 +70,8 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server, extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid); extern void close_shroot(struct cached_fid *cfid); +extern void close_shroot_lease(struct cached_fid *cfid); +extern void close_shroot_lease_locked(struct cached_fid *cfid); extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src); extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, @@ -155,12 +157,13 @@ extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, bool watch_tree, u32 completion_filter); +extern int __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + struct smb2_file_network_open_info *pbuf); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); -extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, int flags); extern int SMB2_close_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, - u64 persistent_file_id, u64 volatile_file_id); + u64 persistent_fid, u64 volatile_fid, bool query_attrs); extern void SMB2_close_free(struct smb_rqst *rqst); extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 040df1f5e1c8..40cca351273f 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -151,7 +151,7 @@ static struct key *search_fscrypt_keyring(struct key *keyring, } #define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE \ - (CONST_STRLEN("fscrypt-") + FIELD_SIZEOF(struct super_block, s_id)) + (CONST_STRLEN("fscrypt-") + sizeof_field(struct super_block, s_id)) #define FSCRYPT_MK_DESCRIPTION_SIZE (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1) diff --git a/fs/dcache.c b/fs/dcache.c index f7931b682a0d..b280e07e162b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -319,7 +319,7 @@ static inline void __d_set_inode_and_type(struct dentry *dentry, flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); flags |= type_flags; - WRITE_ONCE(dentry->d_flags, flags); + smp_store_release(&dentry->d_flags, flags); } static inline void __d_clear_type_and_inode(struct dentry *dentry) @@ -903,17 +903,19 @@ struct dentry *dget_parent(struct dentry *dentry) { int gotref; struct dentry *ret; + unsigned seq; /* * Do optimistic parent lookup without any * locking. */ rcu_read_lock(); + seq = raw_seqcount_begin(&dentry->d_seq); ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { - if (likely(ret == READ_ONCE(dentry->d_parent))) + if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; dput(ret); } @@ -1679,7 +1681,7 @@ EXPORT_SYMBOL(d_invalidate); * copied and the copy passed in may be reused after this call. */ -struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) +static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { struct dentry *dentry; char *dname; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 7b975dbb2bb4..f4d8df5e4714 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -299,13 +299,9 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - dentry = lookup_one_len_unlocked(name, parent, strlen(name)); + dentry = lookup_positive_unlocked(name, parent, strlen(name)); if (IS_ERR(dentry)) return NULL; - if (!d_really_is_positive(dentry)) { - dput(dentry); - return NULL; - } return dentry; } EXPORT_SYMBOL_GPL(debugfs_lookup); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index d31b6c72b476..dc1a1d5d825b 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -35,11 +35,11 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); - cond_resched(); invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; + cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index a13a78725c57..b766c3ee5fa8 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -649,6 +649,8 @@ ssize_t erofs_listxattr(struct dentry *dentry, struct listxattr_iter it; ret = init_inode_xattrs(d_inode(dentry)); + if (ret == -ENOATTR) + return 0; if (ret) return ret; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index c4159bcc05d9..67a395039268 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -551,28 +551,23 @@ out_unlock: */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct nested_calls poll_safewake_ncalls; - -static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) -{ - unsigned long flags; - wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie; - - spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1); - wake_up_locked_poll(wqueue, EPOLLIN); - spin_unlock_irqrestore(&wqueue->lock, flags); - - return 0; -} +static DEFINE_PER_CPU(int, wakeup_nest); static void ep_poll_safewake(wait_queue_head_t *wq) { - int this_cpu = get_cpu(); - - ep_call_nested(&poll_safewake_ncalls, - ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); + unsigned long flags; + int subclass; - put_cpu(); + local_irq_save(flags); + preempt_disable(); + subclass = __this_cpu_read(wakeup_nest); + spin_lock_nested(&wq->lock, subclass + 1); + __this_cpu_inc(wakeup_nest); + wake_up_locked_poll(wq, POLLIN); + __this_cpu_dec(wakeup_nest); + spin_unlock(&wq->lock); + local_irq_restore(flags); + preempt_enable(); } #else @@ -671,7 +666,6 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, void *priv, int depth, bool ep_locked) { __poll_t res; - int pwake = 0; struct epitem *epi, *nepi; LIST_HEAD(txlist); @@ -738,26 +732,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, */ list_splice(&txlist, &ep->rdllist); __pm_relax(ep->ws); - - if (!list_empty(&ep->rdllist)) { - /* - * Wake up (if active) both the eventpoll wait list and - * the ->poll() wait list (delayed after we release the lock). - */ - if (waitqueue_active(&ep->wq)) - wake_up(&ep->wq); - if (waitqueue_active(&ep->poll_wait)) - pwake++; - } write_unlock_irq(&ep->lock); if (!ep_locked) mutex_unlock(&ep->mtx); - /* We have to call this outside the lock */ - if (pwake) - ep_poll_safewake(&ep->poll_wait); - return res; } @@ -2370,11 +2349,6 @@ static int __init eventpoll_init(void) */ ep_nested_calls_init(&poll_loop_ncalls); -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* Initialize the structure used to perform safe poll wait head wake ups */ - ep_nested_calls_init(&poll_safewake_ncalls); -#endif - /* * We can have many thousands of epitems, so prevent this from * using an extra cache line on 64-bit (and smaller) CPUs diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index d4d4fdfac1a6..1ee04e76bbe0 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -133,10 +133,13 @@ static void debug_print_tree(struct ext4_sb_info *sbi) { struct rb_node *node; struct ext4_system_zone *entry; + struct ext4_system_blocks *system_blks; int first = 1; printk(KERN_INFO "System zones: "); - node = rb_first(&sbi->system_blks->root); + rcu_read_lock(); + system_blks = rcu_dereference(sbi->system_blks); + node = rb_first(&system_blks->root); while (node) { entry = rb_entry(node, struct ext4_system_zone, node); printk(KERN_CONT "%s%llu-%llu", first ? "" : ", ", @@ -144,6 +147,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi) first = 0; node = rb_next(node); } + rcu_read_unlock(); printk(KERN_CONT "\n"); } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 9fdd2b269d61..9f00fc0bf21d 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -72,6 +72,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); + const int next_offset = ((char *) de - buf) + rlen; if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) error_msg = "rec_len is smaller than minimal"; @@ -79,8 +80,11 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, error_msg = "rec_len % 4 != 0"; else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) error_msg = "rec_len is too small for name_len"; - else if (unlikely(((char *) de - buf) + rlen > size)) + else if (unlikely(next_offset > size)) error_msg = "directory entry overrun"; + else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) && + next_offset != size)) + error_msg = "directory entry too close to block end"; else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index dc333e8e51e8..8ca4a23129aa 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -921,8 +921,8 @@ repeat_in_this_group: if (!handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, - handle_type, nblocks, - 0, 0); + handle_type, nblocks, 0, + ext4_trans_default_revoke_credits(sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); ext4_std_error(sb, err); diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c index 92a9da1774aa..bbce1c328d85 100644 --- a/fs/ext4/inode-test.c +++ b/fs/ext4/inode-test.c @@ -25,7 +25,7 @@ * For constructing the negative timestamp lower bound value. * binary: 10000000 00000000 00000000 00000000 */ -#define LOWER_MSB_1 (-0x80000000L) +#define LOWER_MSB_1 (-(UPPER_MSB_0) - 1L) /* avoid overflow */ /* * For constructing the negative timestamp upper bound value. * binary: 11111111 11111111 11111111 11111111 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 28f28de0c1b6..629a25d999f0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5692,7 +5692,7 @@ int ext4_expand_extra_isize(struct inode *inode, error = ext4_journal_get_write_access(handle, iloc->bh); if (error) { brelse(iloc->bh); - goto out_stop; + goto out_unlock; } error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc, @@ -5702,8 +5702,8 @@ int ext4_expand_extra_isize(struct inode *inode, if (!error) error = rc; +out_unlock: ext4_write_unlock_xattr(inode, &no_expand); -out_stop: ext4_journal_stop(handle); return error; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a856997d87b5..1cb42d940784 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2164,7 +2164,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct buffer_head *bh = NULL; struct ext4_dir_entry_2 *de; struct super_block *sb; +#ifdef CONFIG_UNICODE struct ext4_sb_info *sbi; +#endif struct ext4_filename fname; int retval; int dx_fallback=0; @@ -2176,12 +2178,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; - sbi = EXT4_SB(sb); blocksize = sb->s_blocksize; if (!dentry->d_name.len) return -EINVAL; #ifdef CONFIG_UNICODE + sbi = EXT4_SB(sb); if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) && sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name)) return -EINVAL; @@ -2822,7 +2824,7 @@ bool ext4_empty_dir(struct inode *inode) { unsigned int offset; struct buffer_head *bh; - struct ext4_dir_entry_2 *de, *de1; + struct ext4_dir_entry_2 *de; struct super_block *sb; if (ext4_has_inline_data(inode)) { @@ -2847,19 +2849,25 @@ bool ext4_empty_dir(struct inode *inode) return true; de = (struct ext4_dir_entry_2 *) bh->b_data; - de1 = ext4_next_entry(de, sb->s_blocksize); - if (le32_to_cpu(de->inode) != inode->i_ino || - le32_to_cpu(de1->inode) == 0 || - strcmp(".", de->name) || strcmp("..", de1->name)) { - ext4_warning_inode(inode, "directory missing '.' and/or '..'"); + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, + 0) || + le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { + ext4_warning_inode(inode, "directory missing '.'"); + brelse(bh); + return true; + } + offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); + de = ext4_next_entry(de, sb->s_blocksize); + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, + offset) || + le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { + ext4_warning_inode(inode, "directory missing '..'"); brelse(bh); return true; } - offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) + - ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize); - de = ext4_next_entry(de1, sb->s_blocksize); + offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); while (offset < inode->i_size) { - if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + if (!(offset & (sb->s_blocksize - 1))) { unsigned int lblock; brelse(bh); lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb); @@ -2870,12 +2878,11 @@ bool ext4_empty_dir(struct inode *inode) } if (IS_ERR(bh)) return true; - de = (struct ext4_dir_entry_2 *) bh->b_data; } + de = (struct ext4_dir_entry_2 *) (bh->b_data + + (offset & (sb->s_blocksize - 1))); if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, offset)) { - de = (struct ext4_dir_entry_2 *)(bh->b_data + - sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; continue; } @@ -2884,7 +2891,6 @@ bool ext4_empty_dir(struct inode *inode) return false; } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); - de = ext4_next_entry(de, sb->s_blocksize); } brelse(bh); return true; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1d82b56d9b11..2937a8873fe1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1900,6 +1900,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_commit_interval = HZ * arg; } else if (token == Opt_debug_want_extra_isize) { + if ((arg & 1) || + (arg < 4) || + (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) { + ext4_msg(sb, KERN_ERR, + "Invalid want_extra_isize %d", arg); + return -1; + } sbi->s_want_extra_isize = arg; } else if (token == Opt_max_batch_time) { sbi->s_max_batch_time = arg; @@ -3554,40 +3561,6 @@ int ext4_calculate_overhead(struct super_block *sb) return 0; } -static void ext4_clamp_want_extra_isize(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = sbi->s_es; - unsigned def_extra_isize = sizeof(struct ext4_inode) - - EXT4_GOOD_OLD_INODE_SIZE; - - if (sbi->s_inode_size == EXT4_GOOD_OLD_INODE_SIZE) { - sbi->s_want_extra_isize = 0; - return; - } - if (sbi->s_want_extra_isize < 4) { - sbi->s_want_extra_isize = def_extra_isize; - if (ext4_has_feature_extra_isize(sb)) { - if (sbi->s_want_extra_isize < - le16_to_cpu(es->s_want_extra_isize)) - sbi->s_want_extra_isize = - le16_to_cpu(es->s_want_extra_isize); - if (sbi->s_want_extra_isize < - le16_to_cpu(es->s_min_extra_isize)) - sbi->s_want_extra_isize = - le16_to_cpu(es->s_min_extra_isize); - } - } - /* Check if enough inode space is available */ - if ((sbi->s_want_extra_isize > sbi->s_inode_size) || - (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize > - sbi->s_inode_size)) { - sbi->s_want_extra_isize = def_extra_isize; - ext4_msg(sb, KERN_INFO, - "required extra inode space not available"); - } -} - static void ext4_set_resv_clusters(struct super_block *sb) { ext4_fsblk_t resv_clusters; @@ -3795,6 +3768,68 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { + sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { + ext4_msg(sb, KERN_ERR, "invalid first ino: %u", + sbi->s_first_ino); + goto failed_mount; + } + if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || + (!is_power_of_2(sbi->s_inode_size)) || + (sbi->s_inode_size > blocksize)) { + ext4_msg(sb, KERN_ERR, + "unsupported inode size: %d", + sbi->s_inode_size); + goto failed_mount; + } + /* + * i_atime_extra is the last extra field available for + * [acm]times in struct ext4_inode. Checking for that + * field should suffice to ensure we have extra space + * for all three. + */ + if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + + sizeof(((struct ext4_inode *)0)->i_atime_extra)) { + sb->s_time_gran = 1; + sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; + } else { + sb->s_time_gran = NSEC_PER_SEC; + sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; + } + sb->s_time_min = EXT4_TIMESTAMP_MIN; + } + if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { + sbi->s_want_extra_isize = sizeof(struct ext4_inode) - + EXT4_GOOD_OLD_INODE_SIZE; + if (ext4_has_feature_extra_isize(sb)) { + unsigned v, max = (sbi->s_inode_size - + EXT4_GOOD_OLD_INODE_SIZE); + + v = le16_to_cpu(es->s_want_extra_isize); + if (v > max) { + ext4_msg(sb, KERN_ERR, + "bad s_want_extra_isize: %d", v); + goto failed_mount; + } + if (sbi->s_want_extra_isize < v) + sbi->s_want_extra_isize = v; + + v = le16_to_cpu(es->s_min_extra_isize); + if (v > max) { + ext4_msg(sb, KERN_ERR, + "bad s_min_extra_isize: %d", v); + goto failed_mount; + } + if (sbi->s_want_extra_isize < v) + sbi->s_want_extra_isize = v; + } + } + if (sbi->s_es->s_mount_opts[0]) { char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, sizeof(sbi->s_es->s_mount_opts), @@ -4033,42 +4068,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) has_huge_files); sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); - if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { - sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; - sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO; - } else { - sbi->s_inode_size = le16_to_cpu(es->s_inode_size); - sbi->s_first_ino = le32_to_cpu(es->s_first_ino); - if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) { - ext4_msg(sb, KERN_ERR, "invalid first ino: %u", - sbi->s_first_ino); - goto failed_mount; - } - if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || - (!is_power_of_2(sbi->s_inode_size)) || - (sbi->s_inode_size > blocksize)) { - ext4_msg(sb, KERN_ERR, - "unsupported inode size: %d", - sbi->s_inode_size); - goto failed_mount; - } - /* - * i_atime_extra is the last extra field available for [acm]times in - * struct ext4_inode. Checking for that field should suffice to ensure - * we have extra space for all three. - */ - if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) + - sizeof(((struct ext4_inode *)0)->i_atime_extra)) { - sb->s_time_gran = 1; - sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX; - } else { - sb->s_time_gran = NSEC_PER_SEC; - sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX; - } - - sb->s_time_min = EXT4_TIMESTAMP_MIN; - } - sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (ext4_has_feature_64bit(sb)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || @@ -4517,8 +4516,6 @@ no_journal: } else if (ret) goto failed_mount4a; - ext4_clamp_want_extra_isize(sb); - ext4_set_resv_clusters(sb); err = ext4_setup_system_zone(sb); @@ -5306,8 +5303,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - ext4_clamp_want_extra_isize(sb); - if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " diff --git a/fs/fcntl.c b/fs/fcntl.c index 41b6438bd2d9..9bc167562ee8 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -277,7 +277,7 @@ static long fcntl_rw_hint(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(file); - u64 *argp = (u64 __user *)arg; + u64 __user *argp = (u64 __user *)arg; enum rw_hint hint; u64 h; diff --git a/fs/file.c b/fs/file.c index 3da91a112bab..2f4fcf985079 100644 --- a/fs/file.c +++ b/fs/file.c @@ -960,7 +960,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) return ksys_dup3(oldfd, newfd, 0); } -int ksys_dup(unsigned int fildes) +SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); @@ -975,11 +975,6 @@ int ksys_dup(unsigned int fildes) return ret; } -SYSCALL_DEFINE1(dup, unsigned int, fildes) -{ - return ksys_dup(fildes); -} - int f_dupfd(unsigned int from, struct file *file, unsigned flags) { int err; diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 0635cba19971..eb2a585572dc 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -34,7 +34,7 @@ config VIRTIO_FS select VIRTIO help The Virtio Filesystem allows guests to mount file systems from the - host. + host. If you want to share files between guests or with the host, answer Y - or M. + or M. diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d4e6691d2d92..8e02d76fe104 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1965,7 +1965,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, nbuf = 0; rem = 0; - for (idx = tail; idx < head && rem < len; idx++) + for (idx = tail; idx != head && rem < len; idx++) rem += pipe->bufs[idx & mask].len; ret = -EINVAL; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 54d638f9ba1c..ee190119f45c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -248,7 +248,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) kfree(forget); if (ret == -ENOMEM) goto out; - if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) + if (ret || fuse_invalid_attr(&outarg.attr) || + (outarg.attr.mode ^ inode->i_mode) & S_IFMT) goto invalid; forget_all_cached_acls(inode); @@ -319,6 +320,12 @@ int fuse_valid_type(int m) S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); } +bool fuse_invalid_attr(struct fuse_attr *attr) +{ + return !fuse_valid_type(attr->mode) || + attr->size > LLONG_MAX; +} + int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode) { @@ -350,7 +357,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name err = -EIO; if (!outarg->nodeid) goto out_put_forget; - if (!fuse_valid_type(outarg->attr.mode)) + if (fuse_invalid_attr(&outarg->attr)) goto out_put_forget; *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, @@ -475,7 +482,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, goto out_free_ff; err = -EIO; - if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid)) + if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid) || + fuse_invalid_attr(&outentry.attr)) goto out_free_ff; ff->fh = outopen.fh; @@ -583,7 +591,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, goto out_put_forget_req; err = -EIO; - if (invalid_nodeid(outarg.nodeid)) + if (invalid_nodeid(outarg.nodeid) || fuse_invalid_attr(&outarg.attr)) goto out_put_forget_req; if ((outarg.attr.mode ^ mode) & S_IFMT) @@ -862,7 +870,8 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); - inc_nlink(inode); + if (likely(inode->i_nlink < UINT_MAX)) + inc_nlink(inode); spin_unlock(&fi->lock); fuse_invalidate_attr(inode); fuse_update_ctime(inode); @@ -942,7 +951,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) { - if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + if (fuse_invalid_attr(&outarg.attr) || + (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { make_bad_inode(inode); err = -EIO; } else { @@ -1563,7 +1573,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, goto error; } - if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { + if (fuse_invalid_attr(&outarg.attr) || + (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { make_bad_inode(inode); err = -EIO; goto error; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index db48a5cf8620..a63d779eac10 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -713,8 +713,10 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc, ia->ap.args.end = fuse_aio_complete_req; err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); + if (err) + fuse_aio_complete_req(fc, &ia->ap.args, err); - return err ?: num_bytes; + return num_bytes; } static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, @@ -1096,6 +1098,8 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, ia->write.in.flags = fuse_write_flags(iocb); err = fuse_simple_request(fc, &ap->args); + if (!err && ia->write.out.size > count) + err = -EIO; offset = ap->descs[0].offset; count = ia->write.out.size; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d148188cfca4..aa75e2305b75 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -989,6 +989,8 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc); */ int fuse_valid_type(int m); +bool fuse_invalid_attr(struct fuse_attr *attr); + /** * Is current process allowed to perform filesystem operation? */ diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 5c38b9d84c6e..6a40f75a0d25 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -184,7 +184,7 @@ static int fuse_direntplus_link(struct file *file, if (invalid_nodeid(o->nodeid)) return -EIO; - if (!fuse_valid_type(o->attr.mode)) + if (fuse_invalid_attr(&o->attr)) return -EIO; fc = get_fuse_conn(dir); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index a5c86048b96e..bade74768903 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -35,6 +35,7 @@ struct virtio_fs_vq { struct fuse_dev *fud; bool connected; long in_flight; + struct completion in_flight_zero; /* No inflight requests */ char name[24]; } ____cacheline_aligned_in_smp; @@ -48,11 +49,15 @@ struct virtio_fs { unsigned int num_request_queues; /* number of request queues */ }; -struct virtio_fs_forget { +struct virtio_fs_forget_req { struct fuse_in_header ih; struct fuse_forget_in arg; +}; + +struct virtio_fs_forget { /* This request can be temporarily queued on virt queue */ struct list_head list; + struct virtio_fs_forget_req req; }; static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, @@ -81,6 +86,8 @@ static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq) { WARN_ON(fsvq->in_flight <= 0); fsvq->in_flight--; + if (!fsvq->in_flight) + complete(&fsvq->in_flight_zero); } static void release_virtio_fs_obj(struct kref *ref) @@ -111,22 +118,23 @@ static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) WARN_ON(fsvq->in_flight < 0); /* Wait for in flight requests to finish.*/ - while (1) { - spin_lock(&fsvq->lock); - if (!fsvq->in_flight) { - spin_unlock(&fsvq->lock); - break; - } + spin_lock(&fsvq->lock); + if (fsvq->in_flight) { + /* We are holding virtio_fs_mutex. There should not be any + * waiters waiting for completion. + */ + reinit_completion(&fsvq->in_flight_zero); + spin_unlock(&fsvq->lock); + wait_for_completion(&fsvq->in_flight_zero); + } else { spin_unlock(&fsvq->lock); - /* TODO use completion instead of timeout */ - usleep_range(1000, 2000); } flush_work(&fsvq->done_work); flush_delayed_work(&fsvq->dispatch_work); } -static void virtio_fs_drain_all_queues(struct virtio_fs *fs) +static void virtio_fs_drain_all_queues_locked(struct virtio_fs *fs) { struct virtio_fs_vq *fsvq; int i; @@ -137,6 +145,19 @@ static void virtio_fs_drain_all_queues(struct virtio_fs *fs) } } +static void virtio_fs_drain_all_queues(struct virtio_fs *fs) +{ + /* Provides mutual exclusion between ->remove and ->kill_sb + * paths. We don't want both of these draining queue at the + * same time. Current completion logic reinits completion + * and that means there should not be any other thread + * doing reinit or waiting for completion already. + */ + mutex_lock(&virtio_fs_mutex); + virtio_fs_drain_all_queues_locked(fs); + mutex_unlock(&virtio_fs_mutex); +} + static void virtio_fs_start_all_queues(struct virtio_fs *fs) { struct virtio_fs_vq *fsvq; @@ -313,17 +334,72 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) } } +/* + * Returns 1 if queue is full and sender should wait a bit before sending + * next request, 0 otherwise. + */ +static int send_forget_request(struct virtio_fs_vq *fsvq, + struct virtio_fs_forget *forget, + bool in_flight) +{ + struct scatterlist sg; + struct virtqueue *vq; + int ret = 0; + bool notify; + struct virtio_fs_forget_req *req = &forget->req; + + spin_lock(&fsvq->lock); + if (!fsvq->connected) { + if (in_flight) + dec_in_flight_req(fsvq); + kfree(forget); + goto out; + } + + sg_init_one(&sg, req, sizeof(*req)); + vq = fsvq->vq; + dev_dbg(&vq->vdev->dev, "%s\n", __func__); + + ret = virtqueue_add_outbuf(vq, &sg, 1, forget, GFP_ATOMIC); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n", + ret); + list_add_tail(&forget->list, &fsvq->queued_reqs); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + if (!in_flight) + inc_in_flight_req(fsvq); + /* Queue is full */ + ret = 1; + } else { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", + ret); + kfree(forget); + if (in_flight) + dec_in_flight_req(fsvq); + } + goto out; + } + + if (!in_flight) + inc_in_flight_req(fsvq); + notify = virtqueue_kick_prepare(vq); + spin_unlock(&fsvq->lock); + + if (notify) + virtqueue_notify(vq); + return ret; +out: + spin_unlock(&fsvq->lock); + return ret; +} + static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) { struct virtio_fs_forget *forget; struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, dispatch_work.work); - struct virtqueue *vq = fsvq->vq; - struct scatterlist sg; - struct scatterlist *sgs[] = {&sg}; - bool notify; - int ret; - pr_debug("virtio-fs: worker %s called.\n", __func__); while (1) { spin_lock(&fsvq->lock); @@ -335,43 +411,9 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) } list_del(&forget->list); - if (!fsvq->connected) { - dec_in_flight_req(fsvq); - spin_unlock(&fsvq->lock); - kfree(forget); - continue; - } - - sg_init_one(&sg, forget, sizeof(*forget)); - - /* Enqueue the request */ - dev_dbg(&vq->vdev->dev, "%s\n", __func__); - ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); - if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOSPC) { - pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n", - ret); - list_add_tail(&forget->list, - &fsvq->queued_reqs); - schedule_delayed_work(&fsvq->dispatch_work, - msecs_to_jiffies(1)); - } else { - pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", - ret); - dec_in_flight_req(fsvq); - kfree(forget); - } - spin_unlock(&fsvq->lock); - return; - } - - notify = virtqueue_kick_prepare(vq); spin_unlock(&fsvq->lock); - - if (notify) - virtqueue_notify(vq); - pr_debug("virtio-fs: worker %s dispatched one forget request.\n", - __func__); + if (send_forget_request(fsvq, forget, true)) + return; } } @@ -556,6 +598,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs); INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work, virtio_fs_hiprio_dispatch_work); + init_completion(&fs->vqs[VQ_HIPRIO].in_flight_zero); spin_lock_init(&fs->vqs[VQ_HIPRIO].lock); /* Initialize the requests virtqueues */ @@ -566,6 +609,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, virtio_fs_request_dispatch_work); INIT_LIST_HEAD(&fs->vqs[i].queued_reqs); INIT_LIST_HEAD(&fs->vqs[i].end_reqs); + init_completion(&fs->vqs[i].in_flight_zero); snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name), "requests.%u", i - VQ_REQUEST); callbacks[i] = virtio_fs_vq_done; @@ -659,7 +703,7 @@ static void virtio_fs_remove(struct virtio_device *vdev) /* This device is going away. No one should get new reference */ list_del_init(&fs->list); virtio_fs_stop_all_queues(fs); - virtio_fs_drain_all_queues(fs); + virtio_fs_drain_all_queues_locked(fs); vdev->config->reset(vdev); virtio_fs_cleanup_vqs(vdev, fs); @@ -684,12 +728,12 @@ static int virtio_fs_restore(struct virtio_device *vdev) } #endif /* CONFIG_PM_SLEEP */ -const static struct virtio_device_id id_table[] = { +static const struct virtio_device_id id_table[] = { { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID }, {}, }; -const static unsigned int feature_table[] = {}; +static const unsigned int feature_table[] = {}; static struct virtio_driver virtio_fs_driver = { .driver.name = KBUILD_MODNAME, @@ -710,14 +754,10 @@ __releases(fiq->lock) { struct fuse_forget_link *link; struct virtio_fs_forget *forget; - struct scatterlist sg; - struct scatterlist *sgs[] = {&sg}; + struct virtio_fs_forget_req *req; struct virtio_fs *fs; - struct virtqueue *vq; struct virtio_fs_vq *fsvq; - bool notify; u64 unique; - int ret; link = fuse_dequeue_forget(fiq, 1, NULL); unique = fuse_get_unique(fiq); @@ -728,57 +768,19 @@ __releases(fiq->lock) /* Allocate a buffer for the request */ forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL); + req = &forget->req; - forget->ih = (struct fuse_in_header){ + req->ih = (struct fuse_in_header){ .opcode = FUSE_FORGET, .nodeid = link->forget_one.nodeid, .unique = unique, - .len = sizeof(*forget), + .len = sizeof(*req), }; - forget->arg = (struct fuse_forget_in){ + req->arg = (struct fuse_forget_in){ .nlookup = link->forget_one.nlookup, }; - sg_init_one(&sg, forget, sizeof(*forget)); - - /* Enqueue the request */ - spin_lock(&fsvq->lock); - - if (!fsvq->connected) { - kfree(forget); - spin_unlock(&fsvq->lock); - goto out; - } - - vq = fsvq->vq; - dev_dbg(&vq->vdev->dev, "%s\n", __func__); - - ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); - if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOSPC) { - pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later.\n", - ret); - list_add_tail(&forget->list, &fsvq->queued_reqs); - schedule_delayed_work(&fsvq->dispatch_work, - msecs_to_jiffies(1)); - inc_in_flight_req(fsvq); - } else { - pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", - ret); - kfree(forget); - } - spin_unlock(&fsvq->lock); - goto out; - } - - inc_in_flight_req(fsvq); - notify = virtqueue_kick_prepare(vq); - - spin_unlock(&fsvq->lock); - - if (notify) - virtqueue_notify(vq); -out: + send_forget_request(fsvq, forget, false); kfree(link); } @@ -1026,7 +1028,7 @@ __releases(fiq->lock) } } -const static struct fuse_iqueue_ops virtio_fs_fiq_ops = { +static const struct fuse_iqueue_ops virtio_fs_fiq_ops = { .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock, .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock, .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock, diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index b9fe975d7625..9c6df721321a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -133,7 +133,7 @@ static int gfs2_write_full_page(struct page *page, get_block_t *get_block, * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - offset = i_size & (PAGE_SIZE-1); + offset = i_size & (PAGE_SIZE - 1); if (page->index == end_index && offset) zero_user_segment(page, offset, PAGE_SIZE); @@ -497,7 +497,7 @@ static int __gfs2_readpage(void *file, struct page *page) error = mpage_readpage(page, gfs2_block_map); } - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return -EIO; return error; @@ -614,7 +614,7 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping, gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) ret = -EIO; return ret; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 516103248272..08f6fbb3655e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2441,8 +2441,16 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) struct inode *inode = file_inode(file); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned int blocksize = i_blocksize(inode); + loff_t start, end; int error; + start = round_down(offset, blocksize); + end = round_up(offset + length, blocksize) - 1; + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + if (gfs2_is_jdata(ip)) error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA, GFS2_JTRUNC_REVOKES); @@ -2456,9 +2464,8 @@ int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) if (error) goto out; } else { - unsigned int start_off, end_len, blocksize; + unsigned int start_off, end_len; - blocksize = i_blocksize(inode); start_off = offset & (blocksize - 1); end_len = (offset + length) & (blocksize - 1); if (start_off) { diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index d07a295f9cac..9d58295ccf7a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -407,27 +407,28 @@ static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) /** * gfs2_allocate_page_backing - Allocate blocks for a write fault * @page: The (locked) page to allocate backing for + * @length: Size of the allocation * * We try to allocate all the blocks required for the page in one go. This * might fail for various reasons, so we keep trying until all the blocks to * back this page are allocated. If some of the blocks are already allocated, * that is ok too. */ -static int gfs2_allocate_page_backing(struct page *page) +static int gfs2_allocate_page_backing(struct page *page, unsigned int length) { u64 pos = page_offset(page); - u64 size = PAGE_SIZE; do { struct iomap iomap = { }; - if (gfs2_iomap_get_alloc(page->mapping->host, pos, 1, &iomap)) + if (gfs2_iomap_get_alloc(page->mapping->host, pos, length, &iomap)) return -EIO; - iomap.length = min(iomap.length, size); - size -= iomap.length; + if (length < iomap.length) + iomap.length = length; + length -= iomap.length; pos += iomap.length; - } while (size > 0); + } while (length > 0); return 0; } @@ -448,10 +449,10 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; - unsigned long last_index; - u64 pos = page_offset(page); + u64 offset = page_offset(page); unsigned int data_blocks, ind_blocks, rblocks; struct gfs2_holder gh; + unsigned int length; loff_t size; int ret; @@ -461,20 +462,39 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (ret) goto out; - gfs2_size_hint(vmf->vma->vm_file, pos, PAGE_SIZE); - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) goto out_uninit; + /* Check page index against inode size */ + size = i_size_read(inode); + if (offset >= size) { + ret = -EINVAL; + goto out_unlock; + } + /* Update file times before taking page lock */ file_update_time(vmf->vma->vm_file); + /* page is wholly or partially inside EOF */ + if (offset > size - PAGE_SIZE) + length = offset_in_page(size); + else + length = PAGE_SIZE; + + gfs2_size_hint(vmf->vma->vm_file, offset, length); + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); - if (!gfs2_write_alloc_required(ip, pos, PAGE_SIZE)) { + /* + * iomap_writepage / iomap_writepages currently don't support inline + * files, so always unstuff here. + */ + + if (!gfs2_is_stuffed(ip) && + !gfs2_write_alloc_required(ip, offset, length)) { lock_page(page); if (!PageUptodate(page) || page->mapping != inode->i_mapping) { ret = -EAGAIN; @@ -487,7 +507,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (ret) goto out_unlock; - gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks); + gfs2_write_calc_reserv(ip, length, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; ret = gfs2_quota_lock_check(ip, &ap); if (ret) @@ -508,13 +528,6 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) goto out_trans_fail; lock_page(page); - ret = -EINVAL; - size = i_size_read(inode); - last_index = (size - 1) >> PAGE_SHIFT; - /* Check page index against inode size */ - if (size == 0 || (page->index > last_index)) - goto out_trans_end; - ret = -EAGAIN; /* If truncated, we must retry the operation, we may have raced * with the glock demotion code. @@ -527,7 +540,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (gfs2_is_stuffed(ip)) ret = gfs2_unstuff_dinode(ip, page); if (ret == 0) - ret = gfs2_allocate_page_backing(page); + ret = gfs2_allocate_page_backing(page, length); out_trans_end: if (ret) @@ -961,6 +974,7 @@ out: brelse(dibh); return error; } + /** * calc_max_reserv() - Reverse of write_calc_reserv. Given a number of * blocks, determine how many bytes can be written. @@ -1208,7 +1222,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) cmd = F_SETLK; fl->fl_type = F_UNLCK; } - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) { + if (unlikely(gfs2_withdrawn(sdp))) { if (fl->fl_type == F_UNLCK) locks_lock_file_wait(file, fl); return -EIO; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 0290a22ebccf..b7123de7c180 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -549,7 +549,7 @@ __acquires(&gl->gl_lockref.lock) unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0); int ret; - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) && + if (unlikely(gfs2_withdrawn(sdp)) && target != LM_ST_UNLOCKED) return; lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | @@ -558,7 +558,14 @@ __acquires(&gl->gl_lockref.lock) GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && glops->go_inval) { - set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + /* + * If another process is already doing the invalidate, let that + * finish first. The glock state machine will get back to this + * holder again later. + */ + if (test_and_set_bit(GLF_INVALIDATE_IN_PROGRESS, + &gl->gl_flags)) + return; do_error(gl, 0); /* Fail queued try locks */ } gl->gl_req = target; @@ -586,8 +593,7 @@ __acquires(&gl->gl_lockref.lock) } else if (ret) { fs_err(sdp, "lm_lock ret %d\n", ret); - GLOCK_BUG_ON(gl, !test_bit(SDF_WITHDRAWN, - &sdp->sd_flags)); + GLOCK_BUG_ON(gl, !gfs2_withdrawn(sdp)); } } else { /* lock_nolock */ finish_xmote(gl, target); @@ -1191,7 +1197,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh) struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; int error = 0; - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return -EIO; if (test_bit(GLF_LRU, &gl->gl_flags)) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index ff213690e364..4ede1f18de85 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -350,7 +350,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), be32_to_cpu(str->di_minor)); break; - }; + } i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid)); @@ -540,7 +540,7 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) gfs2_consist(sdp); /* Initialize some head of the log stuff */ - if (!test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) { + if (!gfs2_withdrawn(sdp)) { sdp->sd_log_sequence = head.lh_sequence + 1; gfs2_log_pointers_init(sdp, head.lh_blkno); } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e1e18fb587eb..dafef10b91f1 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -656,7 +656,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, inode->i_rdev = dev; inode->i_size = size; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - gfs2_set_inode_blocks(inode, 1); munge_mode_uid_gid(dip, inode); check_and_update_goal(dip); ip->i_goal = dip->i_goal; @@ -712,7 +711,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_trans_begin(sdp, blocks, 0); if (error) - goto fail_gunlock2; + goto fail_free_inode; if (blocks > 1) { ip->i_eattr = ip->i_no_addr + 1; @@ -723,7 +722,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (error) - goto fail_gunlock2; + goto fail_free_inode; BUG_ON(test_and_set_bit(GLF_INODE_CREATING, &io_gl->gl_flags)); @@ -732,7 +731,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, goto fail_gunlock2; glock_set_object(ip->i_iopen_gh.gh_gl, ip); - gfs2_glock_put(io_gl); gfs2_set_iop(inode); insert_inode_hash(inode); @@ -765,6 +763,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, mark_inode_dirty(inode); d_instantiate(dentry, inode); + /* After instantiate, errors should result in evict which will destroy + * both inode and iopen glocks properly. */ if (file) { file->f_mode |= FMODE_CREATED; error = finish_open(file, dentry, gfs2_open_common); @@ -772,15 +772,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_glock_dq_uninit(ghs); gfs2_glock_dq_uninit(ghs + 1); clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); + gfs2_glock_put(io_gl); return error; fail_gunlock3: glock_clear_object(io_gl, ip); gfs2_glock_dq_uninit(&ip->i_iopen_gh); - gfs2_glock_put(io_gl); fail_gunlock2: - if (io_gl) - clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); + clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); + gfs2_glock_put(io_gl); fail_free_inode: if (ip->i_gl) { glock_clear_object(ip->i_gl, ip); @@ -1475,7 +1475,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = -EEXIST; default: goto out_gunlock; - }; + } if (odip != ndip) { if (!ndip->i_inode.i_nlink) { diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 58e237fba565..eb3f2e7b8085 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -31,6 +31,8 @@ #include "dir.h" #include "trace_gfs2.h" +static void gfs2_log_shutdown(struct gfs2_sbd *sdp); + /** * gfs2_struct2blk - compute stuff * @sdp: the filesystem @@ -159,7 +161,8 @@ restart: list_for_each_entry_reverse(tr, head, tr_list) { if (wbc->nr_to_write <= 0) break; - if (gfs2_ail1_start_one(sdp, wbc, tr, &withdraw)) + if (gfs2_ail1_start_one(sdp, wbc, tr, &withdraw) && + !gfs2_withdrawn(sdp)) goto restart; } spin_unlock(&sdp->sd_ail_lock); @@ -609,6 +612,14 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) list_add(&bd->bd_list, &sdp->sd_log_revokes); } +void gfs2_glock_remove_revoke(struct gfs2_glock *gl) +{ + if (atomic_dec_return(&gl->gl_revokes) == 0) { + clear_bit(GLF_LFLUSH, &gl->gl_flags); + gfs2_glock_queue_put(gl); + } +} + void gfs2_write_revokes(struct gfs2_sbd *sdp) { struct gfs2_trans *tr; @@ -682,12 +693,16 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, { struct gfs2_log_header *lh; u32 hash, crc; - struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + struct page *page; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; struct timespec64 tv; struct super_block *sb = sdp->sd_vfs; u64 dblock; + if (gfs2_withdrawn(sdp)) + goto out; + + page = mempool_alloc(gfs2_page_pool, GFP_NOIO); lh = page_address(page); clear_page(lh); @@ -707,7 +722,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_nsec = cpu_to_be32(tv.tv_nsec); lh->lh_sec = cpu_to_be64(tv.tv_sec); if (!list_empty(&jd->extent_list)) - dblock = gfs2_log_bmap(sdp); + dblock = gfs2_log_bmap(jd, lblock); else { int ret = gfs2_lblk_to_dblk(jd->jd_inode, lblock, &dblock); if (gfs2_assert_withdraw(sdp, ret == 0)) @@ -740,6 +755,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, gfs2_log_write(sdp, page, sb->s_blocksize, 0, dblock); gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); +out: log_flush_wait(sdp); } @@ -768,6 +784,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); gfs2_write_log_header(sdp, sdp->sd_jdesc, sdp->sd_log_sequence++, tail, sdp->sd_log_flush_head, flags, op_flags); + gfs2_log_incr_head(sdp); if (sdp->sd_log_tail != tail) log_pull_tail(sdp, tail); @@ -948,7 +965,7 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) * */ -void gfs2_log_shutdown(struct gfs2_sbd *sdp) +static void gfs2_log_shutdown(struct gfs2_sbd *sdp) { gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 2315fca47a2b..2ff163a8dce1 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -74,9 +74,9 @@ extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); -extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); extern int gfs2_logd(void *data); extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl); extern void gfs2_write_revokes(struct gfs2_sbd *sdp); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 5b17979af539..55fed7daf2b1 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -129,7 +129,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, atomic_dec(&sdp->sd_log_pinned); } -static void gfs2_log_incr_head(struct gfs2_sbd *sdp) +void gfs2_log_incr_head(struct gfs2_sbd *sdp) { BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) && (sdp->sd_log_flush_head != sdp->sd_log_head)); @@ -138,18 +138,13 @@ static void gfs2_log_incr_head(struct gfs2_sbd *sdp) sdp->sd_log_flush_head = 0; } -u64 gfs2_log_bmap(struct gfs2_sbd *sdp) +u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock) { - unsigned int lbn = sdp->sd_log_flush_head; struct gfs2_journal_extent *je; - u64 block; - list_for_each_entry(je, &sdp->sd_jdesc->extent_list, list) { - if ((lbn >= je->lblock) && (lbn < (je->lblock + je->blocks))) { - block = je->dblock + lbn - je->lblock; - gfs2_log_incr_head(sdp); - return block; - } + list_for_each_entry(je, &jd->extent_list, list) { + if (lblock >= je->lblock && lblock < je->lblock + je->blocks) + return je->dblock + lblock - je->lblock; } return -1; @@ -351,8 +346,11 @@ void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) { - gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh), - gfs2_log_bmap(sdp)); + u64 dblock; + + dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); + gfs2_log_incr_head(sdp); + gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh), dblock); } /** @@ -369,8 +367,11 @@ static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) { struct super_block *sb = sdp->sd_vfs; - gfs2_log_write(sdp, page, sb->s_blocksize, 0, - gfs2_log_bmap(sdp)); + u64 dblock; + + dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); + gfs2_log_incr_head(sdp); + gfs2_log_write(sdp, page, sb->s_blocksize, 0, dblock); } /** @@ -882,10 +883,7 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) bd = list_entry(head->next, struct gfs2_bufdata, bd_list); list_del_init(&bd->bd_list); gl = bd->bd_gl; - if (atomic_dec_return(&gl->gl_revokes) == 0) { - clear_bit(GLF_LFLUSH, &gl->gl_flags); - gfs2_glock_queue_put(gl); - } + gfs2_glock_remove_revoke(gl); kmem_cache_free(gfs2_bufdata_cachep, bd); } } diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 9c059957a733..9c5e4e491e03 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -18,7 +18,8 @@ ~(2 * sizeof(__be64) - 1)) extern const struct gfs2_log_operations *gfs2_log_ops[]; -extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp); +extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); +extern u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn); extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, unsigned size, unsigned offset, u64 blkno); extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 662ef36c1874..0c3772974030 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -251,7 +251,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head *bh, *bhs[2]; int num = 0; - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) { + if (unlikely(gfs2_withdrawn(sdp))) { *bhp = NULL; return -EIO; } @@ -309,7 +309,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) { - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return -EIO; wait_on_buffer(bh); @@ -320,7 +320,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) gfs2_io_error_bh_wd(sdp, bh); return -EIO; } - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return -EIO; return 0; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 18daf494abab..e8b7b0ce8404 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1006,8 +1006,7 @@ hostdata_error: void gfs2_lm_unmount(struct gfs2_sbd *sdp) { const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops; - if (likely(!test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) && - lm->lm_unmount) + if (likely(!gfs2_withdrawn(sdp)) && lm->lm_unmount) lm->lm_unmount(sdp); } @@ -1328,7 +1327,7 @@ static const struct fs_parameter_enum gfs2_param_enums[] = { {} }; -const struct fs_parameter_description gfs2_fs_parameters = { +static const struct fs_parameter_description gfs2_fs_parameters = { .name = "gfs2", .specs = gfs2_param_specs, .enums = gfs2_param_enums, diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 7c016a082aa6..e9f93045eb01 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1273,7 +1273,7 @@ int gfs2_quota_sync(struct super_block *sb, int type) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_data **qda; - unsigned int max_qd = PAGE_SIZE/sizeof(struct gfs2_holder); + unsigned int max_qd = PAGE_SIZE / sizeof(struct gfs2_holder); unsigned int num_qd; unsigned int x; int error = 0; @@ -1475,7 +1475,7 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) { if (error == 0 || error == -EROFS) return; - if (!test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) { + if (!gfs2_withdrawn(sdp)) { fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); sdp->sd_log_error = error; wake_up(&sdp->sd_logd_waitq); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index c529f8749a89..85f830e56945 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -263,11 +263,13 @@ static void clean_journal(struct gfs2_jdesc *jd, u32 lblock = head->lh_blkno; gfs2_replay_incr_blk(jd, &lblock); - if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) - sdp->sd_log_flush_head = lblock; gfs2_write_log_header(sdp, jd, head->lh_sequence + 1, 0, lblock, GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_RECOVERY, REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC); + if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) { + sdp->sd_log_flush_head = lblock; + gfs2_log_incr_head(sdp); + } } @@ -326,7 +328,7 @@ void gfs2_recover_func(struct work_struct *work) default: goto fail; - }; + } error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP | GL_NOCACHE, &ji_gh); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 5fa1eec4fb4f..68cc7c291a81 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -399,8 +399,7 @@ struct lfcc { * Returns: errno */ -static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, - struct gfs2_holder *freeze_gh) +static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) { struct gfs2_inode *ip; struct gfs2_jdesc *jd; @@ -425,7 +424,9 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, } error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, - GL_NOCACHE, freeze_gh); + GL_NOCACHE, &sdp->sd_freeze_gh); + if (error) + goto out; list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { error = gfs2_jdesc_check(jd); @@ -441,7 +442,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, } if (error) - gfs2_glock_dq_uninit(freeze_gh); + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); out: while (!list_empty(&list)) { @@ -553,7 +554,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) if (!(flags & I_DIRTY_INODE)) return; - if (unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags))) + if (unlikely(gfs2_withdrawn(sdp))) return; if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); @@ -602,7 +603,7 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE, &freeze_gh); - if (error && !test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) + if (error && !gfs2_withdrawn(sdp)) return error; flush_workqueue(gfs2_delete_workqueue); @@ -761,21 +762,25 @@ static int gfs2_freeze(struct super_block *sb) if (atomic_read(&sdp->sd_freeze_state) != SFS_UNFROZEN) goto out; - if (test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) { - error = -EINVAL; - goto out; - } - for (;;) { - error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); + if (gfs2_withdrawn(sdp)) { + error = -EINVAL; + goto out; + } + + error = gfs2_lock_fs_check_clean(sdp); if (!error) break; if (error == -EBUSY) fs_err(sdp, "waiting for recovery before freeze\n"); - else + else if (error == -EIO) { + fs_err(sdp, "Fatal IO error: cannot freeze gfs2 due " + "to recovery error.\n"); + goto out; + } else { fs_err(sdp, "error freezing FS: %d\n", error); - + } fs_err(sdp, "retrying...\n"); msleep(1000); } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index dd15b8e4af2c..8ccb68f4ed16 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -118,7 +118,7 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) { - unsigned int b = test_bit(SDF_WITHDRAWN, &sdp->sd_flags); + unsigned int b = gfs2_withdrawn(sdp); return snprintf(buf, PAGE_SIZE, "%u\n", b); } diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 35e3059255fe..9d4227330de4 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -262,6 +262,8 @@ void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) list_del_init(&bd->bd_list); gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); sdp->sd_log_num_revoke--; + if (bd->bd_gl) + gfs2_glock_remove_revoke(bd->bd_gl); kmem_cache_free(gfs2_bufdata_cachep, bd); tr->tr_num_revoke--; if (--n == 0) diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index c45159133d8e..ec600b487498 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -258,7 +258,7 @@ void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, const char *function, char *file, unsigned int line, bool withdraw) { - if (!test_bit(SDF_WITHDRAWN, &sdp->sd_flags)) + if (!gfs2_withdrawn(sdp)) fs_err(sdp, "fatal: I/O error\n" " block = %llu\n" diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 4b68b2c1fe56..f2702bc9837c 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -164,6 +164,15 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, return x; } +/** + * gfs2_withdrawn - test whether the file system is withdrawing or withdrawn + * @sdp: the superblock + */ +static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp) +{ + return test_bit(SDF_WITHDRAWN, &sdp->sd_flags); +} + #define gfs2_tune_get(sdp, field) \ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) diff --git a/fs/inode.c b/fs/inode.c index fef457a42882..96d62d97694e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -676,6 +676,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) struct inode *inode, *next; LIST_HEAD(dispose); +again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); @@ -698,6 +699,12 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); + if (need_resched()) { + spin_unlock(&sb->s_inode_list_lock); + cond_resched(); + dispose_list(&dispose); + goto again; + } } spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/internal.h b/fs/internal.h index 315fcd8d237c..4a7da1df573d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -151,7 +151,6 @@ extern int invalidate_inodes(struct super_block *, bool); /* * dcache.c */ -extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); extern int d_set_mounted(struct dentry *dentry); extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); extern struct dentry *d_alloc_cursor(struct dentry *); diff --git a/fs/io-wq.c b/fs/io-wq.c index 91b85df0861e..11e80b7252a8 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -49,7 +49,6 @@ struct io_worker { struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; - wait_queue_head_t wait; struct io_wqe *wqe; struct io_wq_work *cur_work; @@ -111,7 +110,7 @@ struct io_wq { struct task_struct *manager; struct user_struct *user; - struct cred *creds; + const struct cred *creds; struct mm_struct *mm; refcount_t refs; struct completion done; @@ -258,7 +257,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe) worker = hlist_nulls_entry(n, struct io_worker, nulls_node); if (io_worker_get(worker)) { - wake_up(&worker->wait); + wake_up_process(worker->task); io_worker_release(worker); return true; } @@ -492,28 +491,46 @@ next: } while (1); } +static inline void io_worker_spin_for_work(struct io_wqe *wqe) +{ + int i = 0; + + while (++i < 1000) { + if (io_wqe_run_queue(wqe)) + break; + if (need_resched()) + break; + cpu_relax(); + } +} + static int io_wqe_worker(void *data) { struct io_worker *worker = data; struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; - DEFINE_WAIT(wait); + bool did_work; io_worker_start(wqe, worker); + did_work = false; while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - prepare_to_wait(&worker->wait, &wait, TASK_INTERRUPTIBLE); - + set_current_state(TASK_INTERRUPTIBLE); +loop: + if (did_work) + io_worker_spin_for_work(wqe); spin_lock_irq(&wqe->lock); if (io_wqe_run_queue(wqe)) { __set_current_state(TASK_RUNNING); io_worker_handle_work(worker); - continue; + did_work = true; + goto loop; } + did_work = false; /* drops the lock on success, retry */ if (__io_worker_idle(wqe, worker)) { __release(&wqe->lock); - continue; + goto loop; } spin_unlock_irq(&wqe->lock); if (signal_pending(current)) @@ -526,8 +543,6 @@ static int io_wqe_worker(void *data) break; } - finish_wait(&worker->wait, &wait); - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { spin_lock_irq(&wqe->lock); if (!wq_list_empty(&wqe->work_list)) @@ -589,7 +604,6 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) refcount_set(&worker->ref, 1); worker->nulls_node.pprev = NULL; - init_waitqueue_head(&worker->wait); worker->wqe = wqe; spin_lock_init(&worker->lock); @@ -934,7 +948,7 @@ static enum io_wq_cancel io_wqe_cancel_work(struct io_wqe *wqe, /* * Now check if a free (going busy) or busy worker has the work * currently running. If we find it there, we'll return CANCEL_RUNNING - * as an indication that we attempte to signal cancellation. The + * as an indication that we attempt to signal cancellation. The * completion will run normally in this case. */ rcu_read_lock(); diff --git a/fs/io-wq.h b/fs/io-wq.h index 600e0158cba7..3f5e356de980 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -35,7 +35,8 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { if (!list->first) { - list->first = list->last = node; + list->last = node; + WRITE_ONCE(list->first, node); } else { list->last->next = node; list->last = node; @@ -47,17 +48,18 @@ static inline void wq_node_del(struct io_wq_work_list *list, struct io_wq_work_node *prev) { if (node == list->first) - list->first = node->next; + WRITE_ONCE(list->first, node->next); if (node == list->last) list->last = prev; if (prev) prev->next = node->next; + node->next = NULL; } #define wq_list_for_each(pos, prv, head) \ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) -#define wq_list_empty(list) ((list)->first == NULL) +#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) #define INIT_WQ_LIST(list) do { \ (list)->first = NULL; \ (list)->last = NULL; \ @@ -87,7 +89,7 @@ typedef void (put_work_fn)(struct io_wq_work *); struct io_wq_data { struct mm_struct *mm; struct user_struct *user; - struct cred *creds; + const struct cred *creds; get_work_fn *get_work; put_work_fn *put_work; diff --git a/fs/io_uring.c b/fs/io_uring.c index ec53aa7cdc94..6f084e3cf835 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -145,7 +145,7 @@ struct io_rings { /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure - * there are not more requests pending thatn there is space in + * there are not more requests pending than there is space in * the completion queue. * * Written by the kernel, shouldn't be modified by the @@ -238,7 +238,7 @@ struct io_ring_ctx { struct user_struct *user; - struct cred *creds; + const struct cred *creds; /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ struct completion *completions; @@ -275,7 +275,8 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct list_head poll_list; - struct rb_root cancel_tree; + struct hlist_head *cancel_hash; + unsigned cancel_hash_bits; spinlock_t inflight_lock; struct list_head inflight_list; @@ -288,11 +289,14 @@ struct io_ring_ctx { */ struct io_poll_iocb { struct file *file; - struct wait_queue_head *head; + union { + struct wait_queue_head *head; + u64 addr; + }; __poll_t events; bool done; bool canceled; - struct wait_queue_entry *wait; + struct wait_queue_entry wait; }; struct io_timeout_data { @@ -303,9 +307,57 @@ struct io_timeout_data { u32 seq_offset; }; +struct io_accept { + struct file *file; + struct sockaddr __user *addr; + int __user *addr_len; + int flags; +}; + +struct io_sync { + struct file *file; + loff_t len; + loff_t off; + int flags; +}; + +struct io_cancel { + struct file *file; + u64 addr; +}; + struct io_timeout { struct file *file; - struct io_timeout_data *data; + u64 addr; + int flags; +}; + +struct io_async_connect { + struct sockaddr_storage address; +}; + +struct io_async_msghdr { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov; + struct sockaddr __user *uaddr; + struct msghdr msg; +}; + +struct io_async_rw { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov; + ssize_t nr_segs; + ssize_t size; +}; + +struct io_async_ctx { + struct io_uring_sqe sqe; + union { + struct io_async_rw rw; + struct io_async_msghdr msg; + struct io_async_connect connect; + struct io_timeout_data timeout; + }; }; /* @@ -319,20 +371,25 @@ struct io_kiocb { struct file *file; struct kiocb rw; struct io_poll_iocb poll; + struct io_accept accept; + struct io_sync sync; + struct io_cancel cancel; struct io_timeout timeout; }; const struct io_uring_sqe *sqe; + struct io_async_ctx *io; struct file *ring_file; int ring_fd; bool has_user; bool in_async; bool needs_fixed_file; + u8 opcode; struct io_ring_ctx *ctx; union { struct list_head list; - struct rb_node rb_node; + struct hlist_node hash_node; }; struct list_head link_list; unsigned int flags; @@ -353,7 +410,8 @@ struct io_kiocb { #define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ #define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ -#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */ +#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ +#define REQ_F_PREPPED 131072 /* request already opcode prepared */ u64 user_data; u32 result; u32 sequence; @@ -422,6 +480,7 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref) static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; + int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -435,6 +494,21 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) if (!ctx->completions) goto err; + /* + * Use 5 bits less than the max cq entries, that should give us around + * 32 entries per hash list if totally full and uniformly spread. + */ + hash_bits = ilog2(p->cq_entries); + hash_bits -= 5; + if (hash_bits <= 0) + hash_bits = 1; + ctx->cancel_hash_bits = hash_bits; + ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), + GFP_KERNEL); + if (!ctx->cancel_hash) + goto err; + __hash_init(ctx->cancel_hash, 1U << hash_bits); + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -448,7 +522,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); - ctx->cancel_tree = RB_ROOT; INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); init_waitqueue_head(&ctx->inflight_wait); @@ -459,6 +532,7 @@ err: if (ctx->fallback_req) kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx->completions); + kfree(ctx->cancel_hash); kfree(ctx); return NULL; } @@ -524,12 +598,10 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) +static inline bool io_req_needs_user(struct io_kiocb *req) { - u8 opcode = READ_ONCE(sqe->opcode); - - return !(opcode == IORING_OP_READ_FIXED || - opcode == IORING_OP_WRITE_FIXED); + return !(req->opcode == IORING_OP_READ_FIXED || + req->opcode == IORING_OP_WRITE_FIXED); } static inline bool io_prep_async_work(struct io_kiocb *req, @@ -538,10 +610,12 @@ static inline bool io_prep_async_work(struct io_kiocb *req, bool do_hashed = false; if (req->sqe) { - switch (req->sqe->opcode) { + switch (req->opcode) { case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: - do_hashed = true; + /* only regular files should be hashed for writes */ + if (req->flags & REQ_F_ISREG) + do_hashed = true; /* fall-through */ case IORING_OP_READV: case IORING_OP_READ_FIXED: @@ -559,7 +633,7 @@ static inline bool io_prep_async_work(struct io_kiocb *req, req->work.flags |= IO_WQ_WORK_UNBOUND; break; } - if (io_sqe_needs_user(req->sqe)) + if (io_req_needs_user(req)) req->work.flags |= IO_WQ_WORK_NEEDS_USER; } @@ -592,7 +666,7 @@ static void io_kill_timeout(struct io_kiocb *req) { int ret; - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); list_del_init(&req->list); @@ -806,6 +880,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, } got_it: + req->io = NULL; req->ring_file = NULL; req->file = NULL; req->ctx = ctx; @@ -836,8 +911,8 @@ static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - if (req->flags & REQ_F_FREE_SQE) - kfree(req->sqe); + if (req->io) + kfree(req->io); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); if (req->flags & REQ_F_INFLIGHT) { @@ -849,8 +924,6 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } - if (req->flags & REQ_F_TIMEOUT) - kfree(req->timeout.data); percpu_ref_put(&ctx->refs); if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); @@ -863,7 +936,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); @@ -878,7 +951,6 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt; bool wake_ev = false; /* Already got next link */ @@ -890,24 +962,21 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) * potentially happen if the chain is messed up, check to be on the * safe side. */ - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); - while (nxt) { - list_del_init(&nxt->list); + while (!list_empty(&req->link_list)) { + struct io_kiocb *nxt = list_first_entry(&req->link_list, + struct io_kiocb, link_list); - if ((req->flags & REQ_F_LINK_TIMEOUT) && - (nxt->flags & REQ_F_TIMEOUT)) { + if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && + (nxt->flags & REQ_F_TIMEOUT))) { + list_del_init(&nxt->link_list); wake_ev |= io_link_cancel_timeout(nxt); - nxt = list_first_entry_or_null(&req->link_list, - struct io_kiocb, list); req->flags &= ~REQ_F_LINK_TIMEOUT; continue; } - if (!list_empty(&req->link_list)) { - INIT_LIST_HEAD(&nxt->link_list); - list_splice(&req->link_list, &nxt->link_list); - nxt->flags |= REQ_F_LINK; - } + list_del_init(&req->link_list); + if (!list_empty(&nxt->link_list)) + nxt->flags |= REQ_F_LINK; *nxtptr = nxt; break; } @@ -923,19 +992,19 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) static void io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *link; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { - link = list_first_entry(&req->link_list, struct io_kiocb, list); - list_del_init(&link->list); + struct io_kiocb *link = list_first_entry(&req->link_list, + struct io_kiocb, link_list); + list_del_init(&link->link_list); trace_io_uring_fail_link(req, link); if ((req->flags & REQ_F_LINK_TIMEOUT) && - link->sqe->opcode == IORING_OP_LINK_TIMEOUT) { + link->opcode == IORING_OP_LINK_TIMEOUT) { io_link_cancel_timeout(link); } else { io_cqring_fill_event(link, -ECANCELED); @@ -1079,9 +1148,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, * completions for those, only batch free for fixed * file and non-linked commands. */ - if (((req->flags & - (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) == - REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) { + if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == + REQ_F_FIXED_FILE) && !io_is_fallback_req(req) && + !req->io) { reqs[to_free++] = req; if (to_free == ARRAY_SIZE(reqs)) io_free_req_many(ctx, reqs, &to_free); @@ -1141,7 +1210,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, } /* - * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a + * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a * non-spinning poll check - we'll still enter the driver poll loop, but only * as a non-spinning completion check. */ @@ -1258,6 +1327,12 @@ static void kiocb_end_write(struct io_kiocb *req) file_end_write(req->file); } +static inline void req_set_fail_links(struct io_kiocb *req) +{ + if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; +} + static void io_complete_rw_common(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); @@ -1265,8 +1340,8 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if ((req->flags & REQ_F_LINK) && res != req->result) - req->flags |= REQ_F_FAIL_LINK; + if (res != req->result) + req_set_fail_links(req); io_cqring_add_event(req, res); } @@ -1296,8 +1371,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if ((req->flags & REQ_F_LINK) && res != req->result) - req->flags |= REQ_F_FAIL_LINK; + if (res != req->result) + req_set_fail_links(req); req->result = res; if (res != -EAGAIN) req->flags |= REQ_F_IOPOLL_COMPLETED; @@ -1388,7 +1463,7 @@ static bool io_file_supports_async(struct file *file) { umode_t mode = file_inode(file)->i_mode; - if (S_ISBLK(mode) || S_ISCHR(mode)) + if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode)) return true; if (S_ISREG(mode) && file->f_op != &io_uring_fops) return true; @@ -1410,15 +1485,6 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) if (S_ISREG(file_inode(req->file)->i_mode)) req->flags |= REQ_F_ISREG; - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; - return -EAGAIN; - } - kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -1581,12 +1647,22 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, * for that purpose and instead let the caller pass in the read/write * flag. */ - opcode = READ_ONCE(sqe->opcode); + opcode = req->opcode; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; return io_import_fixed(req->ctx, rw, sqe, iter); } + if (req->io) { + struct io_async_rw *iorw = &req->io->rw; + + *iovec = iorw->iov; + iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size); + if (iorw->iov == iorw->fast_iov) + *iovec = NULL; + return iorw->size; + } + if (!req->has_user) return -EFAULT; @@ -1657,6 +1733,70 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return ret; } +static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter) +{ + req->io->rw.nr_segs = iter->nr_segs; + req->io->rw.size = io_size; + req->io->rw.iov = iovec; + if (!req->io->rw.iov) { + req->io->rw.iov = req->io->rw.fast_iov; + memcpy(req->io->rw.iov, fast_iov, + sizeof(struct iovec) * iter->nr_segs); + } +} + +static int io_alloc_async_ctx(struct io_kiocb *req) +{ + req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); + if (req->io) { + memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe)); + req->sqe = &req->io->sqe; + return 0; + } + + return 1; +} + +static void io_rw_async(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct iovec *iov = NULL; + + if (req->io->rw.iov != req->io->rw.fast_iov) + iov = req->io->rw.iov; + io_wq_submit_work(workptr); + kfree(iov); +} + +static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter) +{ + if (!req->io && io_alloc_async_ctx(req)) + return -ENOMEM; + + io_req_map_rw(req, io_size, iovec, fast_iov, iter); + req->work.func = io_rw_async; + return 0; +} + +static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, + struct iov_iter *iter, bool force_nonblock) +{ + ssize_t ret; + + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; + + if (unlikely(!(req->file->f_mode & FMODE_READ))) + return -EBADF; + + return io_import_iovec(READ, req, iovec, iter); +} + static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { @@ -1665,23 +1805,35 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t read_size, ret; + ssize_t io_size, ret; - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; - file = kiocb->ki_filp; - - if (unlikely(!(file->f_mode & FMODE_READ))) - return -EBADF; + if (!req->io) { + ret = io_read_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; + } else { + ret = io_import_iovec(READ, req, &iovec, &iter); + if (ret < 0) + return ret; + } - ret = io_import_iovec(READ, req, &iovec, &iter); - if (ret < 0) - return ret; + /* Ensure we clear previously set non-block flag */ + if (!force_nonblock) + req->rw.ki_flags &= ~IOCB_NOWAIT; - read_size = ret; + file = req->file; + io_size = ret; if (req->flags & REQ_F_LINK) - req->result = read_size; + req->result = io_size; + + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(file)) { + req->flags |= REQ_F_MUST_PUNT; + goto copy_iov; + } iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); @@ -1703,18 +1855,41 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, */ if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && (req->flags & REQ_F_ISREG) && - ret2 > 0 && ret2 < read_size) + ret2 > 0 && ret2 < io_size) ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || ret2 != -EAGAIN) + if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); - else - ret = -EAGAIN; + } else { +copy_iov: + ret = io_setup_async_rw(req, io_size, iovec, + inline_vecs, &iter); + if (ret) + goto out_free; + return -EAGAIN; + } } - kfree(iovec); +out_free: + if (!io_wq_current_is_worker()) + kfree(iovec); return ret; } +static int io_write_prep(struct io_kiocb *req, struct iovec **iovec, + struct iov_iter *iter, bool force_nonblock) +{ + ssize_t ret; + + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; + + if (unlikely(!(req->file->f_mode & FMODE_WRITE))) + return -EBADF; + + return io_import_iovec(WRITE, req, iovec, iter); +} + static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { @@ -1723,29 +1898,42 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t ret; - - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; + ssize_t ret, io_size; - file = kiocb->ki_filp; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - return -EBADF; + if (!req->io) { + ret = io_write_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; + } else { + ret = io_import_iovec(WRITE, req, &iovec, &iter); + if (ret < 0) + return ret; + } - ret = io_import_iovec(WRITE, req, &iovec, &iter); - if (ret < 0) - return ret; + /* Ensure we clear previously set non-block flag */ + if (!force_nonblock) + req->rw.ki_flags &= ~IOCB_NOWAIT; + file = kiocb->ki_filp; + io_size = ret; if (req->flags & REQ_F_LINK) - req->result = ret; + req->result = io_size; - iov_count = iov_iter_count(&iter); + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(req->file)) { + req->flags |= REQ_F_MUST_PUNT; + goto copy_iov; + } - ret = -EAGAIN; - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) - goto out_free; + /* file path doesn't support NOWAIT for non-direct_IO */ + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && + (req->flags & REQ_F_ISREG)) + goto copy_iov; + iov_count = iov_iter_count(&iter); ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; @@ -1769,13 +1957,20 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, ret2 = call_write_iter(file, kiocb, &iter); else ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); - if (!force_nonblock || ret2 != -EAGAIN) + if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); - else - ret = -EAGAIN; + } else { +copy_iov: + ret = io_setup_async_rw(req, io_size, iovec, + inline_vecs, &iter); + if (ret) + goto out_free; + return -EAGAIN; + } } out_free: - kfree(iovec); + if (!io_wq_current_is_worker()) + kfree(iovec); return ret; } @@ -1794,10 +1989,13 @@ static int io_nop(struct io_kiocb *req) return 0; } -static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_prep_fsync(struct io_kiocb *req) { + const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; + if (req->flags & REQ_F_PREPPED) + return 0; if (!req->file) return -EBADF; @@ -1806,46 +2004,80 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; + req->sync.flags = READ_ONCE(sqe->fsync_flags); + if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) + return -EINVAL; + + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->len); + req->flags |= REQ_F_PREPPED; return 0; } -static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static bool io_req_cancelled(struct io_kiocb *req) +{ + if (req->work.flags & IO_WQ_WORK_CANCEL) { + req_set_fail_links(req); + io_cqring_add_event(req, -ECANCELED); + io_put_req(req); + return true; + } + + return false; +} + +static void io_fsync_finish(struct io_wq_work **workptr) { - loff_t sqe_off = READ_ONCE(sqe->off); - loff_t sqe_len = READ_ONCE(sqe->len); - loff_t end = sqe_off + sqe_len; - unsigned fsync_flags; + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + loff_t end = req->sync.off + req->sync.len; + struct io_kiocb *nxt = NULL; int ret; - fsync_flags = READ_ONCE(sqe->fsync_flags); - if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC)) - return -EINVAL; + if (io_req_cancelled(req)) + return; + + ret = vfs_fsync_range(req->rw.ki_filp, req->sync.off, + end > 0 ? end : LLONG_MAX, + req->sync.flags & IORING_FSYNC_DATASYNC); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + *workptr = &nxt->work; +} - ret = io_prep_fsync(req, sqe); +static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_wq_work *work, *old_work; + int ret; + + ret = io_prep_fsync(req); if (ret) return ret; /* fsync always requires a blocking context */ - if (force_nonblock) + if (force_nonblock) { + io_put_req(req); + req->work.func = io_fsync_finish; return -EAGAIN; + } - ret = vfs_fsync_range(req->rw.ki_filp, sqe_off, - end > 0 ? end : LLONG_MAX, - fsync_flags & IORING_FSYNC_DATASYNC); - - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + work = old_work = &req->work; + io_fsync_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); return 0; } -static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_prep_sfr(struct io_kiocb *req) { + const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - int ret = 0; + if (req->flags & REQ_F_PREPPED) + return 0; if (!req->file) return -EBADF; @@ -1854,46 +2086,91 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; - return ret; + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->len); + req->sync.flags = READ_ONCE(sqe->sync_range_flags); + req->flags |= REQ_F_PREPPED; + return 0; +} + +static void io_sync_file_range_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + int ret; + + if (io_req_cancelled(req)) + return; + + ret = sync_file_range(req->rw.ki_filp, req->sync.off, req->sync.len, + req->sync.flags); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + *workptr = &nxt->work; } -static int io_sync_file_range(struct io_kiocb *req, - const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, +static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { - loff_t sqe_off; - loff_t sqe_len; - unsigned flags; + struct io_wq_work *work, *old_work; int ret; - ret = io_prep_sfr(req, sqe); + ret = io_prep_sfr(req); if (ret) return ret; /* sync_file_range always requires a blocking context */ - if (force_nonblock) + if (force_nonblock) { + io_put_req(req); + req->work.func = io_sync_file_range_finish; return -EAGAIN; + } + + work = old_work = &req->work; + io_sync_file_range_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + return 0; +} - sqe_off = READ_ONCE(sqe->off); - sqe_len = READ_ONCE(sqe->len); - flags = READ_ONCE(sqe->sync_range_flags); +#if defined(CONFIG_NET) +static void io_sendrecv_async(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct iovec *iov = NULL; - ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags); + if (req->io->rw.iov != req->io->rw.fast_iov) + iov = req->io->msg.iov; + io_wq_submit_work(workptr); + kfree(iov); +} +#endif - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); +static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ +#if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; + struct user_msghdr __user *msg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + io->msg.iov = io->msg.fast_iov; + return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); +#else return 0; +#endif } -#if defined(CONFIG_NET) -static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock, - long (*fn)(struct socket *, struct user_msghdr __user *, - unsigned int)) +static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { +#if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; + struct io_async_msghdr *kmsg = NULL; struct socket *sock; int ret; @@ -1902,7 +2179,8 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, sock = sock_from_file(req->file, &ret); if (sock) { - struct user_msghdr __user *msg; + struct io_async_ctx io; + struct sockaddr_storage addr; unsigned flags; flags = READ_ONCE(sqe->msg_flags); @@ -1911,85 +2189,238 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, else if (force_nonblock) flags |= MSG_DONTWAIT; - msg = (struct user_msghdr __user *) (unsigned long) - READ_ONCE(sqe->addr); + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + kmsg = &io.msg; + kmsg->msg.msg_name = &addr; + ret = io_sendmsg_prep(req, &io); + if (ret) + goto out; + } - ret = fn(sock, msg, flags); - if (force_nonblock && ret == -EAGAIN) - return ret; + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); + if (force_nonblock && ret == -EAGAIN) { + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) + return -ENOMEM; + memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); + req->work.func = io_sendrecv_async; + return -EAGAIN; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; } +out: + if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); io_cqring_add_event(req, ret); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, nxt); return 0; -} +#else + return -EOPNOTSUPP; #endif +} -static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) - return io_send_recvmsg(req, sqe, nxt, force_nonblock, - __sys_sendmsg_sock); + const struct io_uring_sqe *sqe = req->sqe; + struct user_msghdr __user *msg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + io->msg.iov = io->msg.fast_iov; + return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, + &io->msg.iov); #else - return -EOPNOTSUPP; + return 0; #endif } -static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { #if defined(CONFIG_NET) - return io_send_recvmsg(req, sqe, nxt, force_nonblock, - __sys_recvmsg_sock); + const struct io_uring_sqe *sqe = req->sqe; + struct io_async_msghdr *kmsg = NULL; + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct user_msghdr __user *msg; + struct io_async_ctx io; + struct sockaddr_storage addr; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + msg = (struct user_msghdr __user *) (unsigned long) + READ_ONCE(sqe->addr); + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + kmsg = &io.msg; + kmsg->msg.msg_name = &addr; + ret = io_recvmsg_prep(req, &io); + if (ret) + goto out; + } + + ret = __sys_recvmsg_sock(sock, &kmsg->msg, msg, kmsg->uaddr, flags); + if (force_nonblock && ret == -EAGAIN) { + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) + return -ENOMEM; + memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); + req->work.func = io_sendrecv_async; + return -EAGAIN; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + +out: + if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; #else return -EOPNOTSUPP; #endif } -static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_accept_prep(struct io_kiocb *req) { #if defined(CONFIG_NET) - struct sockaddr __user *addr; - int __user *addr_len; - unsigned file_flags; - int flags, ret; + const struct io_uring_sqe *sqe = req->sqe; + struct io_accept *accept = &req->accept; + + if (req->flags & REQ_F_PREPPED) + return 0; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); - addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); - flags = READ_ONCE(sqe->accept_flags); - file_flags = force_nonblock ? O_NONBLOCK : 0; + accept->addr = (struct sockaddr __user *) + (unsigned long) READ_ONCE(sqe->addr); + accept->addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); + accept->flags = READ_ONCE(sqe->accept_flags); + req->flags |= REQ_F_PREPPED; + return 0; +#else + return -EOPNOTSUPP; +#endif +} - ret = __sys_accept4_file(req->file, file_flags, addr, addr_len, flags); - if (ret == -EAGAIN && force_nonblock) { - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; +#if defined(CONFIG_NET) +static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_accept *accept = &req->accept; + unsigned file_flags; + int ret; + + file_flags = force_nonblock ? O_NONBLOCK : 0; + ret = __sys_accept4_file(req->file, file_flags, accept->addr, + accept->addr_len, accept->flags); + if (ret == -EAGAIN && force_nonblock) return -EAGAIN; - } if (ret == -ERESTARTSYS) ret = -EINTR; - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req_find_next(req, nxt); return 0; +} + +static void io_accept_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + if (io_req_cancelled(req)) + return; + __io_accept(req, &nxt, false); + if (nxt) + *workptr = &nxt->work; +} +#endif + +static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + int ret; + + ret = io_accept_prep(req); + if (ret) + return ret; + + ret = __io_accept(req, nxt, force_nonblock); + if (ret == -EAGAIN && force_nonblock) { + req->work.func = io_accept_finish; + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + io_put_req(req); + return -EAGAIN; + } + return 0; #else return -EOPNOTSUPP; #endif } -static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; struct sockaddr __user *addr; + int addr_len; + + addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); + addr_len = READ_ONCE(sqe->addr2); + return move_addr_to_kernel(addr, addr_len, &io->connect.address); +#else + return 0; +#endif +} + +static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; + struct io_async_ctx __io, *io; unsigned file_flags; int addr_len, ret; @@ -1998,17 +2429,35 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) return -EINVAL; - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); addr_len = READ_ONCE(sqe->addr2); file_flags = force_nonblock ? O_NONBLOCK : 0; - ret = __sys_connect_file(req->file, addr, addr_len, file_flags); - if (ret == -EAGAIN && force_nonblock) + if (req->io) { + io = req->io; + } else { + ret = io_connect_prep(req, &__io); + if (ret) + goto out; + io = &__io; + } + + ret = __sys_connect_file(req->file, &io->connect.address, addr_len, + file_flags); + if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) { + ret = -ENOMEM; + goto out; + } + memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect)); return -EAGAIN; + } if (ret == -ERESTARTSYS) ret = -EINTR; - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; +out: + if (ret < 0) + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req_find_next(req, nxt); return 0; @@ -2017,55 +2466,45 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, #endif } -static inline void io_poll_remove_req(struct io_kiocb *req) -{ - if (!RB_EMPTY_NODE(&req->rb_node)) { - rb_erase(&req->rb_node, &req->ctx->cancel_tree); - RB_CLEAR_NODE(&req->rb_node); - } -} - static void io_poll_remove_one(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; spin_lock(&poll->head->lock); WRITE_ONCE(poll->canceled, true); - if (!list_empty(&poll->wait->entry)) { - list_del_init(&poll->wait->entry); + if (!list_empty(&poll->wait.entry)) { + list_del_init(&poll->wait.entry); io_queue_async_work(req); } spin_unlock(&poll->head->lock); - io_poll_remove_req(req); + hash_del(&req->hash_node); } static void io_poll_remove_all(struct io_ring_ctx *ctx) { - struct rb_node *node; + struct hlist_node *tmp; struct io_kiocb *req; + int i; spin_lock_irq(&ctx->completion_lock); - while ((node = rb_first(&ctx->cancel_tree)) != NULL) { - req = rb_entry(node, struct io_kiocb, rb_node); - io_poll_remove_one(req); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(req, tmp, list, hash_node) + io_poll_remove_one(req); } spin_unlock_irq(&ctx->completion_lock); } static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) { - struct rb_node *p, *parent = NULL; + struct hlist_head *list; struct io_kiocb *req; - p = ctx->cancel_tree.rb_node; - while (p) { - parent = p; - req = rb_entry(parent, struct io_kiocb, rb_node); - if (sqe_addr < req->user_data) { - p = p->rb_left; - } else if (sqe_addr > req->user_data) { - p = p->rb_right; - } else { + list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; + hlist_for_each_entry(req, list, hash_node) { + if (sqe_addr == req->user_data) { io_poll_remove_one(req); return 0; } @@ -2074,28 +2513,45 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) return -ENOENT; } +static int io_poll_remove_prep(struct io_kiocb *req) +{ + const struct io_uring_sqe *sqe = req->sqe; + + if (req->flags & REQ_F_PREPPED) + return 0; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || + sqe->poll_events) + return -EINVAL; + + req->poll.addr = READ_ONCE(sqe->addr); + req->flags |= REQ_F_PREPPED; + return 0; +} + /* * Find a running poll command that matches one specified in sqe->addr, * and remove it if found. */ -static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_poll_remove(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + u64 addr; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || - sqe->poll_events) - return -EINVAL; + ret = io_poll_remove_prep(req); + if (ret) + return ret; + addr = req->poll.addr; spin_lock_irq(&ctx->completion_lock); - ret = io_poll_cancel(ctx, READ_ONCE(sqe->addr)); + ret = io_poll_cancel(ctx, addr); spin_unlock_irq(&ctx->completion_lock); io_cqring_add_event(req, ret); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req(req); return 0; } @@ -2105,7 +2561,6 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) struct io_ring_ctx *ctx = req->ctx; req->poll.done = true; - kfree(req->poll.wait); if (error) io_cqring_fill_event(req, error); else @@ -2143,18 +2598,18 @@ static void io_poll_complete_work(struct io_wq_work **workptr) */ spin_lock_irq(&ctx->completion_lock); if (!mask && ret != -ECANCELED) { - add_wait_queue(poll->head, poll->wait); + add_wait_queue(poll->head, &poll->wait); spin_unlock_irq(&ctx->completion_lock); return; } - io_poll_remove_req(req); + hash_del(&req->hash_node); io_poll_complete(req, mask, ret); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - if (ret < 0 && req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, &nxt); if (nxt) *workptr = &nxt->work; @@ -2173,7 +2628,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && !(mask & poll->events)) return 0; - list_del_init(&poll->wait->entry); + list_del_init(&poll->wait.entry); /* * Run completion inline if we can. We're using trylock here because @@ -2182,7 +2637,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * for finalizing the request, mark us as having grabbed that already. */ if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { - io_poll_remove_req(req); + hash_del(&req->hash_node); io_poll_complete(req, mask, 0); req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); @@ -2214,38 +2669,26 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, pt->error = 0; pt->req->poll.head = head; - add_wait_queue(head, pt->req->poll.wait); + add_wait_queue(head, &pt->req->poll.wait); } static void io_poll_req_insert(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct rb_node **p = &ctx->cancel_tree.rb_node; - struct rb_node *parent = NULL; - struct io_kiocb *tmp; - - while (*p) { - parent = *p; - tmp = rb_entry(parent, struct io_kiocb, rb_node); - if (req->user_data < tmp->user_data) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - rb_link_node(&req->rb_node, parent, p); - rb_insert_color(&req->rb_node, &ctx->cancel_tree); + struct hlist_head *list; + + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); } -static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt) +static int io_poll_add_prep(struct io_kiocb *req) { + const struct io_uring_sqe *sqe = req->sqe; struct io_poll_iocb *poll = &req->poll; - struct io_ring_ctx *ctx = req->ctx; - struct io_poll_table ipt; - bool cancel = false; - __poll_t mask; u16 events; + if (req->flags & REQ_F_PREPPED) + return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) @@ -2253,15 +2696,27 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (!poll->file) return -EBADF; - poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL); - if (!poll->wait) - return -ENOMEM; - - req->sqe = NULL; - INIT_IO_WORK(&req->work, io_poll_complete_work); + req->flags |= REQ_F_PREPPED; events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - RB_CLEAR_NODE(&req->rb_node); + return 0; +} + +static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) +{ + struct io_poll_iocb *poll = &req->poll; + struct io_ring_ctx *ctx = req->ctx; + struct io_poll_table ipt; + bool cancel = false; + __poll_t mask; + int ret; + + ret = io_poll_add_prep(req); + if (ret) + return ret; + + INIT_IO_WORK(&req->work, io_poll_complete_work); + INIT_HLIST_NODE(&req->hash_node); poll->head = NULL; poll->done = false; @@ -2273,9 +2728,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ - INIT_LIST_HEAD(&poll->wait->entry); - init_waitqueue_func_entry(poll->wait, io_poll_wake); - poll->wait->private = poll; + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, io_poll_wake); + poll->wait.private = poll; INIT_LIST_HEAD(&req->list); @@ -2284,14 +2739,14 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, spin_lock_irq(&ctx->completion_lock); if (likely(poll->head)) { spin_lock(&poll->head->lock); - if (unlikely(list_empty(&poll->wait->entry))) { + if (unlikely(list_empty(&poll->wait.entry))) { if (ipt.error) cancel = true; ipt.error = 0; mask = 0; } if (mask || ipt.error) - list_del_init(&poll->wait->entry); + list_del_init(&poll->wait.entry); else if (cancel) WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an event */ @@ -2331,7 +2786,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) /* * Adjust the reqs sequence before the current one because it - * will consume a slot in the cq_ring and the the cq_tail + * will consume a slot in the cq_ring and the cq_tail * pointer will be increased, otherwise other timeout reqs may * return in advance without waiting for enough wait_nr. */ @@ -2346,8 +2801,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_put_req(req); return HRTIMER_NORESTART; } @@ -2368,49 +2822,63 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -ENOENT) return ret; - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret == -1) return -EALREADY; - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_cqring_fill_event(req, -ECANCELED); io_put_req(req); return 0; } +static int io_timeout_remove_prep(struct io_kiocb *req) +{ + const struct io_uring_sqe *sqe = req->sqe; + + if (req->flags & REQ_F_PREPPED) + return 0; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) + return -EINVAL; + + req->timeout.addr = READ_ONCE(sqe->addr); + req->timeout.flags = READ_ONCE(sqe->timeout_flags); + if (req->timeout.flags) + return -EINVAL; + + req->flags |= REQ_F_PREPPED; + return 0; +} + /* * Remove or update an existing timeout command */ -static int io_timeout_remove(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_timeout_remove(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned flags; int ret; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) - return -EINVAL; - flags = READ_ONCE(sqe->timeout_flags); - if (flags) - return -EINVAL; + ret = io_timeout_remove_prep(req); + if (ret) + return ret; spin_lock_irq(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, READ_ONCE(sqe->addr)); + ret = io_timeout_cancel(ctx, req->timeout.addr); io_cqring_fill_event(req, ret); io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - if (ret < 0 && req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req(req); return 0; } -static int io_timeout_setup(struct io_kiocb *req) +static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, + bool is_timeout_link) { const struct io_uring_sqe *sqe = req->sqe; struct io_timeout_data *data; @@ -2420,15 +2888,14 @@ static int io_timeout_setup(struct io_kiocb *req) return -EINVAL; if (sqe->ioprio || sqe->buf_index || sqe->len != 1) return -EINVAL; + if (sqe->off && is_timeout_link) + return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; - data = kzalloc(sizeof(struct io_timeout_data), GFP_KERNEL); - if (!data) - return -ENOMEM; + data = &io->timeout; data->req = req; - req->timeout.data = data; req->flags |= REQ_F_TIMEOUT; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) @@ -2443,8 +2910,9 @@ static int io_timeout_setup(struct io_kiocb *req) return 0; } -static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_timeout(struct io_kiocb *req) { + const struct io_uring_sqe *sqe = req->sqe; unsigned count; struct io_ring_ctx *ctx = req->ctx; struct io_timeout_data *data; @@ -2452,12 +2920,14 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned span = 0; int ret; - ret = io_timeout_setup(req); - /* common setup allows flags (like links) set, we don't */ - if (!ret && sqe->flags) - ret = -EINVAL; - if (ret) - return ret; + if (!req->io) { + if (io_alloc_async_ctx(req)) + return -ENOMEM; + ret = io_timeout_prep(req, req->io, false); + if (ret) + return ret; + } + data = &req->io->timeout; /* * sqe->off holds how many events that need to occur for this @@ -2473,7 +2943,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->sequence = ctx->cached_sq_head + count - 1; - req->timeout.data->seq_offset = count; + data->seq_offset = count; /* * Insertion sort, ensuring the first entry in the list is always @@ -2484,7 +2954,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); unsigned nxt_sq_head; long long tmp, tmp_nxt; - u32 nxt_offset = nxt->timeout.data->seq_offset; + u32 nxt_offset = nxt->io->timeout.seq_offset; if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) continue; @@ -2517,7 +2987,6 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->sequence -= span; add: list_add(&req->list, entry); - data = req->timeout.data; data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->completion_lock); @@ -2578,50 +3047,141 @@ done: spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, nxt); } -static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt) +static int io_async_cancel_prep(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; + const struct io_uring_sqe *sqe = req->sqe; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + if (req->flags & REQ_F_PREPPED) + return 0; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags) return -EINVAL; - io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0); + req->flags |= REQ_F_PREPPED; + req->cancel.addr = READ_ONCE(sqe->addr); + return 0; +} + +static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + ret = io_async_cancel_prep(req); + if (ret) + return ret; + + io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0); return 0; } +static int io_req_defer_prep(struct io_kiocb *req) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct io_async_ctx *io = req->io; + struct iov_iter iter; + ssize_t ret = 0; + + switch (req->opcode) { + case IORING_OP_NOP: + break; + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + /* ensure prep does right import */ + req->io = NULL; + ret = io_read_prep(req, &iovec, &iter, true); + req->io = io; + if (ret < 0) + break; + io_req_map_rw(req, ret, iovec, inline_vecs, &iter); + ret = 0; + break; + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + /* ensure prep does right import */ + req->io = NULL; + ret = io_write_prep(req, &iovec, &iter, true); + req->io = io; + if (ret < 0) + break; + io_req_map_rw(req, ret, iovec, inline_vecs, &iter); + ret = 0; + break; + case IORING_OP_POLL_ADD: + ret = io_poll_add_prep(req); + break; + case IORING_OP_POLL_REMOVE: + ret = io_poll_remove_prep(req); + break; + case IORING_OP_FSYNC: + ret = io_prep_fsync(req); + break; + case IORING_OP_SYNC_FILE_RANGE: + ret = io_prep_sfr(req); + break; + case IORING_OP_SENDMSG: + ret = io_sendmsg_prep(req, io); + break; + case IORING_OP_RECVMSG: + ret = io_recvmsg_prep(req, io); + break; + case IORING_OP_CONNECT: + ret = io_connect_prep(req, io); + break; + case IORING_OP_TIMEOUT: + ret = io_timeout_prep(req, io, false); + break; + case IORING_OP_TIMEOUT_REMOVE: + ret = io_timeout_remove_prep(req); + break; + case IORING_OP_ASYNC_CANCEL: + ret = io_async_cancel_prep(req); + break; + case IORING_OP_LINK_TIMEOUT: + ret = io_timeout_prep(req, io, true); + break; + case IORING_OP_ACCEPT: + ret = io_accept_prep(req); + break; + default: + printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", + req->opcode); + ret = -EINVAL; + break; + } + + return ret; +} + static int io_req_defer(struct io_kiocb *req) { - struct io_uring_sqe *sqe_copy; struct io_ring_ctx *ctx = req->ctx; + int ret; /* Still need defer if there is pending req in defer list. */ if (!req_need_defer(req) && list_empty(&ctx->defer_list)) return 0; - sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); - if (!sqe_copy) + if (io_alloc_async_ctx(req)) return -EAGAIN; + ret = io_req_defer_prep(req); + if (ret < 0) + return ret; + spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); - kfree(sqe_copy); return 0; } - memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy)); - req->flags |= REQ_F_FREE_SQE; - req->sqe = sqe_copy; - trace_io_uring_defer(ctx, req, req->user_data); list_add_tail(&req->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); @@ -2632,11 +3192,10 @@ __attribute__((nonnull)) static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { - int ret, opcode; struct io_ring_ctx *ctx = req->ctx; + int ret; - opcode = READ_ONCE(req->sqe->opcode); - switch (opcode) { + switch (req->opcode) { case IORING_OP_NOP: ret = io_nop(req); break; @@ -2657,37 +3216,37 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_FSYNC: - ret = io_fsync(req, req->sqe, nxt, force_nonblock); + ret = io_fsync(req, nxt, force_nonblock); break; case IORING_OP_POLL_ADD: - ret = io_poll_add(req, req->sqe, nxt); + ret = io_poll_add(req, nxt); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, req->sqe); + ret = io_poll_remove(req); break; case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock); + ret = io_sync_file_range(req, nxt, force_nonblock); break; case IORING_OP_SENDMSG: - ret = io_sendmsg(req, req->sqe, nxt, force_nonblock); + ret = io_sendmsg(req, nxt, force_nonblock); break; case IORING_OP_RECVMSG: - ret = io_recvmsg(req, req->sqe, nxt, force_nonblock); + ret = io_recvmsg(req, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: - ret = io_timeout(req, req->sqe); + ret = io_timeout(req); break; case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove(req, req->sqe); + ret = io_timeout_remove(req); break; case IORING_OP_ACCEPT: - ret = io_accept(req, req->sqe, nxt, force_nonblock); + ret = io_accept(req, nxt, force_nonblock); break; case IORING_OP_CONNECT: - ret = io_connect(req, req->sqe, nxt, force_nonblock); + ret = io_connect(req, nxt, force_nonblock); break; case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel(req, req->sqe, nxt); + ret = io_async_cancel(req, nxt); break; default: ret = -EINVAL; @@ -2701,12 +3260,7 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, if (req->result == -EAGAIN) return -EAGAIN; - /* workqueue context doesn't hold uring_lock, grab it now */ - if (req->in_async) - mutex_lock(&ctx->uring_lock); io_iopoll_req_issued(req); - if (req->in_async) - mutex_unlock(&ctx->uring_lock); } return 0; @@ -2728,9 +3282,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) struct io_kiocb *nxt = NULL; int ret = 0; - /* Ensure we clear previously set non-block flag */ - req->rw.ki_flags &= ~IOCB_NOWAIT; - if (work->flags & IO_WQ_WORK_CANCEL) ret = -ECANCELED; @@ -2754,8 +3305,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_put_req(req); if (ret) { - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req(req); } @@ -2774,20 +3324,25 @@ static void io_wq_submit_work(struct io_wq_work **workptr) } } -static bool io_op_needs_file(const struct io_uring_sqe *sqe) +static bool io_req_op_valid(int op) { - int op = READ_ONCE(sqe->opcode); + return op >= IORING_OP_NOP && op < IORING_OP_LAST; +} - switch (op) { +static int io_req_needs_file(struct io_kiocb *req) +{ + switch (req->opcode) { case IORING_OP_NOP: case IORING_OP_POLL_REMOVE: case IORING_OP_TIMEOUT: case IORING_OP_TIMEOUT_REMOVE: case IORING_OP_ASYNC_CANCEL: case IORING_OP_LINK_TIMEOUT: - return false; + return 0; default: - return true; + if (io_req_op_valid(req->opcode)) + return 1; + return -EINVAL; } } @@ -2804,7 +3359,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; unsigned flags; - int fd; + int fd, ret; flags = READ_ONCE(req->sqe->flags); fd = READ_ONCE(req->sqe->fd); @@ -2812,8 +3367,9 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req) if (flags & IOSQE_IO_DRAIN) req->flags |= REQ_F_IO_DRAIN; - if (!io_op_needs_file(req->sqe)) - return 0; + ret = io_req_needs_file(req); + if (ret <= 0) + return ret; if (flags & IOSQE_FIXED_FILE) { if (unlikely(!ctx->file_table || @@ -2876,10 +3432,11 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) * We don't expect the list to be empty, that will only happen if we * race with the completion of the linked work. */ - if (!list_empty(&req->list)) { - prev = list_entry(req->list.prev, struct io_kiocb, link_list); + if (!list_empty(&req->link_list)) { + prev = list_entry(req->link_list.prev, struct io_kiocb, + link_list); if (refcount_inc_not_zero(&prev->refs)) { - list_del_init(&req->list); + list_del_init(&req->link_list); prev->flags &= ~REQ_F_LINK_TIMEOUT; } else prev = NULL; @@ -2888,8 +3445,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); if (prev) { - if (prev->flags & REQ_F_LINK) - prev->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(prev); io_async_find_and_cancel(ctx, req, prev->user_data, NULL, -ETIME); io_put_req(prev); @@ -2909,8 +3465,8 @@ static void io_queue_linked_timeout(struct io_kiocb *req) * we got a chance to setup the timer */ spin_lock_irq(&ctx->completion_lock); - if (!list_empty(&req->list)) { - struct io_timeout_data *data = req->timeout.data; + if (!list_empty(&req->link_list)) { + struct io_timeout_data *data = &req->io->timeout; data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), @@ -2929,8 +3485,9 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) if (!(req->flags & REQ_F_LINK)) return NULL; - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); - if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT) + nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, + link_list); + if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT) return NULL; req->flags |= REQ_F_LINK_TIMEOUT; @@ -2939,13 +3496,14 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) static void __io_queue_sqe(struct io_kiocb *req) { - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *linked_timeout; struct io_kiocb *nxt = NULL; int ret; +again: + linked_timeout = io_prep_linked_timeout(req); + ret = io_issue_sqe(req, &nxt, true); - if (nxt) - io_queue_async_work(nxt); /* * We async punt it if the file wasn't marked NOWAIT, or if the file @@ -2953,15 +3511,6 @@ static void __io_queue_sqe(struct io_kiocb *req) */ if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || (req->flags & REQ_F_MUST_PUNT))) { - struct io_uring_sqe *sqe_copy; - - sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL); - if (!sqe_copy) - goto err; - - req->sqe = sqe_copy; - req->flags |= REQ_F_FREE_SQE; - if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { ret = io_grab_files(req); if (ret) @@ -2973,7 +3522,7 @@ static void __io_queue_sqe(struct io_kiocb *req) * submit reference when the iocb is actually submitted. */ io_queue_async_work(req); - return; + goto done_req; } err: @@ -2990,10 +3539,15 @@ err: /* and drop final reference, if we failed */ if (ret) { io_cqring_add_event(req, ret); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_put_req(req); } +done_req: + if (nxt) { + req = nxt; + nxt = NULL; + goto again; + } } static void io_queue_sqe(struct io_kiocb *req) @@ -3010,8 +3564,7 @@ static void io_queue_sqe(struct io_kiocb *req) if (ret) { if (ret != -EIOCBQUEUED) { io_cqring_add_event(req, ret); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_double_put_req(req); } } else @@ -3027,17 +3580,15 @@ static inline void io_queue_link_head(struct io_kiocb *req) io_queue_sqe(req); } +#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ + IOSQE_IO_HARDLINK) -#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) - -static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, +static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, struct io_kiocb **link) { struct io_ring_ctx *ctx = req->ctx; int ret; - req->user_data = req->sqe->user_data; - /* enforce forwards compatibility on users */ if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) { ret = -EINVAL; @@ -3049,7 +3600,7 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state, err_req: io_cqring_add_event(req, ret); io_double_put_req(req); - return; + return false; } /* @@ -3061,40 +3612,38 @@ err_req: */ if (*link) { struct io_kiocb *prev = *link; - struct io_uring_sqe *sqe_copy; if (req->sqe->flags & IOSQE_IO_DRAIN) (*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN; - if (READ_ONCE(req->sqe->opcode) == IORING_OP_LINK_TIMEOUT) { - ret = io_timeout_setup(req); - /* common setup allows offset being set, we don't */ - if (!ret && req->sqe->off) - ret = -EINVAL; - if (ret) { - prev->flags |= REQ_F_FAIL_LINK; - goto err_req; - } - } + if (req->sqe->flags & IOSQE_IO_HARDLINK) + req->flags |= REQ_F_HARDLINK; - sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL); - if (!sqe_copy) { + if (io_alloc_async_ctx(req)) { ret = -EAGAIN; goto err_req; } - req->sqe = sqe_copy; - req->flags |= REQ_F_FREE_SQE; + ret = io_req_defer_prep(req); + if (ret) { + /* fail even hard links since we don't submit */ + prev->flags |= REQ_F_FAIL_LINK; + goto err_req; + } trace_io_uring_link(ctx, req, prev); - list_add_tail(&req->list, &prev->link_list); - } else if (req->sqe->flags & IOSQE_IO_LINK) { + list_add_tail(&req->link_list, &prev->link_list); + } else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) { req->flags |= REQ_F_LINK; + if (req->sqe->flags & IOSQE_IO_HARDLINK) + req->flags |= REQ_F_HARDLINK; INIT_LIST_HEAD(&req->link_list); *link = req; } else { io_queue_sqe(req); } + + return true; } /* @@ -3113,7 +3662,7 @@ static void io_submit_state_end(struct io_submit_state *state) * Start submission side cache. */ static void io_submit_state_start(struct io_submit_state *state, - struct io_ring_ctx *ctx, unsigned max_ios) + unsigned int max_ios) { blk_start_plug(&state->plug); state->free_reqs = 0; @@ -3136,7 +3685,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) } /* - * Fetch an sqe, if one is available. Note that s->sqe will point to memory + * Fetch an sqe, if one is available. Note that req->sqe will point to memory * that is mapped by userspace. This means that care needs to be taken to * ensure that reads are stable, as we cannot rely on userspace always * being a good citizen. If members of the sqe are validated and then later @@ -3171,6 +3720,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req) */ req->sequence = ctx->cached_sq_head; req->sqe = &ctx->sq_sqes[head]; + req->opcode = READ_ONCE(req->sqe->opcode); + req->user_data = READ_ONCE(req->sqe->user_data); ctx->cached_sq_head++; return true; } @@ -3197,7 +3748,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, return -EBUSY; if (nr > IO_PLUG_THRESHOLD) { - io_submit_state_start(&state, ctx, nr); + io_submit_state_start(&state, nr); statep = &state; } @@ -3216,7 +3767,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, break; } - if (io_sqe_needs_user(req->sqe) && !*mm) { + if (io_req_needs_user(req) && !*mm) { mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -3224,6 +3775,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, } } + submitted++; sqe_flags = req->sqe->flags; req->ring_file = ring_file; @@ -3231,16 +3783,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, req->has_user = *mm != NULL; req->in_async = async; req->needs_fixed_file = async; - trace_io_uring_submit_sqe(ctx, req->sqe->user_data, - true, async); - io_submit_sqe(req, statep, &link); - submitted++; - + trace_io_uring_submit_sqe(ctx, req->user_data, true, async); + if (!io_submit_sqe(req, statep, &link)) + break; /* * If previous wasn't linked and we have a linked command, * that's the end of the chain. Submit the previous link. */ - if (!(sqe_flags & IOSQE_IO_LINK) && link) { + if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) { io_queue_link_head(link); link = NULL; } @@ -3369,7 +3919,9 @@ static int io_sq_thread(void *data) } to_submit = min(to_submit, ctx->sq_entries); + mutex_lock(&ctx->uring_lock); ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); + mutex_unlock(&ctx->uring_lock); if (ret > 0) inflight += ret; } @@ -3398,7 +3950,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush) struct io_ring_ctx *ctx = iowq->ctx; /* - * Wake up if we have enough events, or if a timeout occured since we + * Wake up if we have enough events, or if a timeout occurred since we * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ @@ -4363,6 +4915,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) free_uid(ctx->user); put_cred(ctx->creds); kfree(ctx->completions); + kfree(ctx->cancel_hash); kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx); } @@ -4587,6 +5140,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, submitted = io_submit_sqes(ctx, to_submit, f.file, fd, &cur_mm, false); mutex_unlock(&ctx->uring_lock); + + if (submitted != to_submit) + goto out; } if (flags & IORING_ENTER_GETEVENTS) { unsigned nr_events = 0; @@ -4600,6 +5156,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } } +out: percpu_ref_put(&ctx->refs); out_fput: fdput(f); @@ -4759,7 +5316,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) ctx->compat = in_compat_syscall(); ctx->account_mem = account_mem; ctx->user = user; - ctx->creds = prepare_creds(); + ctx->creds = get_current_cred(); ret = io_allocate_scq_urings(ctx, p); if (ret) @@ -4794,7 +5351,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) if (ret < 0) goto err; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP; + p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | + IORING_FEAT_SUBMIT_STABLE; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d33c7bc5ee92..828444e14d09 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -28,6 +28,7 @@ struct iomap_page { atomic_t read_count; atomic_t write_count; + spinlock_t uptodate_lock; DECLARE_BITMAP(uptodate, PAGE_SIZE / 512); }; @@ -51,6 +52,7 @@ iomap_page_create(struct inode *inode, struct page *page) iop = kmalloc(sizeof(*iop), GFP_NOFS | __GFP_NOFAIL); atomic_set(&iop->read_count, 0); atomic_set(&iop->write_count, 0); + spin_lock_init(&iop->uptodate_lock); bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE); /* @@ -139,25 +141,38 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, } static void -iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len) +iomap_iop_set_range_uptodate(struct page *page, unsigned off, unsigned len) { struct iomap_page *iop = to_iomap_page(page); struct inode *inode = page->mapping->host; unsigned first = off >> inode->i_blkbits; unsigned last = (off + len - 1) >> inode->i_blkbits; - unsigned int i; bool uptodate = true; + unsigned long flags; + unsigned int i; - if (iop) { - for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) { - if (i >= first && i <= last) - set_bit(i, iop->uptodate); - else if (!test_bit(i, iop->uptodate)) - uptodate = false; - } + spin_lock_irqsave(&iop->uptodate_lock, flags); + for (i = 0; i < PAGE_SIZE / i_blocksize(inode); i++) { + if (i >= first && i <= last) + set_bit(i, iop->uptodate); + else if (!test_bit(i, iop->uptodate)) + uptodate = false; } - if (uptodate && !PageError(page)) + if (uptodate) + SetPageUptodate(page); + spin_unlock_irqrestore(&iop->uptodate_lock, flags); +} + +static void +iomap_set_range_uptodate(struct page *page, unsigned off, unsigned len) +{ + if (PageError(page)) + return; + + if (page_has_private(page)) + iomap_iop_set_range_uptodate(page, off, len); + else SetPageUptodate(page); } @@ -1128,6 +1143,7 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) struct bio *bio = &ioend->io_inline_bio; struct bio *last = ioend->io_bio, *next; u64 start = bio->bi_iter.bi_sector; + loff_t offset = ioend->io_offset; bool quiet = bio_flagged(bio, BIO_QUIET); for (bio = &ioend->io_inline_bio; bio; bio = next) { @@ -1148,12 +1164,12 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) iomap_finish_page_writeback(inode, bv->bv_page, error); bio_put(bio); } + /* The ioend has been freed by bio_put() */ if (unlikely(error && !quiet)) { printk_ratelimited(KERN_ERR "%s: writeback error on inode %lu, offset %lld, sector %llu", - inode->i_sb->s_id, inode->i_ino, ioend->io_offset, - start); + inode->i_sb->s_id, inode->i_ino, offset, start); } } diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 4d31503abaee..9dc7e7a64e10 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -223,7 +223,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, dput(dentry); return ERR_PTR(-EINVAL); } - dtmp = lookup_one_len_unlocked(kntmp->name, dentry, + dtmp = lookup_positive_unlocked(kntmp->name, dentry, strlen(kntmp->name)); dput(dentry); if (IS_ERR(dtmp)) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 7d46fafdbbe5..0afb6d59bad0 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -464,7 +464,8 @@ nlm_bind_host(struct nlm_host *host) .version = host->h_version, .authflavor = RPC_AUTH_UNIX, .flags = (RPC_CLNT_CREATE_NOPING | - RPC_CLNT_CREATE_AUTOBIND), + RPC_CLNT_CREATE_AUTOBIND | + RPC_CLNT_CREATE_REUSEPORT), .cred = host->h_cred, }; diff --git a/fs/namei.c b/fs/namei.c index 2dda552bcf7a..d6c91d1e88cb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1210,25 +1210,25 @@ static int follow_automount(struct path *path, struct nameidata *nd, * - Flagged as automount point * * This may only be called in refwalk mode. + * On success path->dentry is known positive. * * Serialization is taken care of in namespace.c */ static int follow_managed(struct path *path, struct nameidata *nd) { struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ - unsigned managed; + unsigned flags; bool need_mntput = false; int ret = 0; /* Given that we're not holding a lock here, we retain the value in a * local variable for each dentry as we look at it so that we don't see * the components of that value change under us */ - while (managed = READ_ONCE(path->dentry->d_flags), - managed &= DCACHE_MANAGED_DENTRY, - unlikely(managed != 0)) { + while (flags = smp_load_acquire(&path->dentry->d_flags), + unlikely(flags & DCACHE_MANAGED_DENTRY)) { /* Allow the filesystem to manage the transit without i_mutex * being held. */ - if (managed & DCACHE_MANAGE_TRANSIT) { + if (flags & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); ret = path->dentry->d_op->d_manage(path, false); @@ -1237,7 +1237,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) } /* Transit to a mounted filesystem. */ - if (managed & DCACHE_MOUNTED) { + if (flags & DCACHE_MOUNTED) { struct vfsmount *mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); @@ -1256,7 +1256,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) } /* Handle an automount point */ - if (managed & DCACHE_NEED_AUTOMOUNT) { + if (flags & DCACHE_NEED_AUTOMOUNT) { ret = follow_automount(path, nd, &need_mntput); if (ret < 0) break; @@ -1269,10 +1269,12 @@ static int follow_managed(struct path *path, struct nameidata *nd) if (need_mntput && path->mnt == mnt) mntput(path->mnt); - if (ret == -EISDIR || !ret) - ret = 1; if (need_mntput) nd->flags |= LOOKUP_JUMPED; + if (ret == -EISDIR || !ret) + ret = 1; + if (ret > 0 && unlikely(d_flags_negative(flags))) + ret = -ENOENT; if (unlikely(ret < 0)) path_put_conditional(path, nd); return ret; @@ -1621,10 +1623,6 @@ static int lookup_fast(struct nameidata *nd, dput(dentry); return status; } - if (unlikely(d_is_negative(dentry))) { - dput(dentry); - return -ENOENT; - } path->mnt = mnt; path->dentry = dentry; @@ -1811,11 +1809,6 @@ static int walk_component(struct nameidata *nd, int flags) if (unlikely(err < 0)) return err; - if (unlikely(d_is_negative(path.dentry))) { - path_to_nameidata(&path, nd); - return -ENOENT; - } - seq = 0; /* we are already out of RCU mode */ inode = d_backing_inode(path.dentry); } @@ -2568,6 +2561,26 @@ struct dentry *lookup_one_len_unlocked(const char *name, } EXPORT_SYMBOL(lookup_one_len_unlocked); +/* + * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT) + * on negatives. Returns known positive or ERR_PTR(); that's what + * most of the users want. Note that pinned negative with unlocked parent + * _can_ become positive at any time, so callers of lookup_one_len_unlocked() + * need to be very careful; pinned positives have ->d_inode stable, so + * this one avoids such problems. + */ +struct dentry *lookup_positive_unlocked(const char *name, + struct dentry *base, int len) +{ + struct dentry *ret = lookup_one_len_unlocked(name, base, len); + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; +} +EXPORT_SYMBOL(lookup_positive_unlocked); + #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { @@ -2662,7 +2675,7 @@ mountpoint_last(struct nameidata *nd) return PTR_ERR(path.dentry); } } - if (d_is_negative(path.dentry)) { + if (d_flags_negative(smp_load_acquire(&path.dentry->d_flags))) { dput(path.dentry); return -ENOENT; } @@ -3356,11 +3369,6 @@ static int do_last(struct nameidata *nd, if (unlikely(error < 0)) return error; - if (unlikely(d_is_negative(path.dentry))) { - path_to_nameidata(&path, nd); - return -ENOENT; - } - /* * create/update audit record if it already exists. */ diff --git a/fs/namespace.c b/fs/namespace.c index 2adfe7b166a3..be601d3a8008 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2356,7 +2356,7 @@ static struct file *open_detached_copy(struct path *path, bool recursive) return file; } -SYSCALL_DEFINE3(open_tree, int, dfd, const char *, filename, unsigned, flags) +SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) { struct file *file; struct path path; @@ -3325,8 +3325,8 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name) } EXPORT_SYMBOL(mount_subtree); -int ksys_mount(const char __user *dev_name, const char __user *dir_name, - const char __user *type, unsigned long flags, void __user *data) +SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, + char __user *, type, unsigned long, flags, void __user *, data) { int ret; char *kernel_type; @@ -3359,12 +3359,6 @@ out_type: return ret; } -SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, - char __user *, type, unsigned long, flags, void __user *, data) -{ - return ksys_mount(dev_name, dir_name, type, flags, data); -} - /* * Create a kernel mount representation for a new, prepared superblock * (specified by fs_fd) and attach to an open_tree-like file descriptor. @@ -3514,8 +3508,8 @@ err_fsfd: * Note the flags value is a combination of MOVE_MOUNT_* flags. */ SYSCALL_DEFINE5(move_mount, - int, from_dfd, const char *, from_pathname, - int, to_dfd, const char *, to_pathname, + int, from_dfd, const char __user *, from_pathname, + int, to_dfd, const char __user *, to_pathname, unsigned int, flags) { struct path from_path, to_path; diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index 8f34daf85f70..549350259840 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -72,8 +72,8 @@ struct cb_getattrres { uint32_t bitmap[2]; uint64_t size; uint64_t change_attr; - struct timespec ctime; - struct timespec mtime; + struct timespec64 ctime; + struct timespec64 mtime; }; struct cb_recallargs { diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index f39924ba050b..cd4c6bc81cae 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -26,7 +26,6 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, struct cb_getattrargs *args = argp; struct cb_getattrres *res = resp; struct nfs_delegation *delegation; - struct nfs_inode *nfsi; struct inode *inode; res->status = htonl(NFS4ERR_OP_NOT_IN_SESSION); @@ -47,17 +46,16 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, -ntohl(res->status)); goto out; } - nfsi = NFS_I(inode); rcu_read_lock(); - delegation = rcu_dereference(nfsi->delegation); + delegation = nfs4_get_valid_delegation(inode); if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) goto out_iput; res->size = i_size_read(inode); res->change_attr = delegation->change_attr; if (nfs_have_writebacks(inode)) res->change_attr++; - res->ctime = timespec64_to_timespec(inode->i_ctime); - res->mtime = timespec64_to_timespec(inode->i_mtime); + res->ctime = inode->i_ctime; + res->mtime = inode->i_mtime; res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & args->bitmap[0]; res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) & diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 73a5a5ea2976..03a20f5716c7 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -627,7 +627,7 @@ static __be32 encode_attr_size(struct xdr_stream *xdr, const uint32_t *bitmap, u return 0; } -static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *time) +static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec64 *time) { __be32 *p; @@ -639,14 +639,14 @@ static __be32 encode_attr_time(struct xdr_stream *xdr, const struct timespec *ti return 0; } -static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) +static __be32 encode_attr_ctime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time) { if (!(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) return 0; return encode_attr_time(xdr,time); } -static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec *time) +static __be32 encode_attr_mtime(struct xdr_stream *xdr, const uint32_t *bitmap, const struct timespec64 *time) { if (!(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) return 0; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 30838304a0bf..02110a30a49e 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -312,6 +312,12 @@ again: /* Match nfsv4 minorversion */ if (clp->cl_minorversion != data->minorversion) continue; + + /* Match request for a dedicated DS */ + if (test_bit(NFS_CS_DS, &data->init_flags) != + test_bit(NFS_CS_DS, &clp->cl_flags)) + continue; + /* Match the full socket address */ if (!rpc_cmp_addr_port(sap, clap)) /* Match all xprt_switch full socket addresses */ @@ -515,6 +521,10 @@ int nfs_create_rpc_client(struct nfs_client *clp, args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags)) args.flags |= RPC_CLNT_CREATE_INFINITE_SLOTS; + if (test_bit(NFS_CS_NOPING, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_NOPING; + if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags)) + args.flags |= RPC_CLNT_CREATE_REUSEPORT; if (!IS_ERR(clp->cl_rpcclient)) return 0; @@ -662,6 +672,7 @@ static int nfs_init_server(struct nfs_server *server, .timeparms = &timeparms, .cred = server->cred, .nconnect = data->nfs_server.nconnect, + .init_flags = (1UL << NFS_CS_REUSEPORT), }; struct nfs_client *clp; int error; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index af549d70ec50..fe57b2b5314a 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -199,7 +199,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation != NULL) { spin_lock(&delegation->lock); - if (delegation->inode != NULL) { + if (nfs4_is_valid_delegation(delegation, 0)) { nfs4_stateid_copy(&delegation->stateid, stateid); delegation->type = type; delegation->pagemod_limit = pagemod_limit; @@ -229,7 +229,6 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation * delegation->cred, &delegation->stateid, issync); - nfs_free_delegation(delegation); return res; } @@ -298,7 +297,10 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi, return NULL; spin_lock(&delegation->lock); - set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); + if (!delegation->inode) { + spin_unlock(&delegation->lock); + return NULL; + } list_del_rcu(&delegation->super_list); delegation->inode = NULL; rcu_assign_pointer(nfsi->delegation, NULL); @@ -325,10 +327,12 @@ nfs_inode_detach_delegation(struct inode *inode) struct nfs_server *server = NFS_SERVER(inode); struct nfs_delegation *delegation; - delegation = nfs_start_delegation_return(nfsi); - if (delegation == NULL) - return NULL; - return nfs_detach_delegation(nfsi, delegation, server); + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); + if (delegation != NULL) + delegation = nfs_detach_delegation(nfsi, delegation, server); + rcu_read_unlock(); + return delegation; } static void @@ -339,6 +343,7 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation, delegation->stateid.seqid = update->stateid.seqid; smp_wmb(); delegation->type = update->type; + clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags); } } @@ -379,14 +384,18 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, spin_lock(&clp->cl_lock); old_delegation = rcu_dereference_protected(nfsi->delegation, lockdep_is_held(&clp->cl_lock)); - if (old_delegation != NULL) { - /* Is this an update of the existing delegation? */ - if (nfs4_stateid_match_other(&old_delegation->stateid, - &delegation->stateid)) { - nfs_update_inplace_delegation(old_delegation, - delegation); - goto out; - } + if (old_delegation == NULL) + goto add_new; + /* Is this an update of the existing delegation? */ + if (nfs4_stateid_match_other(&old_delegation->stateid, + &delegation->stateid)) { + spin_lock(&old_delegation->lock); + nfs_update_inplace_delegation(old_delegation, + delegation); + spin_unlock(&old_delegation->lock); + goto out; + } + if (!test_bit(NFS_DELEGATION_REVOKED, &old_delegation->flags)) { /* * Deal with broken servers that hand out two * delegations for the same file. @@ -405,11 +414,11 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, if (test_and_set_bit(NFS_DELEGATION_RETURNING, &old_delegation->flags)) goto out; - freeme = nfs_detach_delegation_locked(nfsi, - old_delegation, clp); - if (freeme == NULL) - goto out; } + freeme = nfs_detach_delegation_locked(nfsi, old_delegation, clp); + if (freeme == NULL) + goto out; +add_new: list_add_tail_rcu(&delegation->super_list, &server->delegations); rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; @@ -424,8 +433,10 @@ out: spin_unlock(&clp->cl_lock); if (delegation != NULL) nfs_free_delegation(delegation); - if (freeme != NULL) + if (freeme != NULL) { nfs_do_return_delegation(inode, freeme, 0); + nfs_free_delegation(freeme); + } return status; } @@ -435,7 +446,6 @@ out: static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - struct nfs_inode *nfsi = NFS_I(inode); int err = 0; if (delegation == NULL) @@ -457,8 +467,6 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation nfs_abort_delegation_return(delegation, clp); goto out; } - if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode))) - goto out; err = nfs_do_return_delegation(inode, delegation, issync); out: @@ -469,8 +477,6 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) { bool ret = false; - if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) - goto out; if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags)) ret = true; if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) { @@ -482,7 +488,10 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) ret = true; spin_unlock(&delegation->lock); } -out: + if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) || + test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) + ret = false; + return ret; } @@ -585,19 +594,23 @@ restart: } /** - * nfs_inode_return_delegation_noreclaim - return delegation, don't reclaim opens + * nfs_inode_evict_delegation - return delegation, don't reclaim opens * @inode: inode to process * * Does not protect against delegation reclaims, therefore really only safe - * to be called from nfs4_clear_inode(). + * to be called from nfs4_clear_inode(). Guaranteed to always free + * the delegation structure. */ -void nfs_inode_return_delegation_noreclaim(struct inode *inode) +void nfs_inode_evict_delegation(struct inode *inode) { struct nfs_delegation *delegation; delegation = nfs_inode_detach_delegation(inode); - if (delegation != NULL) + if (delegation != NULL) { + set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags); nfs_do_return_delegation(inode, delegation, 1); + nfs_free_delegation(delegation); + } } /** @@ -633,10 +646,18 @@ int nfs4_inode_return_delegation(struct inode *inode) */ int nfs4_inode_make_writeable(struct inode *inode) { - if (!nfs4_has_session(NFS_SERVER(inode)->nfs_client) || - !nfs4_check_delegation(inode, FMODE_WRITE)) - return nfs4_inode_return_delegation(inode); - return 0; + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = nfs4_get_valid_delegation(inode); + if (delegation == NULL || + (nfs4_has_session(NFS_SERVER(inode)->nfs_client) && + (delegation->type & FMODE_WRITE))) { + rcu_read_unlock(); + return 0; + } + rcu_read_unlock(); + return nfs4_inode_return_delegation(inode); } static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, @@ -744,10 +765,9 @@ static void nfs_mark_delegation_revoked(struct nfs_server *server, { set_bit(NFS_DELEGATION_REVOKED, &delegation->flags); delegation->stateid.type = NFS4_INVALID_STATEID_TYPE; - nfs_mark_return_delegation(server, delegation); } -static bool nfs_revoke_delegation(struct inode *inode, +static void nfs_revoke_delegation(struct inode *inode, const nfs4_stateid *stateid) { struct nfs_delegation *delegation; @@ -761,29 +781,69 @@ static bool nfs_revoke_delegation(struct inode *inode, if (stateid == NULL) { nfs4_stateid_copy(&tmp, &delegation->stateid); stateid = &tmp; - } else if (!nfs4_stateid_match(stateid, &delegation->stateid)) - goto out; + } else { + if (!nfs4_stateid_match_other(stateid, &delegation->stateid)) + goto out; + spin_lock(&delegation->lock); + if (stateid->seqid) { + if (nfs4_stateid_is_newer(&delegation->stateid, stateid)) { + spin_unlock(&delegation->lock); + goto out; + } + delegation->stateid.seqid = stateid->seqid; + } + spin_unlock(&delegation->lock); + } nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation); ret = true; out: rcu_read_unlock(); if (ret) nfs_inode_find_state_and_recover(inode, stateid); - return ret; } void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid) { + nfs_revoke_delegation(inode, stateid); +} +EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); + +void nfs_delegation_mark_returned(struct inode *inode, + const nfs4_stateid *stateid) +{ struct nfs_delegation *delegation; - if (!nfs_revoke_delegation(inode, stateid)) + if (!inode) return; - delegation = nfs_inode_detach_delegation(inode); - if (delegation) - nfs_free_delegation(delegation); + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (!delegation) + goto out_rcu_unlock; + + spin_lock(&delegation->lock); + if (!nfs4_stateid_match_other(stateid, &delegation->stateid)) + goto out_spin_unlock; + if (stateid->seqid) { + /* If delegation->stateid is newer, dont mark as returned */ + if (nfs4_stateid_is_newer(&delegation->stateid, stateid)) + goto out_clear_returning; + if (delegation->stateid.seqid != stateid->seqid) + delegation->stateid.seqid = stateid->seqid; + } + + nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation); + +out_clear_returning: + clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); +out_spin_unlock: + spin_unlock(&delegation->lock); +out_rcu_unlock: + rcu_read_unlock(); + + nfs_inode_find_state_and_recover(inode, stateid); } -EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); /** * nfs_expire_unused_delegation_types @@ -840,7 +900,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation; rcu_read_lock(); - delegation = rcu_dereference(NFS_I(inode)->delegation); + delegation = nfs4_get_valid_delegation(inode); if (delegation == NULL) goto out_enoent; if (stateid != NULL && @@ -866,6 +926,7 @@ nfs_delegation_find_inode_server(struct nfs_server *server, list_for_each_entry_rcu(delegation, &server->delegations, super_list) { spin_lock(&delegation->lock); if (delegation->inode != NULL && + !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) && nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) { freeme = igrab(delegation->inode); if (freeme && nfs_sb_active(freeme->i_sb)) @@ -1140,7 +1201,8 @@ void nfs_inode_find_delegation_state_and_recover(struct inode *inode, rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation && - nfs4_stateid_match_other(&delegation->stateid, stateid)) { + nfs4_stateid_match_or_older(&delegation->stateid, stateid) && + !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { nfs_mark_test_expired_delegation(NFS_SERVER(inode), delegation); found = true; } @@ -1189,7 +1251,9 @@ bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode) rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); if (delegation != NULL && - nfs4_stateid_match_other(dst, &delegation->stateid)) { + nfs4_stateid_match_other(dst, &delegation->stateid) && + nfs4_stateid_is_newer(&delegation->stateid, dst) && + !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) { dst->seqid = delegation->stateid.seqid; ret = true; } diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 8b14d441e699..15d3484be028 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -43,7 +43,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred, fmode_t type, const nfs4_stateid *stateid, unsigned long pagemod_limit); int nfs4_inode_return_delegation(struct inode *inode); int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); -void nfs_inode_return_delegation_noreclaim(struct inode *inode); +void nfs_inode_evict_delegation(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); void nfs_server_return_all_delegations(struct nfs_server *); @@ -53,6 +53,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp); int nfs_client_return_marked_delegations(struct nfs_client *clp); int nfs_delegations_present(struct nfs_client *clp); void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid); +void nfs_delegation_mark_returned(struct inode *inode, const nfs4_stateid *stateid); void nfs_delegation_mark_reclaim(struct nfs_client *clp); void nfs_delegation_reap_unclaimed(struct nfs_client *clp); diff --git a/fs/nfs/export.c b/fs/nfs/export.c index deecb67638aa..3430d6891e89 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -105,6 +105,7 @@ nfs_fh_to_dentry(struct super_block *sb, struct fid *fid, ret = rpc_ops->getattr(NFS_SB(sb), server_fh, fattr, label, NULL); if (ret) { dprintk("%s: getattr failed %d\n", __func__, ret); + trace_nfs_fh_to_dentry(sb, server_fh, fattr->fileid, ret); dentry = ERR_PTR(ret); goto out_free_label; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 95dc90570786..8eb731d9be3e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -649,7 +649,7 @@ out: out_swapfile: printk(KERN_INFO "NFS: attempt to write to active swap file!\n"); - return -EBUSY; + return -ETXTBSY; } EXPORT_SYMBOL_GPL(nfs_file_write); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2a03bfeec10a..b0b4b9f303fd 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -504,15 +504,15 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = timespec_to_timespec64(fattr->atime); + inode->i_atime = fattr->atime; else if (nfs_server_capable(inode, NFS_CAP_ATIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = timespec_to_timespec64(fattr->mtime); + inode->i_mtime = fattr->mtime; else if (nfs_server_capable(inode, NFS_CAP_MTIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = timespec_to_timespec64(fattr->ctime); + inode->i_ctime = fattr->ctime; else if (nfs_server_capable(inode, NFS_CAP_CTIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) @@ -698,7 +698,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = timespec_to_timespec64(fattr->ctime); + inode->i_ctime = fattr->ctime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -709,14 +709,14 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = timespec_to_timespec64(fattr->atime); + inode->i_atime = fattr->atime; else if (attr->ia_valid & ATTR_ATIME_SET) inode->i_atime = attr->ia_atime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = timespec_to_timespec64(fattr->ctime); + inode->i_ctime = fattr->ctime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -725,14 +725,14 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = timespec_to_timespec64(fattr->mtime); + inode->i_mtime = fattr->mtime; else if (attr->ia_valid & ATTR_MTIME_SET) inode->i_mtime = attr->ia_mtime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = timespec_to_timespec64(fattr->ctime); + inode->i_ctime = fattr->ctime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -1351,7 +1351,7 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct timespec ts; + struct timespec64 ts; if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) @@ -1361,18 +1361,18 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); } /* If we have atomic WCC data, we may update some attributes */ - ts = timespec64_to_timespec(inode->i_ctime); + ts = inode->i_ctime; if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) - && timespec_equal(&ts, &fattr->pre_ctime)) { - inode->i_ctime = timespec_to_timespec64(fattr->ctime); + && timespec64_equal(&ts, &fattr->pre_ctime)) { + inode->i_ctime = fattr->ctime; } - ts = timespec64_to_timespec(inode->i_mtime); + ts = inode->i_mtime; if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) && (fattr->valid & NFS_ATTR_FATTR_MTIME) - && timespec_equal(&ts, &fattr->pre_mtime)) { - inode->i_mtime = timespec_to_timespec64(fattr->mtime); + && timespec64_equal(&ts, &fattr->pre_mtime)) { + inode->i_mtime = fattr->mtime; if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); } @@ -1398,7 +1398,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_size, new_isize; unsigned long invalid = 0; - struct timespec ts; + struct timespec64 ts; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) return 0; @@ -1425,12 +1425,12 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat invalid |= NFS_INO_INVALID_CHANGE | NFS_INO_REVAL_PAGECACHE; - ts = timespec64_to_timespec(inode->i_mtime); - if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&ts, &fattr->mtime)) + ts = inode->i_mtime; + if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime)) invalid |= NFS_INO_INVALID_MTIME; - ts = timespec64_to_timespec(inode->i_ctime); - if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&ts, &fattr->ctime)) + ts = inode->i_ctime; + if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec64_equal(&ts, &fattr->ctime)) invalid |= NFS_INO_INVALID_CTIME; if (fattr->valid & NFS_ATTR_FATTR_SIZE) { @@ -1460,8 +1460,8 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_OTHER; - ts = timespec64_to_timespec(inode->i_atime); - if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&ts, &fattr->atime)) + ts = inode->i_atime; + if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) @@ -1733,12 +1733,12 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { - fattr->pre_ctime = timespec64_to_timespec(inode->i_ctime); + fattr->pre_ctime = inode->i_ctime; fattr->valid |= NFS_ATTR_FATTR_PRECTIME; } if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { - fattr->pre_mtime = timespec64_to_timespec(inode->i_mtime); + fattr->pre_mtime = inode->i_mtime; fattr->valid |= NFS_ATTR_FATTR_PREMTIME; } if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && @@ -1899,7 +1899,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } if (fattr->valid & NFS_ATTR_FATTR_MTIME) { - inode->i_mtime = timespec_to_timespec64(fattr->mtime); + inode->i_mtime = fattr->mtime; } else if (server->caps & NFS_CAP_MTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_MTIME @@ -1908,7 +1908,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } if (fattr->valid & NFS_ATTR_FATTR_CTIME) { - inode->i_ctime = timespec_to_timespec64(fattr->ctime); + inode->i_ctime = fattr->ctime; } else if (server->caps & NFS_CAP_CTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_CTIME @@ -1946,7 +1946,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = timespec_to_timespec64(fattr->atime); + inode->i_atime = fattr->atime; else if (server->caps & NFS_CAP_ATIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATIME diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 447a3c17fa8e..24a65da58aa9 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -713,7 +713,7 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) * 1024*1024*1024. */ static inline -u64 nfs_timespec_to_change_attr(const struct timespec *ts) +u64 nfs_timespec_to_change_attr(const struct timespec64 *ts) { return ((u64)ts->tv_sec << 30) + ts->tv_nsec; } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 9287eb666322..5e0e9d29f5c5 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -157,6 +157,9 @@ struct vfsmount *nfs_d_automount(struct path *path) if (IS_ERR(mnt)) goto out; + if (nfs_mountpoint_expiry_timeout < 0) + goto out; + mntget(mnt); /* prevent immediate expiration */ mnt_set_expiry(mnt, &nfs_automount_list); schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index cbc17a203248..d94c7abdf25a 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -209,9 +209,9 @@ static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) * unsigned int useconds; * }; */ -static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep) +static __be32 *xdr_encode_time(__be32 *p, const struct timespec64 *timep) { - *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32((u32)timep->tv_sec); if (timep->tv_nsec != 0) *p++ = cpu_to_be32(timep->tv_nsec / NSEC_PER_USEC); else @@ -227,14 +227,14 @@ static __be32 *xdr_encode_time(__be32 *p, const struct timespec *timep) * Illustrated" by Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5. */ static __be32 *xdr_encode_current_server_time(__be32 *p, - const struct timespec *timep) + const struct timespec64 *timep) { *p++ = cpu_to_be32(timep->tv_sec); *p++ = cpu_to_be32(1000000); return p; } -static __be32 *xdr_decode_time(__be32 *p, struct timespec *timep) +static __be32 *xdr_decode_time(__be32 *p, struct timespec64 *timep) { timep->tv_sec = be32_to_cpup(p++); timep->tv_nsec = be32_to_cpup(p++) * NSEC_PER_USEC; @@ -339,7 +339,6 @@ static __be32 *xdr_time_not_set(__be32 *p) static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr, struct user_namespace *userns) { - struct timespec ts; __be32 *p; p = xdr_reserve_space(xdr, NFS_sattr_sz << 2); @@ -362,19 +361,15 @@ static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr, *p++ = cpu_to_be32(NFS2_SATTR_NOT_SET); if (attr->ia_valid & ATTR_ATIME_SET) { - ts = timespec64_to_timespec(attr->ia_atime); - p = xdr_encode_time(p, &ts); + p = xdr_encode_time(p, &attr->ia_atime); } else if (attr->ia_valid & ATTR_ATIME) { - ts = timespec64_to_timespec(attr->ia_atime); - p = xdr_encode_current_server_time(p, &ts); + p = xdr_encode_current_server_time(p, &attr->ia_atime); } else p = xdr_time_not_set(p); if (attr->ia_valid & ATTR_MTIME_SET) { - ts = timespec64_to_timespec(attr->ia_atime); - xdr_encode_time(p, &ts); + xdr_encode_time(p, &attr->ia_mtime); } else if (attr->ia_valid & ATTR_MTIME) { - ts = timespec64_to_timespec(attr->ia_mtime); - xdr_encode_current_server_time(p, &ts); + xdr_encode_current_server_time(p, &attr->ia_mtime); } else xdr_time_not_set(p); } diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 148ceb74d27c..223904bc40a7 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -106,7 +106,10 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, cl_init.nconnect = mds_clp->cl_nconnect; if (mds_srv->flags & NFS_MOUNT_NORESVPORT) - set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + + __set_bit(NFS_CS_NOPING, &cl_init.init_flags); + __set_bit(NFS_CS_DS, &cl_init.init_flags); /* Use the MDS nfs_client cl_ipaddr. */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 602767850b36..927eb680f161 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -456,14 +456,14 @@ static void zero_nfs_fh3(struct nfs_fh *fh) * uint32 nseconds; * }; */ -static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec *timep) +static __be32 *xdr_encode_nfstime3(__be32 *p, const struct timespec64 *timep) { - *p++ = cpu_to_be32(timep->tv_sec); + *p++ = cpu_to_be32((u32)timep->tv_sec); *p++ = cpu_to_be32(timep->tv_nsec); return p; } -static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) +static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec64 *timep) { timep->tv_sec = be32_to_cpup(p++); timep->tv_nsec = be32_to_cpup(p++); @@ -533,7 +533,6 @@ static __be32 *xdr_decode_nfstime3(__be32 *p, struct timespec *timep) static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr, struct user_namespace *userns) { - struct timespec ts; u32 nbytes; __be32 *p; @@ -583,10 +582,8 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr, *p++ = xdr_zero; if (attr->ia_valid & ATTR_ATIME_SET) { - struct timespec ts; *p++ = xdr_two; - ts = timespec64_to_timespec(attr->ia_atime); - p = xdr_encode_nfstime3(p, &ts); + p = xdr_encode_nfstime3(p, &attr->ia_atime); } else if (attr->ia_valid & ATTR_ATIME) { *p++ = xdr_one; } else @@ -594,8 +591,7 @@ static void encode_sattr3(struct xdr_stream *xdr, const struct iattr *attr, if (attr->ia_valid & ATTR_MTIME_SET) { *p++ = xdr_two; - ts = timespec64_to_timespec(attr->ia_mtime); - xdr_encode_nfstime3(p, &ts); + xdr_encode_nfstime3(p, &attr->ia_mtime); } else if (attr->ia_valid & ATTR_MTIME) { *p = xdr_one; } else diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 901cca7542f9..c891af949886 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -13,8 +13,10 @@ #define PNFS_LAYOUTSTATS_MAXDEV (4) /* nfs4.2proc.c */ +#ifdef CONFIG_NFS_V4_2 int nfs42_proc_allocate(struct file *, loff_t, loff_t); -ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t); +ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t, + struct nl4_server *, nfs4_stateid *, bool); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, @@ -23,5 +25,16 @@ int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t); int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg, const struct nfs42_layout_error *errors, size_t n); +int nfs42_proc_copy_notify(struct file *, struct file *, + struct nfs42_copy_notify_res *); +static inline bool nfs42_files_from_same_server(struct file *in, + struct file *out) +{ + struct nfs_client *c_in = (NFS_SERVER(file_inode(in)))->nfs_client; + struct nfs_client *c_out = (NFS_SERVER(file_inode(out)))->nfs_client; + return nfs4_check_serverowner_major_id(c_in->cl_serverowner, + c_out->cl_serverowner); +} +#endif /* CONFIG_NFS_V4_2 */ #endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 5196bfa7894d..1fe83e0f663e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -3,6 +3,7 @@ * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com> */ #include <linux/fs.h> +#include <linux/sunrpc/addr.h> #include <linux/sunrpc/sched.h> #include <linux/nfs.h> #include <linux/nfs3.h> @@ -15,10 +16,30 @@ #include "pnfs.h" #include "nfs4session.h" #include "internal.h" +#include "delegation.h" #define NFSDBG_FACILITY NFSDBG_PROC static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *std); +static void nfs42_set_netaddr(struct file *filep, struct nfs42_netaddr *naddr) +{ + struct nfs_client *clp = (NFS_SERVER(file_inode(filep)))->nfs_client; + unsigned short port = 2049; + + rcu_read_lock(); + naddr->netid_len = scnprintf(naddr->netid, + sizeof(naddr->netid), "%s", + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_NETID)); + naddr->addr_len = scnprintf(naddr->addr, + sizeof(naddr->addr), + "%s.%u.%u", + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR), + port >> 8, port & 255); + rcu_read_unlock(); +} + static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, struct nfs_lock_context *lock, loff_t offset, loff_t len) { @@ -28,7 +49,7 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, .falloc_fh = NFS_FH(inode), .falloc_offset = offset, .falloc_length = len, - .falloc_bitmask = server->cache_consistency_bitmask, + .falloc_bitmask = nfs4_fattr_bitmap, }; struct nfs42_falloc_res res = { .falloc_server = server, @@ -132,22 +153,26 @@ out_unlock: } static int handle_async_copy(struct nfs42_copy_res *res, - struct nfs_server *server, + struct nfs_server *dst_server, + struct nfs_server *src_server, struct file *src, struct file *dst, - nfs4_stateid *src_stateid) + nfs4_stateid *src_stateid, + bool *restart) { struct nfs4_copy_state *copy, *tmp_copy; int status = NFS4_OK; bool found_pending = false; - struct nfs_open_context *ctx = nfs_file_open_context(dst); + struct nfs_open_context *dst_ctx = nfs_file_open_context(dst); + struct nfs_open_context *src_ctx = nfs_file_open_context(src); copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_NOFS); if (!copy) return -ENOMEM; - spin_lock(&server->nfs_client->cl_lock); - list_for_each_entry(tmp_copy, &server->nfs_client->pending_cb_stateids, + spin_lock(&dst_server->nfs_client->cl_lock); + list_for_each_entry(tmp_copy, + &dst_server->nfs_client->pending_cb_stateids, copies) { if (memcmp(&res->write_res.stateid, &tmp_copy->stateid, NFS4_STATEID_SIZE)) @@ -157,7 +182,7 @@ static int handle_async_copy(struct nfs42_copy_res *res, break; } if (found_pending) { - spin_unlock(&server->nfs_client->cl_lock); + spin_unlock(&dst_server->nfs_client->cl_lock); kfree(copy); copy = tmp_copy; goto out; @@ -165,19 +190,32 @@ static int handle_async_copy(struct nfs42_copy_res *res, memcpy(©->stateid, &res->write_res.stateid, NFS4_STATEID_SIZE); init_completion(©->completion); - copy->parent_state = ctx->state; + copy->parent_dst_state = dst_ctx->state; + copy->parent_src_state = src_ctx->state; - list_add_tail(©->copies, &server->ss_copies); - spin_unlock(&server->nfs_client->cl_lock); + list_add_tail(©->copies, &dst_server->ss_copies); + spin_unlock(&dst_server->nfs_client->cl_lock); + + if (dst_server != src_server) { + spin_lock(&src_server->nfs_client->cl_lock); + list_add_tail(©->src_copies, &src_server->ss_copies); + spin_unlock(&src_server->nfs_client->cl_lock); + } status = wait_for_completion_interruptible(©->completion); - spin_lock(&server->nfs_client->cl_lock); + spin_lock(&dst_server->nfs_client->cl_lock); list_del_init(©->copies); - spin_unlock(&server->nfs_client->cl_lock); + spin_unlock(&dst_server->nfs_client->cl_lock); + if (dst_server != src_server) { + spin_lock(&src_server->nfs_client->cl_lock); + list_del_init(©->src_copies); + spin_unlock(&src_server->nfs_client->cl_lock); + } if (status == -ERESTARTSYS) { goto out_cancel; - } else if (copy->flags) { + } else if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) { status = -EAGAIN; + *restart = true; goto out_cancel; } out: @@ -185,12 +223,14 @@ out: memcpy(&res->write_res.verifier, ©->verf, sizeof(copy->verf)); status = -copy->error; +out_free: kfree(copy); return status; out_cancel: nfs42_do_offload_cancel_async(dst, ©->stateid); - kfree(copy); - return status; + if (!nfs42_files_from_same_server(src, dst)) + nfs42_do_offload_cancel_async(src, src_stateid); + goto out_free; } static int process_copy_commit(struct file *dst, loff_t pos_dst, @@ -222,7 +262,10 @@ static ssize_t _nfs42_proc_copy(struct file *src, struct file *dst, struct nfs_lock_context *dst_lock, struct nfs42_copy_args *args, - struct nfs42_copy_res *res) + struct nfs42_copy_res *res, + struct nl4_server *nss, + nfs4_stateid *cnr_stateid, + bool *restart) { struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY], @@ -230,17 +273,23 @@ static ssize_t _nfs42_proc_copy(struct file *src, .rpc_resp = res, }; struct inode *dst_inode = file_inode(dst); - struct nfs_server *server = NFS_SERVER(dst_inode); + struct inode *src_inode = file_inode(src); + struct nfs_server *dst_server = NFS_SERVER(dst_inode); + struct nfs_server *src_server = NFS_SERVER(src_inode); loff_t pos_src = args->src_pos; loff_t pos_dst = args->dst_pos; size_t count = args->count; ssize_t status; - status = nfs4_set_rw_stateid(&args->src_stateid, src_lock->open_context, - src_lock, FMODE_READ); - if (status) - return status; - + if (nss) { + args->cp_src = nss; + nfs4_stateid_copy(&args->src_stateid, cnr_stateid); + } else { + status = nfs4_set_rw_stateid(&args->src_stateid, + src_lock->open_context, src_lock, FMODE_READ); + if (status) + return status; + } status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping, pos_src, pos_src + (loff_t)count - 1); if (status) @@ -262,13 +311,15 @@ static ssize_t _nfs42_proc_copy(struct file *src, if (!res->commit_res.verf) return -ENOMEM; } + set_bit(NFS_CLNT_SRC_SSC_COPY_STATE, + &src_lock->open_context->state->flags); set_bit(NFS_CLNT_DST_SSC_COPY_STATE, &dst_lock->open_context->state->flags); - status = nfs4_call_sync(server->client, server, &msg, + status = nfs4_call_sync(dst_server->client, dst_server, &msg, &args->seq_args, &res->seq_res, 0); if (status == -ENOTSUPP) - server->caps &= ~NFS_CAP_COPY; + dst_server->caps &= ~NFS_CAP_COPY; if (status) goto out; @@ -280,8 +331,8 @@ static ssize_t _nfs42_proc_copy(struct file *src, } if (!res->synchronous) { - status = handle_async_copy(res, server, src, dst, - &args->src_stateid); + status = handle_async_copy(res, dst_server, src_server, src, + dst, &args->src_stateid, restart); if (status) return status; } @@ -304,8 +355,9 @@ out: } ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, - struct file *dst, loff_t pos_dst, - size_t count) + struct file *dst, loff_t pos_dst, size_t count, + struct nl4_server *nss, + nfs4_stateid *cnr_stateid, bool sync) { struct nfs_server *server = NFS_SERVER(file_inode(dst)); struct nfs_lock_context *src_lock; @@ -316,7 +368,7 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, .dst_fh = NFS_FH(file_inode(dst)), .dst_pos = pos_dst, .count = count, - .sync = false, + .sync = sync, }; struct nfs42_copy_res res; struct nfs4_exception src_exception = { @@ -328,6 +380,7 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, .stateid = &args.dst_stateid, }; ssize_t err, err2; + bool restart = false; src_lock = nfs_get_lock_context(nfs_file_open_context(src)); if (IS_ERR(src_lock)) @@ -347,21 +400,33 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, inode_lock(file_inode(dst)); err = _nfs42_proc_copy(src, src_lock, dst, dst_lock, - &args, &res); + &args, &res, + nss, cnr_stateid, &restart); inode_unlock(file_inode(dst)); if (err >= 0) break; - if (err == -ENOTSUPP) { + if (err == -ENOTSUPP && + nfs42_files_from_same_server(src, dst)) { err = -EOPNOTSUPP; break; } else if (err == -EAGAIN) { - dst_exception.retry = 1; - continue; + if (!restart) { + dst_exception.retry = 1; + continue; + } + break; } else if (err == -NFS4ERR_OFFLOAD_NO_REQS && !args.sync) { args.sync = true; dst_exception.retry = 1; continue; + } else if ((err == -ESTALE || + err == -NFS4ERR_OFFLOAD_DENIED || + err == -ENOTSUPP) && + !nfs42_files_from_same_server(src, dst)) { + nfs42_do_offload_cancel_async(src, &args.src_stateid); + err = -EOPNOTSUPP; + break; } err2 = nfs4_handle_exception(server, err, &src_exception); @@ -459,6 +524,76 @@ static int nfs42_do_offload_cancel_async(struct file *dst, return status; } +static int _nfs42_proc_copy_notify(struct file *src, struct file *dst, + struct nfs42_copy_notify_args *args, + struct nfs42_copy_notify_res *res) +{ + struct nfs_server *src_server = NFS_SERVER(file_inode(src)); + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COPY_NOTIFY], + .rpc_argp = args, + .rpc_resp = res, + }; + int status; + struct nfs_open_context *ctx; + struct nfs_lock_context *l_ctx; + + ctx = get_nfs_open_context(nfs_file_open_context(src)); + l_ctx = nfs_get_lock_context(ctx); + if (IS_ERR(l_ctx)) + return PTR_ERR(l_ctx); + + status = nfs4_set_rw_stateid(&args->cna_src_stateid, ctx, l_ctx, + FMODE_READ); + nfs_put_lock_context(l_ctx); + if (status) + return status; + + status = nfs4_call_sync(src_server->client, src_server, &msg, + &args->cna_seq_args, &res->cnr_seq_res, 0); + if (status == -ENOTSUPP) + src_server->caps &= ~NFS_CAP_COPY_NOTIFY; + + put_nfs_open_context(nfs_file_open_context(src)); + return status; +} + +int nfs42_proc_copy_notify(struct file *src, struct file *dst, + struct nfs42_copy_notify_res *res) +{ + struct nfs_server *src_server = NFS_SERVER(file_inode(src)); + struct nfs42_copy_notify_args *args; + struct nfs4_exception exception = { + .inode = file_inode(src), + }; + int status; + + if (!(src_server->caps & NFS_CAP_COPY_NOTIFY)) + return -EOPNOTSUPP; + + args = kzalloc(sizeof(struct nfs42_copy_notify_args), GFP_NOFS); + if (args == NULL) + return -ENOMEM; + + args->cna_src_fh = NFS_FH(file_inode(src)), + args->cna_dst.nl4_type = NL4_NETADDR; + nfs42_set_netaddr(dst, &args->cna_dst.u.nl4_addr); + exception.stateid = &args->cna_src_stateid; + + do { + status = _nfs42_proc_copy_notify(src, dst, args, res); + if (status == -ENOTSUPP) { + status = -EOPNOTSUPP; + goto out; + } + status = nfs4_handle_exception(src_server, status, &exception); + } while (exception.retry); + +out: + kfree(args); + return status; +} + static loff_t _nfs42_proc_llseek(struct file *filep, struct nfs_lock_context *lock, loff_t offset, int whence) { diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index aed865a84629..c03f3246d6c5 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -21,7 +21,10 @@ #define encode_copy_maxsz (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_STATEID_SIZE) + \ XDR_QUADLEN(NFS4_STATEID_SIZE) + \ - 2 + 2 + 2 + 1 + 1 + 1) + 2 + 2 + 2 + 1 + 1 + 1 +\ + 1 + /* One cnr_source_server */\ + 1 + /* nl4_type */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT)) #define decode_copy_maxsz (op_decode_hdr_maxsz + \ NFS42_WRITE_RES_SIZE + \ 1 /* cr_consecutive */ + \ @@ -29,6 +32,16 @@ #define encode_offload_cancel_maxsz (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_STATEID_SIZE)) #define decode_offload_cancel_maxsz (op_decode_hdr_maxsz) +#define encode_copy_notify_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 1 + /* nl4_type */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT)) +#define decode_copy_notify_maxsz (op_decode_hdr_maxsz + \ + 3 + /* cnr_lease_time */\ + XDR_QUADLEN(NFS4_STATEID_SIZE) + \ + 1 + /* Support 1 cnr_source_server */\ + 1 + /* nl4_type */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT)) #define encode_deallocate_maxsz (op_encode_hdr_maxsz + \ encode_fallocate_maxsz) #define decode_deallocate_maxsz (op_decode_hdr_maxsz) @@ -99,6 +112,12 @@ decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_offload_cancel_maxsz) +#define NFS4_enc_copy_notify_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_copy_notify_maxsz) +#define NFS4_dec_copy_notify_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_copy_notify_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -166,6 +185,26 @@ static void encode_allocate(struct xdr_stream *xdr, encode_fallocate(xdr, args); } +static void encode_nl4_server(struct xdr_stream *xdr, + const struct nl4_server *ns) +{ + encode_uint32(xdr, ns->nl4_type); + switch (ns->nl4_type) { + case NL4_NAME: + case NL4_URL: + encode_string(xdr, ns->u.nl4_str_sz, ns->u.nl4_str); + break; + case NL4_NETADDR: + encode_string(xdr, ns->u.nl4_addr.netid_len, + ns->u.nl4_addr.netid); + encode_string(xdr, ns->u.nl4_addr.addr_len, + ns->u.nl4_addr.addr); + break; + default: + WARN_ON_ONCE(1); + } +} + static void encode_copy(struct xdr_stream *xdr, const struct nfs42_copy_args *args, struct compound_hdr *hdr) @@ -180,7 +219,12 @@ static void encode_copy(struct xdr_stream *xdr, encode_uint32(xdr, 1); /* consecutive = true */ encode_uint32(xdr, args->sync); - encode_uint32(xdr, 0); /* src server list */ + if (args->cp_src == NULL) { /* intra-ssc */ + encode_uint32(xdr, 0); /* no src server list */ + return; + } + encode_uint32(xdr, 1); /* supporting 1 server */ + encode_nl4_server(xdr, args->cp_src); } static void encode_offload_cancel(struct xdr_stream *xdr, @@ -191,6 +235,15 @@ static void encode_offload_cancel(struct xdr_stream *xdr, encode_nfs4_stateid(xdr, &args->osa_stateid); } +static void encode_copy_notify(struct xdr_stream *xdr, + const struct nfs42_copy_notify_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_COPY_NOTIFY, decode_copy_notify_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->cna_src_stateid); + encode_nl4_server(xdr, &args->cna_dst); +} + static void encode_deallocate(struct xdr_stream *xdr, const struct nfs42_falloc_args *args, struct compound_hdr *hdr) @@ -355,6 +408,25 @@ static void nfs4_xdr_enc_offload_cancel(struct rpc_rqst *req, } /* + * Encode COPY_NOTIFY request + */ +static void nfs4_xdr_enc_copy_notify(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_copy_notify_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->cna_seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->cna_seq_args, &hdr); + encode_putfh(xdr, args->cna_src_fh, &hdr); + encode_copy_notify(xdr, args, &hdr); + encode_nops(&hdr); +} + +/* * Encode DEALLOCATE request */ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, @@ -490,6 +562,58 @@ static int decode_write_response(struct xdr_stream *xdr, return decode_verifier(xdr, &res->verifier.verifier); } +static int decode_nl4_server(struct xdr_stream *xdr, struct nl4_server *ns) +{ + struct nfs42_netaddr *naddr; + uint32_t dummy; + char *dummy_str; + __be32 *p; + int status; + + /* nl_type */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + ns->nl4_type = be32_to_cpup(p); + switch (ns->nl4_type) { + case NL4_NAME: + case NL4_URL: + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > NFS4_OPAQUE_LIMIT)) + return -EIO; + memcpy(&ns->u.nl4_str, dummy_str, dummy); + ns->u.nl4_str_sz = dummy; + break; + case NL4_NETADDR: + naddr = &ns->u.nl4_addr; + + /* netid string */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > RPCBIND_MAXNETIDLEN)) + return -EIO; + naddr->netid_len = dummy; + memcpy(naddr->netid, dummy_str, naddr->netid_len); + + /* uaddr string */ + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; + if (unlikely(dummy > RPCBIND_MAXUADDRLEN)) + return -EIO; + naddr->addr_len = dummy; + memcpy(naddr->addr, dummy_str, naddr->addr_len); + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + return 0; +} + static int decode_copy_requirements(struct xdr_stream *xdr, struct nfs42_copy_res *res) { __be32 *p; @@ -529,6 +653,42 @@ static int decode_offload_cancel(struct xdr_stream *xdr, return decode_op_hdr(xdr, OP_OFFLOAD_CANCEL); } +static int decode_copy_notify(struct xdr_stream *xdr, + struct nfs42_copy_notify_res *res) +{ + __be32 *p; + int status, count; + + status = decode_op_hdr(xdr, OP_COPY_NOTIFY); + if (status) + return status; + /* cnr_lease_time */ + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + return -EIO; + p = xdr_decode_hyper(p, &res->cnr_lease_time.seconds); + res->cnr_lease_time.nseconds = be32_to_cpup(p); + + status = decode_opaque_fixed(xdr, &res->cnr_stateid, NFS4_STATEID_SIZE); + if (unlikely(status)) + return -EIO; + + /* number of source addresses */ + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + return -EIO; + + count = be32_to_cpup(p); + if (count > 1) + pr_warn("NFS: %s: nsvr %d > Supported. Use first servers\n", + __func__, count); + + status = decode_nl4_server(xdr, &res->cnr_src); + if (unlikely(status)) + return -EIO; + return 0; +} + static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_DEALLOCATE); @@ -657,6 +817,32 @@ out: } /* + * Decode COPY_NOTIFY response + */ +static int nfs4_xdr_dec_copy_notify(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_copy_notify_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->cnr_seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_copy_notify(xdr, res); + +out: + return status; +} + +/* * Decode DEALLOCATE request */ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 16b2e5cc3e94..a7a73b1d1fec 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -166,9 +166,9 @@ enum { NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */ NFS_STATE_CHANGE_WAIT, /* A state changing operation is outstanding */ -#ifdef CONFIG_NFS_V4_2 NFS_CLNT_DST_SSC_COPY_STATE, /* dst server open state on client*/ -#endif /* CONFIG_NFS_V4_2 */ + NFS_CLNT_SRC_SSC_COPY_STATE, /* src server open state on client*/ + NFS_SRV_SSC_COPY_STATE, /* ssc state on the dst server */ }; struct nfs4_state { @@ -311,6 +311,13 @@ extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, fmode_t fmode); +extern int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, struct nfs4_label *label, + struct inode *inode); +extern int update_open_stateid(struct nfs4_state *state, + const nfs4_stateid *open_stateid, + const nfs4_stateid *deleg_stateid, + fmode_t fmode); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); @@ -445,6 +452,8 @@ extern void nfs4_set_lease_period(struct nfs_client *clp, /* nfs4state.c */ +extern const nfs4_stateid current_stateid; + const struct cred *nfs4_get_clid_cred(struct nfs_client *clp); const struct cred *nfs4_get_machine_cred(struct nfs_client *clp); const struct cred *nfs4_get_renew_cred(struct nfs_client *clp); @@ -457,6 +466,8 @@ int nfs41_discover_server_trunking(struct nfs_client *clp, struct nfs_client **, const struct cred *); extern void nfs4_schedule_session_recovery(struct nfs4_session *, int); extern void nfs41_notify_server(struct nfs_client *); +bool nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1, + struct nfs41_server_owner *o2); #else static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) { @@ -572,6 +583,12 @@ static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stat return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; } +static inline bool nfs4_stateid_match_or_older(const nfs4_stateid *dst, const nfs4_stateid *src) +{ + return nfs4_stateid_match_other(dst, src) && + !(src->seqid && nfs4_stateid_is_newer(dst, src)); +} + static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1) { u32 seqid = be32_to_cpu(s1->seqid); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index da6204025a2d..460d6251c405 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -629,7 +629,7 @@ out: /* * Returns true if the server major ids match */ -static bool +bool nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1, struct nfs41_server_owner *o2) { @@ -879,14 +879,17 @@ static int nfs4_set_client(struct nfs_server *server, }; struct nfs_client *clp; - if (minorversion > 0 && proto == XPRT_TRANSPORT_TCP) + if (minorversion == 0) + __set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags); + else if (proto == XPRT_TRANSPORT_TCP) cl_init.nconnect = nconnect; + if (server->flags & NFS_MOUNT_NORESVPORT) - set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); + __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); if (server->options & NFS_OPTION_MIGRATION) - set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); + __set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status)) - set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); + __set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags); server->port = rpc_get_port(addr); /* Allocate or find a client reference we can use */ diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 339663d04bf8..620de905cba9 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -133,14 +133,55 @@ static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, size_t count, unsigned int flags) { + struct nfs42_copy_notify_res *cn_resp = NULL; + struct nl4_server *nss = NULL; + nfs4_stateid *cnrs = NULL; + ssize_t ret; + bool sync = false; + /* Only offload copy if superblock is the same */ - if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) + if (file_in->f_op != &nfs4_file_operations) return -EXDEV; if (!nfs_server_capable(file_inode(file_out), NFS_CAP_COPY)) return -EOPNOTSUPP; if (file_inode(file_in) == file_inode(file_out)) return -EOPNOTSUPP; - return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count); + /* if the copy size if smaller than 2 RPC payloads, make it + * synchronous + */ + if (count <= 2 * NFS_SERVER(file_inode(file_in))->rsize) + sync = true; +retry: + if (!nfs42_files_from_same_server(file_in, file_out)) { + /* for inter copy, if copy size if smaller than 12 RPC + * payloads, fallback to traditional copy. There are + * 14 RPCs during an NFSv4.x mount between source/dest + * servers. + */ + if (sync || + count <= 14 * NFS_SERVER(file_inode(file_in))->rsize) + return -EOPNOTSUPP; + cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res), + GFP_NOFS); + if (unlikely(cn_resp == NULL)) + return -ENOMEM; + + ret = nfs42_proc_copy_notify(file_in, file_out, cn_resp); + if (ret) { + ret = -EOPNOTSUPP; + goto out; + } + nss = &cn_resp->cnr_src; + cnrs = &cn_resp->cnr_stateid; + } + ret = nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count, + nss, cnrs, sync); +out: + if (!nfs42_files_from_same_server(file_in, file_out)) + kfree(cn_resp); + if (ret == -EAGAIN) + goto retry; + return ret; } static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in, @@ -263,6 +304,102 @@ out_unlock: out: return ret < 0 ? ret : count; } + +static int read_name_gen = 1; +#define SSC_READ_NAME_BODY "ssc_read_%d" + +struct file * +nfs42_ssc_open(struct vfsmount *ss_mnt, struct nfs_fh *src_fh, + nfs4_stateid *stateid) +{ + struct nfs_fattr fattr; + struct file *filep, *res; + struct nfs_server *server; + struct inode *r_ino = NULL; + struct nfs_open_context *ctx; + struct nfs4_state_owner *sp; + char *read_name = NULL; + int len, status = 0; + + server = NFS_SERVER(ss_mnt->mnt_root->d_inode); + + nfs_fattr_init(&fattr); + + status = nfs4_proc_getattr(server, src_fh, &fattr, NULL, NULL); + if (status < 0) { + res = ERR_PTR(status); + goto out; + } + + res = ERR_PTR(-ENOMEM); + len = strlen(SSC_READ_NAME_BODY) + 16; + read_name = kzalloc(len, GFP_NOFS); + if (read_name == NULL) + goto out; + snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++); + + r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, &fattr, + NULL); + if (IS_ERR(r_ino)) { + res = ERR_CAST(r_ino); + goto out_free_name; + } + + filep = alloc_file_pseudo(r_ino, ss_mnt, read_name, FMODE_READ, + r_ino->i_fop); + if (IS_ERR(filep)) { + res = ERR_CAST(filep); + goto out_free_name; + } + filep->f_mode |= FMODE_READ; + + ctx = alloc_nfs_open_context(filep->f_path.dentry, filep->f_mode, + filep); + if (IS_ERR(ctx)) { + res = ERR_CAST(ctx); + goto out_filep; + } + + res = ERR_PTR(-EINVAL); + sp = nfs4_get_state_owner(server, ctx->cred, GFP_KERNEL); + if (sp == NULL) + goto out_ctx; + + ctx->state = nfs4_get_open_state(r_ino, sp); + if (ctx->state == NULL) + goto out_stateowner; + + set_bit(NFS_SRV_SSC_COPY_STATE, &ctx->state->flags); + set_bit(NFS_OPEN_STATE, &ctx->state->flags); + memcpy(&ctx->state->open_stateid.other, &stateid->other, + NFS4_STATEID_OTHER_SIZE); + update_open_stateid(ctx->state, stateid, NULL, filep->f_mode); + + nfs_file_set_open_context(filep, ctx); + put_nfs_open_context(ctx); + + file_ra_state_init(&filep->f_ra, filep->f_mapping->host->i_mapping); + res = filep; +out_free_name: + kfree(read_name); +out: + return res; +out_stateowner: + nfs4_put_state_owner(sp); +out_ctx: + put_nfs_open_context(ctx); +out_filep: + fput(filep); + goto out_free_name; +} +EXPORT_SYMBOL_GPL(nfs42_ssc_open); +void nfs42_ssc_close(struct file *filep) +{ + struct nfs_open_context *ctx = nfs_file_open_context(filep); + + ctx->state->flags = 0; +} +EXPORT_SYMBOL_GPL(nfs42_ssc_close); #endif /* CONFIG_NFS_V4_2 */ const struct file_operations nfs4_file_operations = { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index caacf5e7f5e1..76d37161409a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -91,7 +91,6 @@ struct nfs4_opendata; static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); -static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label, struct inode *inode); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label, struct inode *inode); static int nfs4_do_setattr(struct inode *inode, const struct cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, @@ -476,6 +475,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_PARTNER_NO_AUTH: if (inode != NULL && stateid != NULL) { nfs_inode_find_state_and_recover(inode, stateid); @@ -521,9 +521,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server, case -NFS4ERR_DEADSESSION: case -NFS4ERR_SEQ_FALSE_RETRY: case -NFS4ERR_SEQ_MISORDERED: - dprintk("%s ERROR: %d Reset session\n", __func__, - errorcode); - nfs4_schedule_session_recovery(clp->cl_session, errorcode); + /* Handled in nfs41_sequence_process() */ goto wait_on_recovery; #endif /* defined(CONFIG_NFS_V4_1) */ case -NFS4ERR_FILE_OPEN: @@ -782,6 +780,7 @@ static int nfs41_sequence_process(struct rpc_task *task, struct nfs4_session *session; struct nfs4_slot *slot = res->sr_slot; struct nfs_client *clp; + int status; int ret = 1; if (slot == NULL) @@ -793,8 +792,13 @@ static int nfs41_sequence_process(struct rpc_task *task, session = slot->table->session; trace_nfs4_sequence_done(session, res); + + status = res->sr_status; + if (task->tk_status == -NFS4ERR_DEADSESSION) + status = -NFS4ERR_DEADSESSION; + /* Check the SEQUENCE operation status */ - switch (res->sr_status) { + switch (status) { case 0: /* Mark this sequence number as having been acked */ nfs4_slot_sequence_acked(slot, slot->seq_nr); @@ -866,6 +870,10 @@ static int nfs41_sequence_process(struct rpc_task *task, */ slot->seq_nr = slot->seq_nr_highest_sent; goto out_retry; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + goto session_recover; default: /* Just update the slot sequence no. */ slot->seq_done = 1; @@ -876,8 +884,10 @@ out: out_noaction: return ret; session_recover: - nfs4_schedule_session_recovery(session, res->sr_status); - goto retry_nowait; + nfs4_schedule_session_recovery(session, status); + dprintk("%s ERROR: %d Reset session\n", __func__, status); + nfs41_sequence_free_slot(res); + goto out; retry_new_seq: ++slot->seq_nr; retry_nowait: @@ -1716,7 +1726,7 @@ static void nfs_state_clear_delegation(struct nfs4_state *state) write_sequnlock(&state->seqlock); } -static int update_open_stateid(struct nfs4_state *state, +int update_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, const nfs4_stateid *delegation, fmode_t fmode) @@ -1737,7 +1747,7 @@ static int update_open_stateid(struct nfs4_state *state, ret = 1; } - deleg_cur = rcu_dereference(nfsi->delegation); + deleg_cur = nfs4_get_valid_delegation(state->inode); if (deleg_cur == NULL) goto no_delegation; @@ -1749,7 +1759,7 @@ static int update_open_stateid(struct nfs4_state *state, if (delegation == NULL) delegation = &deleg_cur->stateid; - else if (!nfs4_stateid_match(&deleg_cur->stateid, delegation)) + else if (!nfs4_stateid_match_other(&deleg_cur->stateid, delegation)) goto no_delegation_unlock; nfs_mark_delegation_referenced(deleg_cur); @@ -1796,7 +1806,7 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo fmode &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); - delegation = rcu_dereference(NFS_I(inode)->delegation); + delegation = nfs4_get_valid_delegation(inode); if (delegation == NULL || (delegation->type & fmode) == fmode) { rcu_read_unlock(); return; @@ -2188,7 +2198,6 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); return -EAGAIN; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: @@ -4062,7 +4071,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); } -static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, +int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label, struct inode *inode) { @@ -5098,12 +5107,12 @@ static bool nfs4_stateid_is_current(nfs4_stateid *stateid, const struct nfs_lock_context *l_ctx, fmode_t fmode) { - nfs4_stateid current_stateid; + nfs4_stateid _current_stateid; /* If the current stateid represents a lost lock, then exit */ - if (nfs4_set_rw_stateid(¤t_stateid, ctx, l_ctx, fmode) == -EIO) + if (nfs4_set_rw_stateid(&_current_stateid, ctx, l_ctx, fmode) == -EIO) return true; - return nfs4_stateid_match(stateid, ¤t_stateid); + return nfs4_stateid_match(stateid, &_current_stateid); } static bool nfs4_error_stateid_expired(int err) @@ -6196,10 +6205,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) task->tk_status = 0; break; case -NFS4ERR_OLD_STATEID: - if (nfs4_refresh_delegation_stateid(&data->stateid, data->inode)) - goto out_restart; - task->tk_status = 0; - break; + if (!nfs4_refresh_delegation_stateid(&data->stateid, data->inode)) + nfs4_stateid_seqid_inc(&data->stateid); + if (data->args.bitmask) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + } + goto out_restart; case -NFS4ERR_ACCESS: if (data->args.bitmask) { data->args.bitmask = NULL; @@ -6214,6 +6226,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) if (exception.retry) goto out_restart; } + nfs_delegation_mark_returned(data->inode, data->args.stateid); data->rpc_status = task->tk_status; return; out_restart: @@ -6243,8 +6256,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; - if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) + if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) { + nfs4_sequence_done(task, &d_data->res.seq_res); return; + } lo = d_data->args.lr_args ? d_data->args.lr_args->layout : NULL; if (lo && !pnfs_layout_is_valid(lo)) { @@ -7820,6 +7835,15 @@ nfs41_same_server_scope(struct nfs41_server_scope *a, static void nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata) { + struct nfs41_bind_conn_to_session_args *args = task->tk_msg.rpc_argp; + struct nfs_client *clp = args->client; + + switch (task->tk_status) { + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + nfs4_schedule_session_recovery(clp->cl_session, + task->tk_status); + } } static const struct rpc_call_ops nfs4_bind_one_conn_to_session_ops = { @@ -8867,8 +8891,6 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf case -NFS4ERR_BADSESSION: case -NFS4ERR_DEADSESSION: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - nfs4_schedule_session_recovery(clp->cl_session, - task->tk_status); break; default: nfs4_schedule_lease_recovery(clp); @@ -9897,6 +9919,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_ALLOCATE | NFS_CAP_COPY | NFS_CAP_OFFLOAD_CANCEL + | NFS_CAP_COPY_NOTIFY | NFS_CAP_DEALLOCATE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0c6d53dc3672..34552329233d 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -60,6 +60,7 @@ #include "nfs4session.h" #include "pnfs.h" #include "netns.h" +#include "nfs4trace.h" #define NFSDBG_FACILITY NFSDBG_STATE @@ -1407,7 +1408,7 @@ nfs_state_find_lock_state_by_stateid(struct nfs4_state *state, list_for_each_entry(pos, &state->lock_states, ls_locks) { if (!test_bit(NFS_LOCK_INITIALIZED, &pos->ls_flags)) continue; - if (nfs4_stateid_match_other(&pos->ls_stateid, stateid)) + if (nfs4_stateid_match_or_older(&pos->ls_stateid, stateid)) return pos; } return NULL; @@ -1441,12 +1442,13 @@ void nfs_inode_find_state_and_recover(struct inode *inode, state = ctx->state; if (state == NULL) continue; - if (nfs4_stateid_match_other(&state->stateid, stateid) && + if (nfs4_stateid_match_or_older(&state->stateid, stateid) && nfs4_state_mark_reclaim_nograce(clp, state)) { found = true; continue; } - if (nfs4_stateid_match_other(&state->open_stateid, stateid) && + if (test_bit(NFS_OPEN_STATE, &state->flags) && + nfs4_stateid_match_or_older(&state->open_stateid, stateid) && nfs4_state_mark_reclaim_nograce(clp, state)) { found = true; continue; @@ -1556,16 +1558,32 @@ static void nfs42_complete_copies(struct nfs4_state_owner *sp, struct nfs4_state { struct nfs4_copy_state *copy; - if (!test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags)) + if (!test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags) && + !test_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags)) return; spin_lock(&sp->so_server->nfs_client->cl_lock); list_for_each_entry(copy, &sp->so_server->ss_copies, copies) { - if (!nfs4_stateid_match_other(&state->stateid, ©->parent_state->stateid)) - continue; + if ((test_bit(NFS_CLNT_DST_SSC_COPY_STATE, &state->flags) && + !nfs4_stateid_match_other(&state->stateid, + ©->parent_dst_state->stateid))) + continue; copy->flags = 1; - complete(©->completion); - break; + if (test_and_clear_bit(NFS_CLNT_DST_SSC_COPY_STATE, + &state->flags)) { + clear_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags); + complete(©->completion); + } + } + list_for_each_entry(copy, &sp->so_server->ss_copies, src_copies) { + if ((test_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags) && + !nfs4_stateid_match_other(&state->stateid, + ©->parent_src_state->stateid))) + continue; + copy->flags = 1; + if (test_and_clear_bit(NFS_CLNT_DST_SSC_COPY_STATE, + &state->flags)) + complete(©->completion); } spin_unlock(&sp->so_server->nfs_client->cl_lock); } @@ -1593,6 +1611,7 @@ static int __nfs4_reclaim_open_state(struct nfs4_state_owner *sp, struct nfs4_st if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) { spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { + trace_nfs4_state_lock_reclaim(state, lock); if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags)) pr_warn_ratelimited("NFS: %s: Lock reclaim failed!\n", __func__); } @@ -1609,6 +1628,9 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs struct nfs4_state *state; unsigned int loop = 0; int status = 0; +#ifdef CONFIG_NFS_V4_2 + bool found_ssc_copy_state = false; +#endif /* CONFIG_NFS_V4_2 */ /* Note: we rely on the sp->so_states list being ordered * so that we always reclaim open(O_RDWR) and/or open(O_WRITE) @@ -1628,6 +1650,13 @@ restart: continue; if (state->state == 0) continue; +#ifdef CONFIG_NFS_V4_2 + if (test_bit(NFS_SRV_SSC_COPY_STATE, &state->flags)) { + nfs4_state_mark_recovery_failed(state, -EIO); + found_ssc_copy_state = true; + continue; + } +#endif /* CONFIG_NFS_V4_2 */ refcount_inc(&state->count); spin_unlock(&sp->so_lock); status = __nfs4_reclaim_open_state(sp, state, ops); @@ -1682,6 +1711,10 @@ restart: } raw_write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); +#ifdef CONFIG_NFS_V4_2 + if (found_ssc_copy_state) + return -EIO; +#endif /* CONFIG_NFS_V4_2 */ return 0; out_err: nfs4_put_open_state(state); @@ -2508,6 +2541,7 @@ static void nfs4_state_manager(struct nfs_client *clp) /* Ensure exclusive access to NFSv4 state */ do { + trace_nfs4_state_mgr(clp); clear_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) { section = "purge state"; @@ -2621,6 +2655,7 @@ static void nfs4_state_manager(struct nfs_client *clp) out_error: if (strlen(section)) section_sep = ": "; + trace_nfs4_state_mgr_failed(clp, section, status); pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" " with error %d\n", section_sep, section, clp->cl_hostname, -status); diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 04c57066a11a..2c9cbade561a 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -92,8 +92,8 @@ static void nfs4_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - /* If we are holding a delegation, return it! */ - nfs_inode_return_delegation_noreclaim(inode); + /* If we are holding a delegation, return and free it */ + nfs_inode_evict_delegation(inode); /* Note that above delegreturn would trigger pnfs return-on-close */ pnfs_return_layout(inode); pnfs_destroy_layout(NFS_I(inode)); diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index b2f395fa7350..e60b6fbd5ada 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -562,6 +562,99 @@ TRACE_EVENT(nfs4_setup_sequence, ) ); +TRACE_DEFINE_ENUM(NFS4CLNT_MANAGER_RUNNING); +TRACE_DEFINE_ENUM(NFS4CLNT_CHECK_LEASE); +TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_EXPIRED); +TRACE_DEFINE_ENUM(NFS4CLNT_RECLAIM_REBOOT); +TRACE_DEFINE_ENUM(NFS4CLNT_RECLAIM_NOGRACE); +TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN); +TRACE_DEFINE_ENUM(NFS4CLNT_SESSION_RESET); +TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_CONFIRM); +TRACE_DEFINE_ENUM(NFS4CLNT_SERVER_SCOPE_MISMATCH); +TRACE_DEFINE_ENUM(NFS4CLNT_PURGE_STATE); +TRACE_DEFINE_ENUM(NFS4CLNT_BIND_CONN_TO_SESSION); +TRACE_DEFINE_ENUM(NFS4CLNT_MOVED); +TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED); +TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED); +TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER); +TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING); + +#define show_nfs4_clp_state(state) \ + __print_flags(state, "|", \ + { NFS4CLNT_MANAGER_RUNNING, "MANAGER_RUNNING" }, \ + { NFS4CLNT_CHECK_LEASE, "CHECK_LEASE" }, \ + { NFS4CLNT_LEASE_EXPIRED, "LEASE_EXPIRED" }, \ + { NFS4CLNT_RECLAIM_REBOOT, "RECLAIM_REBOOT" }, \ + { NFS4CLNT_RECLAIM_NOGRACE, "RECLAIM_NOGRACE" }, \ + { NFS4CLNT_DELEGRETURN, "DELEGRETURN" }, \ + { NFS4CLNT_SESSION_RESET, "SESSION_RESET" }, \ + { NFS4CLNT_LEASE_CONFIRM, "LEASE_CONFIRM" }, \ + { NFS4CLNT_SERVER_SCOPE_MISMATCH, \ + "SERVER_SCOPE_MISMATCH" }, \ + { NFS4CLNT_PURGE_STATE, "PURGE_STATE" }, \ + { NFS4CLNT_BIND_CONN_TO_SESSION, \ + "BIND_CONN_TO_SESSION" }, \ + { NFS4CLNT_MOVED, "MOVED" }, \ + { NFS4CLNT_LEASE_MOVED, "LEASE_MOVED" }, \ + { NFS4CLNT_DELEGATION_EXPIRED, "DELEGATION_EXPIRED" }, \ + { NFS4CLNT_RUN_MANAGER, "RUN_MANAGER" }, \ + { NFS4CLNT_DELEGRETURN_RUNNING, "DELEGRETURN_RUNNING" }) + +TRACE_EVENT(nfs4_state_mgr, + TP_PROTO( + const struct nfs_client *clp + ), + + TP_ARGS(clp), + + TP_STRUCT__entry( + __field(unsigned long, state) + __string(hostname, clp->cl_hostname) + ), + + TP_fast_assign( + __entry->state = clp->cl_state; + __assign_str(hostname, clp->cl_hostname) + ), + + TP_printk( + "hostname=%s clp state=%s", __get_str(hostname), + show_nfs4_clp_state(__entry->state) + ) +) + +TRACE_EVENT(nfs4_state_mgr_failed, + TP_PROTO( + const struct nfs_client *clp, + const char *section, + int status + ), + + TP_ARGS(clp, section, status), + + TP_STRUCT__entry( + __field(unsigned long, error) + __field(unsigned long, state) + __string(hostname, clp->cl_hostname) + __string(section, section) + ), + + TP_fast_assign( + __entry->error = status; + __entry->state = clp->cl_state; + __assign_str(hostname, clp->cl_hostname); + __assign_str(section, section); + ), + + TP_printk( + "hostname=%s clp state=%s error=%ld (%s) section=%s", + __get_str(hostname), + show_nfs4_clp_state(__entry->state), -__entry->error, + show_nfsv4_errors(__entry->error), __get_str(section) + + ) +) + TRACE_EVENT(nfs4_xdr_status, TP_PROTO( const struct xdr_stream *xdr, @@ -929,6 +1022,88 @@ TRACE_EVENT(nfs4_set_lock, ) ); +TRACE_DEFINE_ENUM(LK_STATE_IN_USE); +TRACE_DEFINE_ENUM(NFS_DELEGATED_STATE); +TRACE_DEFINE_ENUM(NFS_OPEN_STATE); +TRACE_DEFINE_ENUM(NFS_O_RDONLY_STATE); +TRACE_DEFINE_ENUM(NFS_O_WRONLY_STATE); +TRACE_DEFINE_ENUM(NFS_O_RDWR_STATE); +TRACE_DEFINE_ENUM(NFS_STATE_RECLAIM_REBOOT); +TRACE_DEFINE_ENUM(NFS_STATE_RECLAIM_NOGRACE); +TRACE_DEFINE_ENUM(NFS_STATE_POSIX_LOCKS); +TRACE_DEFINE_ENUM(NFS_STATE_RECOVERY_FAILED); +TRACE_DEFINE_ENUM(NFS_STATE_MAY_NOTIFY_LOCK); +TRACE_DEFINE_ENUM(NFS_STATE_CHANGE_WAIT); +TRACE_DEFINE_ENUM(NFS_CLNT_DST_SSC_COPY_STATE); +TRACE_DEFINE_ENUM(NFS_CLNT_SRC_SSC_COPY_STATE); +TRACE_DEFINE_ENUM(NFS_SRV_SSC_COPY_STATE); + +#define show_nfs4_state_flags(flags) \ + __print_flags(flags, "|", \ + { LK_STATE_IN_USE, "IN_USE" }, \ + { NFS_DELEGATED_STATE, "DELEGATED" }, \ + { NFS_OPEN_STATE, "OPEN" }, \ + { NFS_O_RDONLY_STATE, "O_RDONLY" }, \ + { NFS_O_WRONLY_STATE, "O_WRONLY" }, \ + { NFS_O_RDWR_STATE, "O_RDWR" }, \ + { NFS_STATE_RECLAIM_REBOOT, "RECLAIM_REBOOT" }, \ + { NFS_STATE_RECLAIM_NOGRACE, "RECLAIM_NOGRACE" }, \ + { NFS_STATE_POSIX_LOCKS, "POSIX_LOCKS" }, \ + { NFS_STATE_RECOVERY_FAILED, "RECOVERY_FAILED" }, \ + { NFS_STATE_MAY_NOTIFY_LOCK, "MAY_NOTIFY_LOCK" }, \ + { NFS_STATE_CHANGE_WAIT, "CHANGE_WAIT" }, \ + { NFS_CLNT_DST_SSC_COPY_STATE, "CLNT_DST_SSC_COPY" }, \ + { NFS_CLNT_SRC_SSC_COPY_STATE, "CLNT_SRC_SSC_COPY" }, \ + { NFS_SRV_SSC_COPY_STATE, "SRV_SSC_COPY" }) + +#define show_nfs4_lock_flags(flags) \ + __print_flags(flags, "|", \ + { BIT(NFS_LOCK_INITIALIZED), "INITIALIZED" }, \ + { BIT(NFS_LOCK_LOST), "LOST" }) + +TRACE_EVENT(nfs4_state_lock_reclaim, + TP_PROTO( + const struct nfs4_state *state, + const struct nfs4_lock_state *lock + ), + + TP_ARGS(state, lock), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned long, state_flags) + __field(unsigned long, lock_flags) + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->state_flags = state->flags; + __entry->lock_flags = lock->ls_flags; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x state_flags=%s lock_flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + show_nfs4_state_flags(__entry->state_flags), + show_nfs4_lock_flags(__entry->lock_flags) + ) +) + DECLARE_EVENT_CLASS(nfs4_set_delegation_event, TP_PROTO( const struct inode *inode, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index ab07db0f07cd..936c57779ff4 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1059,7 +1059,7 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve } static __be32 * -xdr_encode_nfstime4(__be32 *p, const struct timespec *t) +xdr_encode_nfstime4(__be32 *p, const struct timespec64 *t) { p = xdr_encode_hyper(p, (__s64)t->tv_sec); *p++ = cpu_to_be32(t->tv_nsec); @@ -1072,7 +1072,6 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server, const uint32_t attrmask[]) { - struct timespec ts; char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; int owner_namelen = 0; @@ -1161,16 +1160,14 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { if (iap->ia_valid & ATTR_ATIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - ts = timespec64_to_timespec(iap->ia_atime); - p = xdr_encode_nfstime4(p, &ts); + p = xdr_encode_nfstime4(p, &iap->ia_atime); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { if (iap->ia_valid & ATTR_MTIME_SET) { *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); - ts = timespec64_to_timespec(iap->ia_mtime); - p = xdr_encode_nfstime4(p, &ts); + p = xdr_encode_nfstime4(p, &iap->ia_mtime); } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } @@ -4065,17 +4062,17 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint } static __be32 * -xdr_decode_nfstime4(__be32 *p, struct timespec *t) +xdr_decode_nfstime4(__be32 *p, struct timespec64 *t) { __u64 sec; p = xdr_decode_hyper(p, &sec); - t-> tv_sec = (time_t)sec; + t-> tv_sec = sec; t->tv_nsec = be32_to_cpup(p++); return p; } -static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) +static int decode_attr_time(struct xdr_stream *xdr, struct timespec64 *time) { __be32 *p; @@ -4086,7 +4083,7 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) return 0; } -static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time) { int status = 0; @@ -4104,7 +4101,7 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str return status; } -static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time) { int status = 0; @@ -4123,7 +4120,7 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s } static int decode_attr_time_delta(struct xdr_stream *xdr, uint32_t *bitmap, - struct timespec *time) + struct timespec64 *time) { int status = 0; @@ -4186,7 +4183,7 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, return status; } -static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) +static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time) { int status = 0; @@ -7581,6 +7578,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(CLONE, enc_clone, dec_clone), PROC42(COPY, enc_copy, dec_copy), PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel), + PROC42(COPY_NOTIFY, enc_copy_notify, dec_copy_notify), PROC(LOOKUPP, enc_lookupp, dec_lookupp), PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror), }; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 361cc10d6f95..f64a33d2a1d1 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1065,6 +1065,39 @@ TRACE_EVENT(nfs_commit_done, ) ); +TRACE_EVENT(nfs_fh_to_dentry, + TP_PROTO( + const struct super_block *sb, + const struct nfs_fh *fh, + u64 fileid, + int error + ), + + TP_ARGS(sb, fh, fileid, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + ), + + TP_fast_assign( + __entry->error = error; + __entry->dev = sb->s_dev; + __entry->fileid = fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + ), + + TP_printk( + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x ", + __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle + ) +); + TRACE_DEFINE_ENUM(NFS_OK); TRACE_DEFINE_ENUM(NFSERR_PERM); TRACE_DEFINE_ENUM(NFSERR_NOENT); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bb80034a7661..cec3070ab577 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2160,8 +2160,6 @@ out_unlock: return NULL; } -extern const nfs4_stateid current_stateid; - static void _lgopen_prepare_attached(struct nfs4_opendata *data, struct nfs_open_context *ctx) { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a84df7d63403..8d8d04bb9d64 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1592,7 +1592,7 @@ static int nfs_parse_mount_options(char *raw, dfprintk(MOUNT, "NFS: invalid " "lookupcache argument\n"); return 0; - }; + } break; case Opt_fscache_uniq: if (nfs_get_option_str(args, &mnt->fscache_uniq)) @@ -1625,7 +1625,7 @@ static int nfs_parse_mount_options(char *raw, dfprintk(MOUNT, "NFS: invalid " "local_lock argument\n"); return 0; - }; + } break; /* @@ -2585,7 +2585,7 @@ static void nfs_get_cache_cookie(struct super_block *sb, if (mnt_s->fscache_key) { uniq = mnt_s->fscache_key->key.uniquifier; ulen = mnt_s->fscache_key->key.uniq_len; - }; + } } else return; diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 4f3390b20239..c489496b5659 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -121,8 +121,7 @@ static void nfs_netns_client_release(struct kobject *kobj) struct nfs_netns_client, kobject); - if (c->identifier) - kfree(c->identifier); + kfree(c->identifier); kfree(c); } diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 10cefb0c07c7..f2f81561ebb6 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -73,7 +73,8 @@ config NFSD_V4 select NFSD_V3 select FS_POSIX_ACL select SUNRPC_GSS - select CRYPTO + select CRYPTO_MD5 + select CRYPTO_SHA256 select GRACE_PERIOD help This option enables support in your system's NFS server for diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ef55e9b1cd4e..32a9bf22ac08 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -685,8 +685,6 @@ nfsd_file_cache_purge(struct net *net) void nfsd_file_cache_shutdown(void) { - LIST_HEAD(dispose); - set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); lease_unregister_notifier(&nfsd_file_lease_notifier); diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 86e5658651f1..195ab7a0fc89 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -863,13 +863,11 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, } else dchild = dget(dparent); } else - dchild = lookup_one_len_unlocked(name, dparent, namlen); + dchild = lookup_positive_unlocked(name, dparent, namlen); if (IS_ERR(dchild)) return rv; if (d_mountpoint(dchild)) goto out; - if (d_really_is_negative(dchild)) - goto out; if (dchild->d_inode->i_ino != ino) goto out; rv = fh_compose(fhp, exp, dchild, &cd->fh); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 524111420b48..24534db87e86 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -826,6 +826,31 @@ static int max_cb_time(struct net *net) return max(nn->nfsd4_lease/10, (time_t)1) * HZ; } +static struct workqueue_struct *callback_wq; + +static bool nfsd4_queue_cb(struct nfsd4_callback *cb) +{ + return queue_work(callback_wq, &cb->cb_work); +} + +static void nfsd41_cb_inflight_begin(struct nfs4_client *clp) +{ + atomic_inc(&clp->cl_cb_inflight); +} + +static void nfsd41_cb_inflight_end(struct nfs4_client *clp) +{ + + if (atomic_dec_and_test(&clp->cl_cb_inflight)) + wake_up_var(&clp->cl_cb_inflight); +} + +static void nfsd41_cb_inflight_wait_complete(struct nfs4_client *clp) +{ + wait_var_event(&clp->cl_cb_inflight, + !atomic_read(&clp->cl_cb_inflight)); +} + static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses) { if (clp->cl_minorversion == 0) { @@ -937,14 +962,21 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) clp->cl_cb_state = NFSD4_CB_UP; } +static void nfsd4_cb_probe_release(void *calldata) +{ + struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); + + nfsd41_cb_inflight_end(clp); + +} + static const struct rpc_call_ops nfsd4_cb_probe_ops = { /* XXX: release method to ensure we set the cb channel down if * necessary on early failure? */ .rpc_call_done = nfsd4_cb_probe_done, + .rpc_release = nfsd4_cb_probe_release, }; -static struct workqueue_struct *callback_wq; - /* * Poke the callback thread to process any updates to the callback * parameters, and send a null probe. @@ -975,9 +1007,12 @@ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn) * If the slot is available, then mark it busy. Otherwise, set the * thread for sleeping on the callback RPC wait queue. */ -static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) +static bool nfsd41_cb_get_slot(struct nfsd4_callback *cb, struct rpc_task *task) { - if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { + struct nfs4_client *clp = cb->cb_clp; + + if (!cb->cb_holds_slot && + test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { rpc_sleep_on(&clp->cl_cb_waitq, task, NULL); /* Race breaker */ if (test_and_set_bit(0, &clp->cl_cb_slot_busy) != 0) { @@ -986,9 +1021,31 @@ static bool nfsd41_cb_get_slot(struct nfs4_client *clp, struct rpc_task *task) } rpc_wake_up_queued_task(&clp->cl_cb_waitq, task); } + cb->cb_holds_slot = true; return true; } +static void nfsd41_cb_release_slot(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + + if (cb->cb_holds_slot) { + cb->cb_holds_slot = false; + clear_bit(0, &clp->cl_cb_slot_busy); + rpc_wake_up_next(&clp->cl_cb_waitq); + } +} + +static void nfsd41_destroy_cb(struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + + nfsd41_cb_release_slot(cb); + if (cb->cb_ops && cb->cb_ops->release) + cb->cb_ops->release(cb); + nfsd41_cb_inflight_end(clp); +} + /* * TODO: cb_sequence should support referring call lists, cachethis, multiple * slots, and mark callback channel down on communication errors. @@ -1005,11 +1062,8 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) */ cb->cb_seq_status = 1; cb->cb_status = 0; - if (minorversion) { - if (!cb->cb_holds_slot && !nfsd41_cb_get_slot(clp, task)) - return; - cb->cb_holds_slot = true; - } + if (minorversion && !nfsd41_cb_get_slot(cb, task)) + return; rpc_call_start(task); } @@ -1072,13 +1126,12 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback } break; default: + nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); dprintk("%s: unprocessed error %d\n", __func__, cb->cb_seq_status); } - cb->cb_holds_slot = false; - clear_bit(0, &clp->cl_cb_slot_busy); - rpc_wake_up_next(&clp->cl_cb_waitq); + nfsd41_cb_release_slot(cb); dprintk("%s: freed slot, new seqid=%d\n", __func__, clp->cl_cb_session->se_cb_seq_nr); @@ -1091,8 +1144,10 @@ retry_nowait: ret = false; goto out; need_restart: - task->tk_status = 0; - cb->cb_need_restart = true; + if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { + task->tk_status = 0; + cb->cb_need_restart = true; + } return false; } @@ -1134,9 +1189,9 @@ static void nfsd4_cb_release(void *calldata) struct nfsd4_callback *cb = calldata; if (cb->cb_need_restart) - nfsd4_run_cb(cb); + nfsd4_queue_cb(cb); else - cb->cb_ops->release(cb); + nfsd41_destroy_cb(cb); } @@ -1170,6 +1225,7 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp) */ nfsd4_run_cb(&clp->cl_cb_null); flush_workqueue(callback_wq); + nfsd41_cb_inflight_wait_complete(clp); } /* requires cl_lock: */ @@ -1187,6 +1243,12 @@ static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) return NULL; } +/* + * Note there isn't a lot of locking in this code; instead we depend on + * the fact that it is run from the callback_wq, which won't run two + * work items at once. So, for example, callback_wq handles all access + * of cl_cb_client and all calls to rpc_create or rpc_shutdown_client. + */ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) { struct nfs4_cb_conn conn; @@ -1255,8 +1317,7 @@ nfsd4_run_cb_work(struct work_struct *work) clnt = clp->cl_cb_client; if (!clnt) { /* Callback channel broken, or client killed; give up: */ - if (cb->cb_ops && cb->cb_ops->release) - cb->cb_ops->release(cb); + nfsd41_destroy_cb(cb); return; } @@ -1265,6 +1326,7 @@ nfsd4_run_cb_work(struct work_struct *work) */ if (!cb->cb_ops && clp->cl_minorversion) { clp->cl_cb_state = NFSD4_CB_UP; + nfsd41_destroy_cb(cb); return; } @@ -1290,5 +1352,9 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, void nfsd4_run_cb(struct nfsd4_callback *cb) { - queue_work(callback_wq, &cb->cb_work); + struct nfs4_client *clp = cb->cb_clp; + + nfsd41_cb_inflight_begin(clp); + if (!nfsd4_queue_cb(cb)) + nfsd41_cb_inflight_end(clp); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 4e3e77b76411..4798667af647 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1077,7 +1077,8 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; status = nfsd4_clone_file_range(src->nf_file, clone->cl_src_pos, - dst->nf_file, clone->cl_dst_pos, clone->cl_count); + dst->nf_file, clone->cl_dst_pos, clone->cl_count, + EX_ISSYNC(cstate->current_fh.fh_export)); nfsd_file_put(dst); nfsd_file_put(src); @@ -1297,7 +1298,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: return status; out_err: - cleanup_async_copy(async_copy); + if (async_copy) + cleanup_async_copy(async_copy); goto out; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index cdc75ad4438b..2481e7662128 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -1578,6 +1578,7 @@ nfsd4_cld_tracking_init(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); bool running; int retries = 10; + struct crypto_shash *tfm; status = nfs4_cld_state_init(net); if (status) @@ -1586,11 +1587,6 @@ nfsd4_cld_tracking_init(struct net *net) status = __nfsd4_init_cld_pipe(net); if (status) goto err_shutdown; - nn->cld_net->cn_tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(nn->cld_net->cn_tfm)) { - status = PTR_ERR(nn->cld_net->cn_tfm); - goto err_remove; - } /* * rpc pipe upcalls take 30 seconds to time out, so we don't want to @@ -1607,6 +1603,12 @@ nfsd4_cld_tracking_init(struct net *net) status = -ETIMEDOUT; goto err_remove; } + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + status = PTR_ERR(tfm); + goto err_remove; + } + nn->cld_net->cn_tfm = tfm; status = nfsd4_cld_get_version(nn); if (status == -EOPNOTSUPP) @@ -1850,19 +1852,14 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1) static char * bin_to_hex_dup(const unsigned char *src, int srclen) { - int i; - char *buf, *hex; + char *buf; /* +1 for terminating NULL */ - buf = kmalloc((srclen * 2) + 1, GFP_KERNEL); + buf = kzalloc((srclen * 2) + 1, GFP_KERNEL); if (!buf) return buf; - hex = buf; - for (i = 0; i < srclen; i++) { - sprintf(hex, "%2.2x", *src++); - hex += 2; - } + bin2hex(buf, src, srclen); return buf; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c65aeaa812d4..369e574c5092 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2382,10 +2382,10 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) access = bmap_to_share_mode(ols->st_access_bmap); deny = bmap_to_share_mode(ols->st_deny_bmap); - seq_printf(s, "access: \%s\%s, ", + seq_printf(s, "access: %s%s, ", access & NFS4_SHARE_ACCESS_READ ? "r" : "-", access & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); - seq_printf(s, "deny: \%s\%s, ", + seq_printf(s, "deny: %s%s, ", deny & NFS4_SHARE_ACCESS_READ ? "r" : "-", deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); @@ -3548,12 +3548,17 @@ static bool replay_matches_cache(struct svc_rqst *rqstp, (bool)seq->cachethis) return false; /* - * If there's an error than the reply can have fewer ops than - * the call. But if we cached a reply with *more* ops than the - * call you're sending us now, then this new call is clearly not - * really a replay of the old one: + * If there's an error then the reply can have fewer ops than + * the call. */ - if (slot->sl_opcnt < argp->opcnt) + if (slot->sl_opcnt < argp->opcnt && !slot->sl_status) + return false; + /* + * But if we cached a reply with *more* ops than the call you're + * sending us now, then this new call is clearly not really a + * replay of the old one: + */ + if (slot->sl_opcnt > argp->opcnt) return false; /* This is the only check explicitly called by spec: */ if (!same_creds(&rqstp->rq_cred, &slot->sl_cred)) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 533d0fc3c96b..d2dc4c0e22e8 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2991,18 +2991,9 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, __be32 nfserr; int ignore_crossmnt = 0; - dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen); + dentry = lookup_positive_unlocked(name, cd->rd_fhp->fh_dentry, namlen); if (IS_ERR(dentry)) return nfserrno(PTR_ERR(dentry)); - if (d_really_is_negative(dentry)) { - /* - * we're not holding the i_mutex here, so there's - * a window where this directory entry could have gone - * away. - */ - dput(dentry); - return nfserr_noent; - } exp_get(exp); /* @@ -3461,7 +3452,6 @@ static __be32 nfsd4_encode_splice_read( struct xdr_stream *xdr = &resp->xdr; struct xdr_buf *buf = xdr->buf; u32 eof; - long len; int space_left; __be32 nfserr; __be32 *p = xdr->p - 2; @@ -3470,7 +3460,6 @@ static __be32 nfsd4_encode_splice_read( if (xdr->end - xdr->p < 1) return nfserr_resource; - len = maxcount; nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, file, read->rd_offset, &maxcount, &eof); read->rd_length = maxcount; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index af2947551e9c..57b93d95fa5c 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -280,7 +280,8 @@ void nfsd_lockd_shutdown(void); #define nfserr_union_notsupp cpu_to_be32(NFS4ERR_UNION_NOTSUPP) #define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED) #define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS) -#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL) +#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL) +#define nfserr_file_open cpu_to_be32(NFS4ERR_FILE_OPEN) /* error codes for internal use */ /* if a request fails due to kmalloc failure, it gets dropped. diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index fdf7ed4bd5dd..e8bee8ff30c5 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -95,12 +95,11 @@ static const struct svc_version *nfsd_acl_version[] = { #define NFSD_ACL_MINVERS 2 #define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version) -static const struct svc_version *nfsd_acl_versions[NFSD_ACL_NRVERS]; static struct svc_program nfsd_acl_program = { .pg_prog = NFS_ACL_PROGRAM, .pg_nvers = NFSD_ACL_NRVERS, - .pg_vers = nfsd_acl_versions, + .pg_vers = nfsd_acl_version, .pg_name = "nfsacl", .pg_class = "nfsd", .pg_stats = &nfsd_acl_svcstats, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 46f56afb6cb8..d61b83b9654c 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -367,6 +367,7 @@ struct nfs4_client { struct net *net; struct list_head async_copies; /* list of async copies */ spinlock_t async_lock; /* lock for async copies */ + atomic_t cl_cb_inflight; /* Outstanding callbacks */ }; /* struct nfs4_client_reset diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index bd0a385df3fc..c0dc491537a6 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -525,7 +525,7 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, #endif __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, - u64 dst_pos, u64 count) + u64 dst_pos, u64 count, bool sync) { loff_t cloned; @@ -534,6 +534,12 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, return nfserrno(cloned); if (count && cloned != count) return nfserrno(-EINVAL); + if (sync) { + loff_t dst_end = count ? dst_pos + count - 1 : LLONG_MAX; + int status = vfs_fsync_range(dst, dst_pos, dst_end, 0); + if (status < 0) + return nfserrno(status); + } return 0; } @@ -1809,7 +1815,17 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, out_drop_write: fh_drop_write(fhp); out_nfserr: - err = nfserrno(host_err); + if (host_err == -EBUSY) { + /* name is mounted-on. There is no perfect + * error status. + */ + if (nfsd_v4client(rqstp)) + err = nfserr_file_open; + else + err = nfserr_acces; + } else { + err = nfserrno(host_err); + } out: return err; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index a13fd9d7e1f5..cc110a10bfe8 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -56,7 +56,7 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, struct file *, loff_t, loff_t, int); __be32 nfsd4_clone_file_range(struct file *, u64, struct file *, - u64, u64); + u64, u64, bool); #endif /* CONFIG_NFSD_V4 */ __be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 3e77b728a22b..46f225580009 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -57,6 +57,9 @@ static void fsnotify_unmount_inodes(struct super_block *sb) * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. + * However, we should have been called /after/ evict_inodes + * removed all zero refcount inodes, in any case. Test to + * be sure. */ if (!atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); @@ -77,6 +80,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb) iput_inode = inode; + cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index a5612abc0936..c740159d9ad1 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -46,8 +46,9 @@ static int flush_racache(struct inode *inode) * Post and wait for the I/O upcall to finish */ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, - loff_t *offset, struct iov_iter *iter, size_t total_size, - loff_t readahead_size, struct orangefs_write_range *wr, int *index_return) + loff_t *offset, struct iov_iter *iter, size_t total_size, + loff_t readahead_size, struct orangefs_write_range *wr, + int *index_return, struct file *file) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; @@ -55,6 +56,8 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, int buffer_index; ssize_t ret; size_t copy_amount; + int open_for_read; + int open_for_write; new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); if (!new_op) @@ -90,6 +93,38 @@ populate_shared_memory: new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid); new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid); } + /* + * Orangefs has no open, and orangefs checks file permissions + * on each file access. Posix requires that file permissions + * be checked on open and nowhere else. Orangefs-through-the-kernel + * needs to seem posix compliant. + * + * The VFS opens files, even if the filesystem provides no + * method. We can see if a file was successfully opened for + * read and or for write by looking at file->f_mode. + * + * When writes are flowing from the page cache, file is no + * longer available. We can trust the VFS to have checked + * file->f_mode before writing to the page cache. + * + * The mode of a file might change between when it is opened + * and IO commences, or it might be created with an arbitrary mode. + * + * We'll make sure we don't hit EACCES during the IO stage by + * using UID 0. Some of the time we have access without changing + * to UID 0 - how to check? + */ + if (file) { + open_for_write = file->f_mode & FMODE_WRITE; + open_for_read = file->f_mode & FMODE_READ; + } else { + open_for_write = 1; + open_for_read = 0; /* not relevant? */ + } + if ((type == ORANGEFS_IO_WRITE) && open_for_write) + new_op->upcall.uid = 0; + if ((type == ORANGEFS_IO_READ) && open_for_read) + new_op->upcall.uid = 0; gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): offset: %llu total_size: %zd\n", diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index efb12197da18..961c0fd8675a 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -55,7 +55,7 @@ static int orangefs_writepage_locked(struct page *page, iov_iter_bvec(&iter, WRITE, &bv, 1, wlen); ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, - len, wr, NULL); + len, wr, NULL, NULL); if (ret < 0) { SetPageError(page); mapping_set_error(page->mapping, ret); @@ -126,7 +126,7 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, wr.uid = ow->uid; wr.gid = ow->gid; ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len, - 0, &wr, NULL); + 0, &wr, NULL, NULL); if (ret < 0) { for (i = 0; i < ow->npages; i++) { SetPageError(ow->pages[i]); @@ -311,7 +311,7 @@ static int orangefs_readpage(struct file *file, struct page *page) iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE); ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, - read_size, inode->i_size, NULL, &buffer_index); + read_size, inode->i_size, NULL, &buffer_index, file); remaining = ret; /* this will only zero remaining unread portions of the page data */ iov_iter_zero(~0U, &iter); @@ -651,7 +651,7 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, (int)*offset); ret = wait_for_direct_io(type, inode, offset, iter, - each_count, 0, NULL, NULL); + each_count, 0, NULL, NULL, file); gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): return from wait_for_io:%d\n", __func__, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 34a6c99fa29b..ed67f39fa7ce 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -398,7 +398,8 @@ bool __is_daemon_in_service(void); */ int orangefs_revalidate_mapping(struct inode *); ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *, - struct iov_iter *, size_t, loff_t, struct orangefs_write_range *, int *); + struct iov_iter *, size_t, loff_t, struct orangefs_write_range *, int *, + struct file *); ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *, struct iov_iter *); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index b801c6353100..6220642fe113 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -227,13 +227,17 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) { struct ovl_fh *fh; - int fh_type, fh_len, dwords; - void *buf; + int fh_type, dwords; int buflen = MAX_HANDLE_SZ; uuid_t *uuid = &real->d_sb->s_uuid; + int err; - buf = kmalloc(buflen, GFP_KERNEL); - if (!buf) + /* Make sure the real fid stays 32bit aligned */ + BUILD_BUG_ON(OVL_FH_FID_OFFSET % 4); + BUILD_BUG_ON(MAX_HANDLE_SZ + OVL_FH_FID_OFFSET > 255); + + fh = kzalloc(buflen + OVL_FH_FID_OFFSET, GFP_KERNEL); + if (!fh) return ERR_PTR(-ENOMEM); /* @@ -242,27 +246,19 @@ struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) * the price or reconnecting the dentry. */ dwords = buflen >> 2; - fh_type = exportfs_encode_fh(real, buf, &dwords, 0); + fh_type = exportfs_encode_fh(real, (void *)fh->fb.fid, &dwords, 0); buflen = (dwords << 2); - fh = ERR_PTR(-EIO); + err = -EIO; if (WARN_ON(fh_type < 0) || WARN_ON(buflen > MAX_HANDLE_SZ) || WARN_ON(fh_type == FILEID_INVALID)) - goto out; + goto out_err; - BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255); - fh_len = offsetof(struct ovl_fh, fid) + buflen; - fh = kmalloc(fh_len, GFP_KERNEL); - if (!fh) { - fh = ERR_PTR(-ENOMEM); - goto out; - } - - fh->version = OVL_FH_VERSION; - fh->magic = OVL_FH_MAGIC; - fh->type = fh_type; - fh->flags = OVL_FH_FLAG_CPU_ENDIAN; + fh->fb.version = OVL_FH_VERSION; + fh->fb.magic = OVL_FH_MAGIC; + fh->fb.type = fh_type; + fh->fb.flags = OVL_FH_FLAG_CPU_ENDIAN; /* * When we will want to decode an overlay dentry from this handle * and all layers are on the same fs, if we get a disconncted real @@ -270,14 +266,15 @@ struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper) * it to upperdentry or to lowerstack is by checking this flag. */ if (is_upper) - fh->flags |= OVL_FH_FLAG_PATH_UPPER; - fh->len = fh_len; - fh->uuid = *uuid; - memcpy(fh->fid, buf, buflen); + fh->fb.flags |= OVL_FH_FLAG_PATH_UPPER; + fh->fb.len = sizeof(fh->fb) + buflen; + fh->fb.uuid = *uuid; -out: - kfree(buf); return fh; + +out_err: + kfree(fh); + return ERR_PTR(err); } int ovl_set_origin(struct dentry *dentry, struct dentry *lower, @@ -300,8 +297,8 @@ int ovl_set_origin(struct dentry *dentry, struct dentry *lower, /* * Do not fail when upper doesn't support xattrs. */ - err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh, - fh ? fh->len : 0, 0); + err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh->buf, + fh ? fh->fb.len : 0, 0); kfree(fh); return err; @@ -317,7 +314,7 @@ static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index) if (IS_ERR(fh)) return PTR_ERR(fh); - err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh, fh->len, 0); + err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh->buf, fh->fb.len, 0); kfree(fh); return err; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 702aa63f6774..29abdb1d3b5c 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -1170,7 +1170,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, if (newdentry == trap) goto out_dput; - if (WARN_ON(olddentry->d_inode == newdentry->d_inode)) + if (olddentry->d_inode == newdentry->d_inode) goto out_dput; err = 0; diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 73c9775215b3..70e55588aedc 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -211,10 +211,11 @@ static int ovl_check_encode_origin(struct dentry *dentry) return 1; } -static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) +static int ovl_dentry_to_fid(struct dentry *dentry, u32 *fid, int buflen) { struct ovl_fh *fh = NULL; int err, enc_lower; + int len; /* * Check if we should encode a lower or upper file handle and maybe @@ -231,11 +232,12 @@ static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) return PTR_ERR(fh); err = -EOVERFLOW; - if (fh->len > buflen) + len = OVL_FH_LEN(fh); + if (len > buflen) goto fail; - memcpy(buf, (char *)fh, fh->len); - err = fh->len; + memcpy(fid, fh, len); + err = len; out: kfree(fh); @@ -243,31 +245,16 @@ out: fail: pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n", - dentry, err, buflen, fh ? (int)fh->len : 0, - fh ? fh->type : 0); + dentry, err, buflen, fh ? (int)fh->fb.len : 0, + fh ? fh->fb.type : 0); goto out; } -static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len) -{ - int res, len = *max_len << 2; - - res = ovl_d_to_fh(dentry, (char *)fid, len); - if (res <= 0) - return FILEID_INVALID; - - len = res; - - /* Round up to dwords */ - *max_len = (len + 3) >> 2; - return OVL_FILEID; -} - static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, struct inode *parent) { struct dentry *dentry; - int type; + int bytes = *max_len << 2; /* TODO: encode connectable file handles */ if (parent) @@ -277,10 +264,14 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len, if (WARN_ON(!dentry)) return FILEID_INVALID; - type = ovl_dentry_to_fh(dentry, fid, max_len); - + bytes = ovl_dentry_to_fid(dentry, fid, bytes); dput(dentry); - return type; + if (bytes <= 0) + return FILEID_INVALID; + + *max_len = bytes >> 2; + + return OVL_FILEID_V1; } /* @@ -777,24 +768,45 @@ out_err: goto out; } +static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type) +{ + struct ovl_fh *fh; + + /* If on-wire inner fid is aligned - nothing to do */ + if (fh_type == OVL_FILEID_V1) + return (struct ovl_fh *)fid; + + if (fh_type != OVL_FILEID_V0) + return ERR_PTR(-EINVAL); + + fh = kzalloc(buflen, GFP_KERNEL); + if (!fh) + return ERR_PTR(-ENOMEM); + + /* Copy unaligned inner fh into aligned buffer */ + memcpy(&fh->fb, fid, buflen - OVL_FH_WIRE_OFFSET); + return fh; +} + static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct dentry *dentry = NULL; - struct ovl_fh *fh = (struct ovl_fh *) fid; + struct ovl_fh *fh = NULL; int len = fh_len << 2; unsigned int flags = 0; int err; - err = -EINVAL; - if (fh_type != OVL_FILEID) + fh = ovl_fid_to_fh(fid, len, fh_type); + err = PTR_ERR(fh); + if (IS_ERR(fh)) goto out_err; err = ovl_check_fh_len(fh, len); if (err) goto out_err; - flags = fh->flags; + flags = fh->fb.flags; dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ? ovl_upper_fh_to_d(sb, fh) : ovl_lower_fh_to_d(sb, fh); @@ -802,12 +814,18 @@ static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid, if (IS_ERR(dentry) && err != -ESTALE) goto out_err; +out: + /* We may have needed to re-align OVL_FILEID_V0 */ + if (!IS_ERR_OR_NULL(fh) && fh != (void *)fid) + kfree(fh); + return dentry; out_err: pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n", - len, fh_type, flags, err); - return ERR_PTR(err); + fh_len, fh_type, flags, err); + dentry = ERR_PTR(err); + goto out; } static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index bc14781886bf..b045cf1826fc 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -200,8 +200,14 @@ int ovl_getattr(const struct path *path, struct kstat *stat, if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) || (!ovl_verify_lower(dentry->d_sb) && (is_dir || lowerstat.nlink == 1))) { - stat->ino = lowerstat.ino; lower_layer = ovl_layer_lower(dentry); + /* + * Cannot use origin st_dev;st_ino because + * origin inode content may differ from overlay + * inode content. + */ + if (samefs || lower_layer->fsid) + stat->ino = lowerstat.ino; } /* diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index e9717c2f7d45..76ff66339173 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -84,21 +84,21 @@ static int ovl_acceptable(void *ctx, struct dentry *dentry) * Return -ENODATA for "origin unknown". * Return <0 for an invalid file handle. */ -int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) +int ovl_check_fb_len(struct ovl_fb *fb, int fb_len) { - if (fh_len < sizeof(struct ovl_fh) || fh_len < fh->len) + if (fb_len < sizeof(struct ovl_fb) || fb_len < fb->len) return -EINVAL; - if (fh->magic != OVL_FH_MAGIC) + if (fb->magic != OVL_FH_MAGIC) return -EINVAL; /* Treat larger version and unknown flags as "origin unknown" */ - if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL) + if (fb->version > OVL_FH_VERSION || fb->flags & ~OVL_FH_FLAG_ALL) return -ENODATA; /* Treat endianness mismatch as "origin unknown" */ - if (!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) && - (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) + if (!(fb->flags & OVL_FH_FLAG_ANY_ENDIAN) && + (fb->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) return -ENODATA; return 0; @@ -119,15 +119,15 @@ static struct ovl_fh *ovl_get_fh(struct dentry *dentry, const char *name) if (res == 0) return NULL; - fh = kzalloc(res, GFP_KERNEL); + fh = kzalloc(res + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); - res = vfs_getxattr(dentry, name, fh, res); + res = vfs_getxattr(dentry, name, fh->buf, res); if (res < 0) goto fail; - err = ovl_check_fh_len(fh, res); + err = ovl_check_fb_len(&fh->fb, res); if (err < 0) { if (err == -ENODATA) goto out; @@ -158,12 +158,12 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, * Make sure that the stored uuid matches the uuid of the lower * layer where file handle will be decoded. */ - if (!uuid_equal(&fh->uuid, &mnt->mnt_sb->s_uuid)) + if (!uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid)) return NULL; - bytes = (fh->len - offsetof(struct ovl_fh, fid)); - real = exportfs_decode_fh(mnt, (struct fid *)fh->fid, - bytes >> 2, (int)fh->type, + bytes = (fh->fb.len - offsetof(struct ovl_fb, fid)); + real = exportfs_decode_fh(mnt, (struct fid *)fh->fb.fid, + bytes >> 2, (int)fh->fb.type, connected ? ovl_acceptable : NULL, mnt); if (IS_ERR(real)) { /* @@ -173,7 +173,7 @@ struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, * index entries correctly. */ if (real == ERR_PTR(-ESTALE) && - !(fh->flags & OVL_FH_FLAG_PATH_UPPER)) + !(fh->fb.flags & OVL_FH_FLAG_PATH_UPPER)) real = NULL; return real; } @@ -200,7 +200,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, int err; bool last_element = !post[0]; - this = lookup_one_len_unlocked(name, base, namelen); + this = lookup_positive_unlocked(name, base, namelen); if (IS_ERR(this)) { err = PTR_ERR(this); this = NULL; @@ -208,8 +208,6 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, goto out; goto out_err; } - if (!this->d_inode) - goto put_and_out; if (ovl_dentry_weird(this)) { /* Don't support traversing automounts and other weirdness */ @@ -325,6 +323,14 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, int i; for (i = 0; i < ofs->numlower; i++) { + /* + * If lower fs uuid is not unique among lower fs we cannot match + * fh->uuid to layer. + */ + if (ofs->lower_layers[i].fsid && + ofs->lower_layers[i].fs->bad_uuid) + continue; + origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt, connected); if (origin) @@ -402,7 +408,7 @@ static int ovl_verify_fh(struct dentry *dentry, const char *name, if (IS_ERR(ofh)) return PTR_ERR(ofh); - if (fh->len != ofh->len || memcmp(fh, ofh, fh->len)) + if (fh->fb.len != ofh->fb.len || memcmp(&fh->fb, &ofh->fb, fh->fb.len)) err = -ESTALE; kfree(ofh); @@ -433,7 +439,7 @@ int ovl_verify_set_fh(struct dentry *dentry, const char *name, err = ovl_verify_fh(dentry, name, fh); if (set && err == -ENODATA) - err = ovl_do_setxattr(dentry, name, fh, fh->len, 0); + err = ovl_do_setxattr(dentry, name, fh->buf, fh->fb.len, 0); if (err) goto fail; @@ -507,20 +513,20 @@ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) goto fail; err = -EINVAL; - if (index->d_name.len < sizeof(struct ovl_fh)*2) + if (index->d_name.len < sizeof(struct ovl_fb)*2) goto fail; err = -ENOMEM; len = index->d_name.len / 2; - fh = kzalloc(len, GFP_KERNEL); + fh = kzalloc(len + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) goto fail; err = -EINVAL; - if (hex2bin((u8 *)fh, index->d_name.name, len)) + if (hex2bin(fh->buf, index->d_name.name, len)) goto fail; - err = ovl_check_fh_len(fh, len); + err = ovl_check_fb_len(&fh->fb, len); if (err) goto fail; @@ -599,11 +605,11 @@ static int ovl_get_index_name_fh(struct ovl_fh *fh, struct qstr *name) { char *n, *s; - n = kcalloc(fh->len, 2, GFP_KERNEL); + n = kcalloc(fh->fb.len, 2, GFP_KERNEL); if (!n) return -ENOMEM; - s = bin2hex(n, fh, fh->len); + s = bin2hex(n, fh->buf, fh->fb.len); *name = (struct qstr) QSTR_INIT(n, s - n); return 0; @@ -651,7 +657,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) if (err) return ERR_PTR(err); - index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len); + index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len); kfree(name.name); if (IS_ERR(index)) { if (PTR_ERR(index) == -ENOENT) @@ -659,9 +665,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) return index; } - if (d_is_negative(index)) - err = 0; - else if (ovl_is_whiteout(index)) + if (ovl_is_whiteout(index)) err = -ESTALE; else if (ovl_dentry_weird(index)) err = -EIO; @@ -685,7 +689,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, if (err) return ERR_PTR(err); - index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len); + index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len); if (IS_ERR(index)) { err = PTR_ERR(index); if (err == -ENOENT) { @@ -700,9 +704,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, } inode = d_inode(index); - if (d_is_negative(index)) { - goto out_dput; - } else if (ovl_is_whiteout(index) && !verify) { + if (ovl_is_whiteout(index) && !verify) { /* * When index lookup is called with !verify for decoding an * overlay file handle, a whiteout index implies that decode @@ -1131,7 +1133,7 @@ bool ovl_lower_positive(struct dentry *dentry) struct dentry *this; struct dentry *lowerdir = poe->lowerstack[i].dentry; - this = lookup_one_len_unlocked(name->name, lowerdir, + this = lookup_positive_unlocked(name->name, lowerdir, name->len); if (IS_ERR(this)) { switch (PTR_ERR(this)) { @@ -1148,10 +1150,8 @@ bool ovl_lower_positive(struct dentry *dentry) break; } } else { - if (this->d_inode) { - positive = !ovl_is_whiteout(this); - done = true; - } + positive = !ovl_is_whiteout(this); + done = true; dput(this); } } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6934bcf030f0..f283b1d69a9e 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -71,20 +71,36 @@ enum ovl_entry_flag { #error Endianness not defined #endif -/* The type returned by overlay exportfs ops when encoding an ovl_fh handle */ -#define OVL_FILEID 0xfb +/* The type used to be returned by overlay exportfs for misaligned fid */ +#define OVL_FILEID_V0 0xfb +/* The type returned by overlay exportfs for 32bit aligned fid */ +#define OVL_FILEID_V1 0xf8 -/* On-disk and in-memeory format for redirect by file handle */ -struct ovl_fh { +/* On-disk format for "origin" file handle */ +struct ovl_fb { u8 version; /* 0 */ u8 magic; /* 0xfb */ u8 len; /* size of this header + size of fid */ u8 flags; /* OVL_FH_FLAG_* */ u8 type; /* fid_type of fid */ uuid_t uuid; /* uuid of filesystem */ - u8 fid[0]; /* file identifier */ + u32 fid[0]; /* file identifier should be 32bit aligned in-memory */ } __packed; +/* In-memory and on-wire format for overlay file handle */ +struct ovl_fh { + u8 padding[3]; /* make sure fb.fid is 32bit aligned */ + union { + struct ovl_fb fb; + u8 buf[0]; + }; +} __packed; + +#define OVL_FH_WIRE_OFFSET offsetof(struct ovl_fh, fb) +#define OVL_FH_LEN(fh) (OVL_FH_WIRE_OFFSET + (fh)->fb.len) +#define OVL_FH_FID_OFFSET (OVL_FH_WIRE_OFFSET + \ + offsetof(struct ovl_fb, fid)) + static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) { int err = vfs_rmdir(dir, dentry); @@ -302,7 +318,13 @@ static inline void ovl_inode_unlock(struct inode *inode) /* namei.c */ -int ovl_check_fh_len(struct ovl_fh *fh, int fh_len); +int ovl_check_fb_len(struct ovl_fb *fb, int fb_len); + +static inline int ovl_check_fh_len(struct ovl_fh *fh, int fh_len) +{ + return ovl_check_fb_len(&fh->fb, fh_len - OVL_FH_WIRE_OFFSET); +} + struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt, bool connected); int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index a8279280e88d..28348c44ea5b 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -22,6 +22,8 @@ struct ovl_config { struct ovl_sb { struct super_block *sb; dev_t pseudo_dev; + /* Unusable (conflicting) uuid */ + bool bad_uuid; }; struct ovl_layer { diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index afbcb116a7f1..7621ff176d15 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1255,7 +1255,7 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) { unsigned int i; - if (!ofs->config.nfs_export && !(ofs->config.index && ofs->upper_mnt)) + if (!ofs->config.nfs_export && !ofs->upper_mnt) return true; for (i = 0; i < ofs->numlowerfs; i++) { @@ -1263,9 +1263,13 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) * We use uuid to associate an overlay lower file handle with a * lower layer, so we can accept lower fs with null uuid as long * as all lower layers with null uuid are on the same fs. + * if we detect multiple lower fs with the same uuid, we + * disable lower file handle decoding on all of them. */ - if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) + if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) { + ofs->lower_fs[i].bad_uuid = true; return false; + } } return true; } @@ -1277,6 +1281,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) unsigned int i; dev_t dev; int err; + bool bad_uuid = false; /* fsid 0 is reserved for upper fs even with non upper overlay */ if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb) @@ -1288,11 +1293,15 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) } if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) { - ofs->config.index = false; - ofs->config.nfs_export = false; - pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", - uuid_is_null(&sb->s_uuid) ? "null" : "conflicting", - path->dentry); + bad_uuid = true; + if (ofs->config.index || ofs->config.nfs_export) { + ofs->config.index = false; + ofs->config.nfs_export = false; + pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n", + uuid_is_null(&sb->s_uuid) ? "null" : + "conflicting", + path->dentry); + } } err = get_anon_bdev(&dev); @@ -1303,6 +1312,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path) ofs->lower_fs[ofs->numlowerfs].sb = sb; ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev; + ofs->lower_fs[ofs->numlowerfs].bad_uuid = bad_uuid; ofs->numlowerfs++; return ofs->numlowerfs; diff --git a/fs/pipe.c b/fs/pipe.c index 648ce440ca85..57502c3c0fba 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -270,22 +270,41 @@ static bool pipe_buf_can_merge(struct pipe_buffer *buf) return buf->ops == &anon_pipe_buf_ops; } +/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ +static inline bool pipe_readable(const struct pipe_inode_info *pipe) +{ + unsigned int head = READ_ONCE(pipe->head); + unsigned int tail = READ_ONCE(pipe->tail); + unsigned int writers = READ_ONCE(pipe->writers); + + return !pipe_empty(head, tail) || !writers; +} + static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; - int do_wakeup; + bool was_full; ssize_t ret; /* Null read succeeds. */ if (unlikely(total_len == 0)) return 0; - do_wakeup = 0; ret = 0; __pipe_lock(pipe); + + /* + * We only wake up writers if the pipe was full when we started + * reading in order to avoid unnecessary wakeups. + * + * But when we do wake up writers, we do so using a sync wakeup + * (WF_SYNC), because we want them to get going and generate more + * data for us. + */ + was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; @@ -324,19 +343,11 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) } if (!buf->len) { - bool wake; pipe_buf_release(pipe, buf); spin_lock_irq(&pipe->wait.lock); tail++; pipe->tail = tail; - do_wakeup = 1; - wake = head - (tail - 1) == pipe->max_usage / 2; - if (wake) - wake_up_locked_poll( - &pipe->wait, EPOLLOUT | EPOLLWRNORM); spin_unlock_irq(&pipe->wait.lock); - if (wake) - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } total_len -= chars; if (!total_len) @@ -347,31 +358,52 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (!pipe->writers) break; - if (!pipe->waiting_writers) { - /* syscall merging: Usually we must not sleep - * if O_NONBLOCK is set, or if we got some data. - * But if a writer sleeps in kernel space, then - * we can wait for that data without violating POSIX. - */ - if (ret) - break; - if (filp->f_flags & O_NONBLOCK) { - ret = -EAGAIN; - break; - } - } - if (signal_pending(current)) { - if (!ret) - ret = -ERESTARTSYS; + if (ret) + break; + if (filp->f_flags & O_NONBLOCK) { + ret = -EAGAIN; break; } - pipe_wait(pipe); + __pipe_unlock(pipe); + + /* + * We only get here if we didn't actually read anything. + * + * However, we could have seen (and removed) a zero-sized + * pipe buffer, and might have made space in the buffers + * that way. + * + * You can't make zero-sized pipe buffers by doing an empty + * write (not even in packet mode), but they can happen if + * the writer gets an EFAULT when trying to fill a buffer + * that already got allocated and inserted in the buffer + * array. + * + * So we still need to wake up any pending writers in the + * _very_ unlikely case that the pipe was full, but we got + * no data. + */ + if (unlikely(was_full)) { + wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + } + + /* + * But because we didn't read anything, at this point we can + * just return directly with -ERESTARTSYS if we're interrupted, + * since we've done any required wakeups and there's no need + * to mark anything accessed. And we've dropped the lock. + */ + if (wait_event_interruptible(pipe->wait, pipe_readable(pipe)) < 0) + return -ERESTARTSYS; + + __pipe_lock(pipe); + was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); } __pipe_unlock(pipe); - /* Signal writers asynchronously that there is more room. */ - if (do_wakeup) { - wake_up_interruptible_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM); + if (was_full) { + wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) @@ -384,16 +416,27 @@ static inline int is_packetized(struct file *file) return (file->f_flags & O_DIRECT) != 0; } +/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ +static inline bool pipe_writable(const struct pipe_inode_info *pipe) +{ + unsigned int head = READ_ONCE(pipe->head); + unsigned int tail = READ_ONCE(pipe->tail); + unsigned int max_usage = READ_ONCE(pipe->max_usage); + + return !pipe_full(head, tail, max_usage) || + !READ_ONCE(pipe->readers); +} + static ssize_t pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; - unsigned int head, max_usage, mask; + unsigned int head; ssize_t ret = 0; - int do_wakeup = 0; size_t total_len = iov_iter_count(from); ssize_t chars; + bool was_empty = false; /* Null write succeeds. */ if (unlikely(total_len == 0)) @@ -407,13 +450,22 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) goto out; } + /* + * Only wake up if the pipe started out empty, since + * otherwise there should be no readers waiting. + * + * If it wasn't empty we try to merge new data into + * the last buffer. + * + * That naturally merges small writes, but it also + * page-aligs the rest of the writes for large writes + * spanning multiple pages. + */ head = pipe->head; - max_usage = pipe->max_usage; - mask = pipe->ring_size - 1; - - /* We try to merge small writes */ - chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ - if (!pipe_empty(head, pipe->tail) && chars != 0) { + was_empty = pipe_empty(head, pipe->tail); + chars = total_len & (PAGE_SIZE-1); + if (chars && !was_empty) { + unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; int offset = buf->offset + buf->len; @@ -427,7 +479,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) ret = -EFAULT; goto out; } - do_wakeup = 1; + buf->len += ret; if (!iov_iter_count(from)) goto out; @@ -443,7 +495,8 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) } head = pipe->head; - if (!pipe_full(head, pipe->tail, max_usage)) { + if (!pipe_full(head, pipe->tail, pipe->max_usage)) { + unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf = &pipe->bufs[head & mask]; struct page *page = pipe->tmp_page; int copied; @@ -465,23 +518,13 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) spin_lock_irq(&pipe->wait.lock); head = pipe->head; - if (pipe_full(head, pipe->tail, max_usage)) { + if (pipe_full(head, pipe->tail, pipe->max_usage)) { spin_unlock_irq(&pipe->wait.lock); continue; } pipe->head = head + 1; - - /* Always wake up, even if the copy fails. Otherwise - * we lock up (O_NONBLOCK-)readers that sleep due to - * syscall merging. - * FIXME! Is this really true? - */ - wake_up_locked_poll( - &pipe->wait, EPOLLIN | EPOLLRDNORM); - spin_unlock_irq(&pipe->wait.lock); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); /* Insert it into the buffer array */ buf = &pipe->bufs[head & mask]; @@ -510,7 +553,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) break; } - if (!pipe_full(head, pipe->tail, max_usage)) + if (!pipe_full(head, pipe->tail, pipe->max_usage)) continue; /* Wait for buffer space to become available. */ @@ -524,14 +567,36 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) ret = -ERESTARTSYS; break; } - pipe->waiting_writers++; - pipe_wait(pipe); - pipe->waiting_writers--; + + /* + * We're going to release the pipe lock and wait for more + * space. We wake up any readers if necessary, and then + * after waiting we need to re-check whether the pipe + * become empty while we dropped the lock. + */ + __pipe_unlock(pipe); + if (was_empty) { + wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } + wait_event_interruptible(pipe->wait, pipe_writable(pipe)); + __pipe_lock(pipe); + was_empty = pipe_empty(pipe->head, pipe->tail); } out: __pipe_unlock(pipe); - if (do_wakeup) { - wake_up_interruptible_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM); + + /* + * If we do do a wakeup event, we do a 'sync' wakeup, because we + * want the reader to start processing things asap, rather than + * leave the data pending. + * + * This is particularly important for small writes, because of + * how (for example) the GNU make jobserver uses small writes to + * wake up pending jobs + */ + if (was_empty) { + wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { @@ -574,14 +639,24 @@ pipe_poll(struct file *filp, poll_table *wait) { __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; - unsigned int head = READ_ONCE(pipe->head); - unsigned int tail = READ_ONCE(pipe->tail); + unsigned int head, tail; + /* + * Reading only -- no need for acquiring the semaphore. + * + * But because this is racy, the code has to add the + * entry to the poll table _first_ .. + */ poll_wait(filp, &pipe->wait, wait); - BUG_ON(pipe_occupancy(head, tail) > pipe->ring_size); + /* + * .. and only then can you do the racy tests. That way, + * if something changes and you got it wrong, the poll + * table entry will wake you up and fix it. + */ + head = READ_ONCE(pipe->head); + tail = READ_ONCE(pipe->tail); - /* Reading only -- no need for acquiring the semaphore. */ mask = 0; if (filp->f_mode & FMODE_READ) { if (!pipe_empty(head, tail)) @@ -1176,6 +1251,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) pipe->max_usage = nr_slots; pipe->tail = tail; pipe->head = head; + wake_up_interruptible_all(&pipe->wait); return pipe->max_usage * PAGE_SIZE; out_revert_acct: diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index cb5629bd5fff..733881a6387b 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -42,8 +42,8 @@ config PROC_VMCORE bool "/proc/vmcore support" depends on PROC_FS && CRASH_DUMP default y - help - Exports the dump image of crashed kernel in ELF format. + help + Exports the dump image of crashed kernel in ELF format. config PROC_VMCORE_DEVICE_DUMP bool "Device Hardware/Firmware Log Collection" @@ -72,7 +72,7 @@ config PROC_SYSCTL a recompile of the kernel or reboot of the system. The primary interface is through /proc/sys. If you say Y here a tree of modifiable sysctl entries will be generated beneath the - /proc/sys directory. They are explained in the files + /proc/sys directory. They are explained in the files in <file:Documentation/admin-guide/sysctl/>. Note that enabling this option will enlarge the kernel by at least 8 KB. @@ -88,7 +88,7 @@ config PROC_PAGE_MONITOR Various /proc files exist to monitor process memory utilization: /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, /proc/kpagecount, and /proc/kpageflags. Disabling these - interfaces will reduce the size of the kernel by approximately 4kb. + interfaces will reduce the size of the kernel by approximately 4kb. config PROC_CHILDREN bool "Include /proc/<pid>/task/<tid>/children file" diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 64e9ee1b129e..074e9585c699 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -138,8 +138,12 @@ static int proc_getattr(const struct path *path, struct kstat *stat, { struct inode *inode = d_inode(path->dentry); struct proc_dir_entry *de = PDE(inode); - if (de && de->nlink) - set_nlink(inode, de->nlink); + if (de) { + nlink_t nlink = READ_ONCE(de->nlink); + if (nlink > 0) { + set_nlink(inode, nlink); + } + } generic_fillattr(inode, stat); return 0; @@ -159,7 +163,6 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, { const char *cp = name, *next; struct proc_dir_entry *de; - unsigned int len; de = *ret; if (!de) @@ -170,13 +173,12 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, if (!next) break; - len = next - cp; - de = pde_subdir_find(de, cp, len); + de = pde_subdir_find(de, cp, next - cp); if (!de) { WARN(1, "name '%s'\n", name); return -ENOENT; } - cp += len + 1; + cp = next + 1; } *residual = cp; *ret = de; @@ -362,6 +364,7 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, write_unlock(&proc_subdir_lock); goto out_free_inum; } + dir->nlink++; write_unlock(&proc_subdir_lock); return dp; @@ -472,10 +475,7 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, ent->data = data; ent->proc_fops = &proc_dir_operations; ent->proc_iops = &proc_dir_inode_operations; - parent->nlink++; ent = proc_register(parent, ent); - if (!ent) - parent->nlink--; } return ent; } @@ -505,10 +505,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->data = NULL; ent->proc_fops = NULL; ent->proc_iops = NULL; - parent->nlink++; ent = proc_register(parent, ent); - if (!ent) - parent->nlink--; } return ent; } @@ -666,8 +663,12 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) len = strlen(fn); de = pde_subdir_find(parent, fn, len); - if (de) + if (de) { rb_erase(&de->subdir_node, &parent->subdir); + if (S_ISDIR(de->mode)) { + parent->nlink--; + } + } write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); @@ -676,9 +677,6 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) proc_entry_rundown(de); - if (S_ISDIR(de->mode)) - parent->nlink--; - de->nlink = 0; WARN(pde_subdir_first(de), "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n", __func__, de->parent->name, de->name, pde_subdir_first(de)->name); @@ -714,13 +712,12 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) de = next; continue; } - write_unlock(&proc_subdir_lock); - - proc_entry_rundown(de); next = de->parent; if (S_ISDIR(de->mode)) next->nlink--; - de->nlink = 0; + write_unlock(&proc_subdir_lock); + + proc_entry_rundown(de); if (de == root) break; pde_put(de); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index cd0c8d5ce9a1..0f3b557c9b77 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -197,8 +197,8 @@ extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, lof * inode.c */ struct pde_opener { - struct file *file; struct list_head lh; + struct file *file; bool closing; struct completion *c; } __randomize_layout; diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 37bdbec5b402..fd931d3e77be 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -134,7 +134,7 @@ static int show_stat(struct seq_file *p, void *v) softirq += cpustat[CPUTIME_SOFTIRQ]; steal += cpustat[CPUTIME_STEAL]; guest += cpustat[CPUTIME_GUEST]; - guest_nice += cpustat[CPUTIME_USER]; + guest_nice += cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); @@ -175,7 +175,7 @@ static int show_stat(struct seq_file *p, void *v) softirq = cpustat[CPUTIME_SOFTIRQ]; steal = cpustat[CPUTIME_STEAL]; guest = cpustat[CPUTIME_GUEST]; - guest_nice = cpustat[CPUTIME_USER]; + guest_nice = cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 4639d53e96a3..b6a4f692d345 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -984,6 +984,7 @@ static int add_dquot_ref(struct super_block *sb, int type) * later. */ old_inode = inode; + cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); @@ -2487,21 +2488,15 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - dentry = lookup_one_len_unlocked(qf_name, sb->s_root, strlen(qf_name)); + dentry = lookup_positive_unlocked(qf_name, sb->s_root, strlen(qf_name)); if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (d_really_is_negative(dentry)) { - error = -ENOENT; - goto out; - } - error = security_quota_on(dentry); if (!error) error = dquot_load_quota_inode(d_inode(dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); -out: dput(dentry); return error; } diff --git a/fs/splice.c b/fs/splice.c index f2400ce7d528..3009652a41c8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -495,7 +495,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des unsigned int mask = pipe->ring_size - 1; int ret; - while (!pipe_empty(tail, head)) { + while (!pipe_empty(head, tail)) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; sd->len = buf->len; @@ -559,7 +559,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des if (!pipe->writers) return 0; - if (!pipe->waiting_writers && sd->num_spliced) + if (sd->num_spliced) return 0; if (sd->flags & SPLICE_F_NONBLOCK) @@ -711,9 +711,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, splice_from_pipe_begin(&sd); while (sd.total_len) { struct iov_iter from; - unsigned int head = pipe->head; - unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; + unsigned int head, tail, mask; size_t left; int n; @@ -732,6 +730,10 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, } } + head = pipe->head; + tail = pipe->tail; + mask = pipe->ring_size - 1; + /* build the vector */ left = sd.total_len; for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) { @@ -1096,9 +1098,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) return -EAGAIN; if (signal_pending(current)) return -ERESTARTSYS; - pipe->waiting_writers++; pipe_wait(pipe); - pipe->waiting_writers--; } } @@ -1480,11 +1480,9 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) } if (!pipe->writers) break; - if (!pipe->waiting_writers) { - if (flags & SPLICE_F_NONBLOCK) { - ret = -EAGAIN; - break; - } + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; } pipe_wait(pipe); } @@ -1525,9 +1523,7 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) ret = -ERESTARTSYS; break; } - pipe->waiting_writers++; pipe_wait(pipe); - pipe->waiting_writers--; } pipe_unlock(pipe); @@ -1749,13 +1745,6 @@ static int link_pipe(struct pipe_inode_info *ipipe, i_tail++; } while (len); - /* - * return EAGAIN if we have the potential of some data in the - * future, otherwise just return 0 - */ - if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) - ret = -EAGAIN; - pipe_unlock(ipipe); pipe_unlock(opipe); diff --git a/fs/super.c b/fs/super.c index cfadab2cbf35..cd352530eca9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -448,10 +448,12 @@ void generic_shutdown_super(struct super_block *sb) sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; - fsnotify_sb_delete(sb); cgroup_writeback_umount(); + /* evict all inodes with zero refcount */ evict_inodes(sb); + /* only nonzero refcount inodes can have marks */ + fsnotify_sb_delete(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); diff --git a/fs/verity/enable.c b/fs/verity/enable.c index eabc6ac19906..b79e3fd19d11 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -315,7 +315,7 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) if (arg.block_size != PAGE_SIZE) return -EINVAL; - if (arg.salt_size > FIELD_SIZEOF(struct fsverity_descriptor, salt)) + if (arg.salt_size > sizeof_field(struct fsverity_descriptor, salt)) return -EMSGSIZE; if (arg.sig_size > FS_VERITY_MAX_SIGNATURE_SIZE) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index c284e10af491..fc93fd88ec89 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2248,24 +2248,32 @@ xfs_alloc_longest_free_extent( return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } +/* + * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, + * return the largest possible minimum length. + */ unsigned int xfs_alloc_min_freelist( struct xfs_mount *mp, struct xfs_perag *pag) { + /* AG btrees have at least 1 level. */ + static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; + const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; unsigned int min_free; + ASSERT(mp->m_ag_maxlevels > 0); + /* space needed by-bno freespace btree */ - min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1, + min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, mp->m_ag_maxlevels); /* space needed by-size freespace btree */ - min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, + min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, mp->m_ag_maxlevels); /* space needed reverse mapping used space btree */ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - min_free += min_t(unsigned int, - pag->pagf_levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels); + min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, + mp->m_rmap_maxlevels); return min_free; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 4acc6e37c31d..4c2e046fbfad 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4561,7 +4561,7 @@ xfs_bmapi_convert_delalloc( struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); struct xfs_bmalloca bma = { NULL }; - u16 flags = 0; + uint16_t flags = 0; struct xfs_trans *tp; int error; @@ -5404,7 +5404,7 @@ __xfs_bunmapi( * Make sure we don't touch multiple AGF headers out of order * in a single transaction, as that could cause AB-BA deadlocks. */ - if (!wasdel) { + if (!wasdel && !isrt) { agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); if (prev_agno != NULLAGNUMBER && prev_agno > agno) break; @@ -5480,16 +5480,17 @@ __xfs_bunmapi( } div_u64_rem(del.br_startblock, mp->m_sb.sb_rextsize, &mod); if (mod) { + xfs_extlen_t off = mp->m_sb.sb_rextsize - mod; + /* * Realtime extent is lined up at the end but not * at the front. We'll get rid of full extents if * we can. */ - mod = mp->m_sb.sb_rextsize - mod; - if (del.br_blockcount > mod) { - del.br_blockcount -= mod; - del.br_startoff += mod; - del.br_startblock += mod; + if (del.br_blockcount > off) { + del.br_blockcount -= off; + del.br_startoff += off; + del.br_startblock += off; } else if (del.br_startoff == start && (del.br_state == XFS_EXT_UNWRITTEN || tp->t_blk_res == 0)) { @@ -5507,6 +5508,7 @@ __xfs_bunmapi( continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { struct xfs_bmbt_irec prev; + xfs_fileoff_t unwrite_start; /* * This one is already unwritten. @@ -5520,12 +5522,13 @@ __xfs_bunmapi( ASSERT(!isnullstartblock(prev.br_startblock)); ASSERT(del.br_startblock == prev.br_startblock + prev.br_blockcount); - if (prev.br_startoff < start) { - mod = start - prev.br_startoff; - prev.br_blockcount -= mod; - prev.br_startblock += mod; - prev.br_startoff = start; - } + unwrite_start = max3(start, + del.br_startoff - mod, + prev.br_startoff); + mod = unwrite_start - prev.br_startoff; + prev.br_startoff = unwrite_start; + prev.br_startblock += mod; + prev.br_blockcount -= mod; prev.br_state = XFS_EXT_UNWRITTEN; error = xfs_bmap_add_extent_unwritten_real(tp, ip, whichfork, &icur, &cur, @@ -5969,8 +5972,7 @@ xfs_bmap_insert_extents( goto del_cursor; } - if (XFS_IS_CORRUPT(mp, - stop_fsb >= got.br_startoff + got.br_blockcount)) { + if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) { error = -EFSCORRUPTED; goto del_cursor; } diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 0aa87cbde49e..dd6fcaaea318 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -724,3 +724,24 @@ xfs_dir2_namecheck( /* There shouldn't be any slashes or nulls here */ return !memchr(name, '/', length) && !memchr(name, 0, length); } + +xfs_dahash_t +xfs_dir2_hashname( + struct xfs_mount *mp, + struct xfs_name *name) +{ + if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb))) + return xfs_ascii_ci_hashname(name); + return xfs_da_hashname(name->name, name->len); +} + +enum xfs_dacmp +xfs_dir2_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb))) + return xfs_ascii_ci_compname(args, name, len); + return xfs_da_compname(args, name, len); +} diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index c031c53d0f0d..01ee0b926572 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -175,6 +175,12 @@ extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); extern int xfs_dir2_sf_removename(struct xfs_da_args *args); extern int xfs_dir2_sf_replace(struct xfs_da_args *args); extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip); +int xfs_dir2_sf_entsize(struct xfs_mount *mp, + struct xfs_dir2_sf_hdr *hdr, int len); +void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, xfs_ino_t ino); +void xfs_dir2_sf_put_ftype(struct xfs_mount *mp, + struct xfs_dir2_sf_entry *sfep, uint8_t ftype); /* xfs_dir2_readdir.c */ extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp, @@ -194,25 +200,8 @@ xfs_dir2_data_entsize( return round_up(len, XFS_DIR2_DATA_ALIGN); } -static inline xfs_dahash_t -xfs_dir2_hashname( - struct xfs_mount *mp, - struct xfs_name *name) -{ - if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb))) - return xfs_ascii_ci_hashname(name); - return xfs_da_hashname(name->name, name->len); -} - -static inline enum xfs_dacmp -xfs_dir2_compname( - struct xfs_da_args *args, - const unsigned char *name, - int len) -{ - if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb))) - return xfs_ascii_ci_compname(args, name, len); - return xfs_da_compname(args, name, len); -} +xfs_dahash_t xfs_dir2_hashname(struct xfs_mount *mp, struct xfs_name *name); +enum xfs_dacmp xfs_dir2_compname(struct xfs_da_args *args, + const unsigned char *name, int len); #endif /* __XFS_DIR2_PRIV_H__ */ diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 8b94d33d232f..7b7f6fb2ea3b 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -37,7 +37,7 @@ static void xfs_dir2_sf_check(xfs_da_args_t *args); static void xfs_dir2_sf_toino4(xfs_da_args_t *args); static void xfs_dir2_sf_toino8(xfs_da_args_t *args); -static int +int xfs_dir2_sf_entsize( struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, @@ -84,7 +84,7 @@ xfs_dir2_sf_get_ino( return get_unaligned_be64(from) & XFS_MAXINUMBER; } -static void +void xfs_dir2_sf_put_ino( struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr, @@ -145,7 +145,7 @@ xfs_dir2_sf_get_ftype( return XFS_DIR3_FT_UNKNOWN; } -static void +void xfs_dir2_sf_put_ftype( struct xfs_mount *mp, struct xfs_dir2_sf_entry *sfep, diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 988cde7744e6..5b759af4d165 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2909,3 +2909,67 @@ xfs_ialloc_setup_geometry( else igeo->ialloc_align = 0; } + +/* Compute the location of the root directory inode that is laid out by mkfs. */ +xfs_ino_t +xfs_ialloc_calc_rootino( + struct xfs_mount *mp, + int sunit) +{ + struct xfs_ino_geometry *igeo = M_IGEO(mp); + xfs_agblock_t first_bno; + + /* + * Pre-calculate the geometry of AG 0. We know what it looks like + * because libxfs knows how to create allocation groups now. + * + * first_bno is the first block in which mkfs could possibly have + * allocated the root directory inode, once we factor in the metadata + * that mkfs formats before it. Namely, the four AG headers... + */ + first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize); + + /* ...the two free space btree roots... */ + first_bno += 2; + + /* ...the inode btree root... */ + first_bno += 1; + + /* ...the initial AGFL... */ + first_bno += xfs_alloc_min_freelist(mp, NULL); + + /* ...the free inode btree root... */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + first_bno++; + + /* ...the reverse mapping btree root... */ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + first_bno++; + + /* ...the reference count btree... */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + first_bno++; + + /* + * ...and the log, if it is allocated in the first allocation group. + * + * This can happen with filesystems that only have a single + * allocation group, or very odd geometries created by old mkfs + * versions on very small filesystems. + */ + if (mp->m_sb.sb_logstart && + XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0) + first_bno += mp->m_sb.sb_logblocks; + + /* + * Now round first_bno up to whatever allocation alignment is given + * by the filesystem or was passed in. + */ + if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0) + first_bno = roundup(first_bno, sunit); + else if (xfs_sb_version_hasalign(&mp->m_sb) && + mp->m_sb.sb_inoalignmt > 1) + first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt); + + return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno)); +} diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 323592d563d5..72b3468b97b1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -152,5 +152,6 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask, int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_setup_geometry(struct xfs_mount *mp); +xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index c55cd9a3dec9..7a9c04920505 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -197,6 +197,24 @@ xfs_calc_inode_chunk_res( } /* + * Per-extent log reservation for the btree changes involved in freeing or + * allocating a realtime extent. We have to be able to log as many rtbitmap + * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents, + * as well as the realtime summary block. + */ +static unsigned int +xfs_rtalloc_log_count( + struct xfs_mount *mp, + unsigned int num_ops) +{ + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + unsigned int rtbmp_bytes; + + rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY; + return (howmany(rtbmp_bytes, blksz) + 1) * num_ops; +} + +/* * Various log reservation values. * * These are based on the size of the file system block because that is what @@ -218,13 +236,21 @@ xfs_calc_inode_chunk_res( /* * In a write transaction we can allocate a maximum of 2 - * extents. This gives: + * extents. This gives (t1): * the inode getting the new extents: inode size * the inode's bmap btree: max depth * block size * the agfs of the ags from which the extents are allocated: 2 * sector * the superblock free block counter: sector size * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - * And the bmap_finish transaction can free bmap blocks in a join: + * Or, if we're writing to a realtime file (t2): + * the inode getting the new extents: inode size + * the inode's bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 1 block + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join (t3): * the agfs of the ags containing the blocks: 2 * sector size * the agfls of the ags containing the blocks: 2 * sector size * the super block free block counter: sector size @@ -234,40 +260,72 @@ STATIC uint xfs_calc_write_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 1) + + unsigned int t1, t2, t3; + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + t1 = xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + + if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + t2 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), - XFS_FSB_TO_B(mp, 1)) + + blksz) + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), - XFS_FSB_TO_B(mp, 1)))); + xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz); + } else { + t2 = 0; + } + + t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); } /* - * In truncating a file we free up to two extents at once. We can modify: + * In truncating a file we free up to two extents at once. We can modify (t1): * the inode being truncated: inode size * the inode's bmap btree: (max depth + 1) * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: + * And the bmap_finish transaction can free the blocks and bmap blocks (t2): * the agf for each of the ags: 4 * sector size * the agfl for each of the ags: 4 * sector size * the super block to reflect the freed blocks: sector size * worst case split in allocation btrees per extent assuming 4 extents: * 4 exts * 2 trees * (2 * max depth - 1) * block size + * Or, if it's a realtime file (t3): + * the agf for each of the ags: 2 * sector size + * the agfl for each of the ags: 2 * sector size + * the super block to reflect the freed blocks: sector size + * the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 2 exts * 1 block + * worst case split in allocation btrees per extent assuming 2 extents: + * 2 exts * 2 trees * (2 * max depth - 1) * block size */ STATIC uint xfs_calc_itruncate_reservation( struct xfs_mount *mp) { - return XFS_DQUOT_LOGRES(mp) + - max((xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, - XFS_FSB_TO_B(mp, 1))), - (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), - XFS_FSB_TO_B(mp, 1)))); + unsigned int t1, t2, t3; + unsigned int blksz = XFS_FSB_TO_B(mp, 1); + + t1 = xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); + + t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz); + + if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) + + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); + } else { + t3 = 0; + } + + return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3); } /* diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 2efd78a9719e..e62fb5216341 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -992,6 +992,7 @@ xfs_prepare_shift( struct xfs_inode *ip, loff_t offset) { + struct xfs_mount *mp = ip->i_mount; int error; /* @@ -1005,6 +1006,17 @@ xfs_prepare_shift( } /* + * Shift operations must stabilize the start block offset boundary along + * with the full range of the operation. If we don't, a COW writeback + * completion could race with an insert, front merge with the start + * extent (after split) during the shift and corrupt the file. Start + * with the block just prior to the start to stabilize the boundary. + */ + offset = round_down(offset, 1 << mp->m_sb.sb_blocklog); + if (offset) + offset -= (1 << mp->m_sb.sb_blocklog); + + /* * Writeback and invalidate cache for the remainder of the file as we're * about to shift down every extent from offset to EOF. */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 3458a1264a3f..3984779e5911 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -956,7 +956,7 @@ xfs_buf_item_relse( struct xfs_buf_log_item *bip = bp->b_log_item; trace_xfs_buf_item_relse(bp, _RET_IP_); - ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); bp->b_log_item = NULL; if (list_empty(&bp->b_li_list)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 6a147c63a8a6..f6006d94a581 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1542,6 +1542,8 @@ out_free_iclog: prev_iclog = iclog->ic_next; kmem_free(iclog->ic_data); kmem_free(iclog); + if (prev_iclog == log->l_iclog) + break; } out_free_log: kmem_free(log); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fca65109cf24..56efe140c923 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -31,7 +31,7 @@ #include "xfs_reflink.h" #include "xfs_extent_busy.h" #include "xfs_health.h" - +#include "xfs_trace.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -360,66 +360,119 @@ release_buf: } /* - * Update alignment values based on mount options and sb values + * If the sunit/swidth change would move the precomputed root inode value, we + * must reject the ondisk change because repair will stumble over that. + * However, we allow the mount to proceed because we never rejected this + * combination before. Returns true to update the sb, false otherwise. + */ +static inline int +xfs_check_new_dalign( + struct xfs_mount *mp, + int new_dalign, + bool *update_sb) +{ + struct xfs_sb *sbp = &mp->m_sb; + xfs_ino_t calc_ino; + + calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign); + trace_xfs_check_new_dalign(mp, new_dalign, calc_ino); + + if (sbp->sb_rootino == calc_ino) { + *update_sb = true; + return 0; + } + + xfs_warn(mp, +"Cannot change stripe alignment; would require moving root inode."); + + /* + * XXX: Next time we add a new incompat feature, this should start + * returning -EINVAL to fail the mount. Until then, spit out a warning + * that we're ignoring the administrator's instructions. + */ + xfs_warn(mp, "Skipping superblock stripe alignment update."); + *update_sb = false; + return 0; +} + +/* + * If we were provided with new sunit/swidth values as mount options, make sure + * that they pass basic alignment and superblock feature checks, and convert + * them into the same units (FSB) that everything else expects. This step + * /must/ be done before computing the inode geometry. */ STATIC int -xfs_update_alignment(xfs_mount_t *mp) +xfs_validate_new_dalign( + struct xfs_mount *mp) { - xfs_sb_t *sbp = &(mp->m_sb); + if (mp->m_dalign == 0) + return 0; - if (mp->m_dalign) { + /* + * If stripe unit and stripe width are not multiples + * of the fs blocksize turn off alignment. + */ + if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || + (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. blocksize(%d)", + mp->m_sb.sb_blocksize); + return -EINVAL; + } else { /* - * If stripe unit and stripe width are not multiples - * of the fs blocksize turn off alignment. + * Convert the stripe unit and width to FSBs. */ - if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || - (BBTOB(mp->m_swidth) & mp->m_blockmask)) { + mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); + if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) { xfs_warn(mp, - "alignment check failed: sunit/swidth vs. blocksize(%d)", - sbp->sb_blocksize); + "alignment check failed: sunit/swidth vs. agsize(%d)", + mp->m_sb.sb_agblocks); return -EINVAL; - } else { - /* - * Convert the stripe unit and width to FSBs. - */ - mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); - if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { - xfs_warn(mp, - "alignment check failed: sunit/swidth vs. agsize(%d)", - sbp->sb_agblocks); - return -EINVAL; - } else if (mp->m_dalign) { - mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); - } else { - xfs_warn(mp, - "alignment check failed: sunit(%d) less than bsize(%d)", - mp->m_dalign, sbp->sb_blocksize); - return -EINVAL; - } - } - - /* - * Update superblock with new values - * and log changes - */ - if (xfs_sb_version_hasdalign(sbp)) { - if (sbp->sb_unit != mp->m_dalign) { - sbp->sb_unit = mp->m_dalign; - mp->m_update_sb = true; - } - if (sbp->sb_width != mp->m_swidth) { - sbp->sb_width = mp->m_swidth; - mp->m_update_sb = true; - } + } else if (mp->m_dalign) { + mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); } else { xfs_warn(mp, - "cannot change alignment: superblock does not support data alignment"); + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, mp->m_sb.sb_blocksize); return -EINVAL; } + } + + if (!xfs_sb_version_hasdalign(&mp->m_sb)) { + xfs_warn(mp, +"cannot change alignment: superblock does not support data alignment"); + return -EINVAL; + } + + return 0; +} + +/* Update alignment values based on mount options and sb values. */ +STATIC int +xfs_update_alignment( + struct xfs_mount *mp) +{ + struct xfs_sb *sbp = &mp->m_sb; + + if (mp->m_dalign) { + bool update_sb; + int error; + + if (sbp->sb_unit == mp->m_dalign && + sbp->sb_width == mp->m_swidth) + return 0; + + error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb); + if (error || !update_sb) + return error; + + sbp->sb_unit = mp->m_dalign; + sbp->sb_width = mp->m_swidth; + mp->m_update_sb = true; } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && xfs_sb_version_hasdalign(&mp->m_sb)) { - mp->m_dalign = sbp->sb_unit; - mp->m_swidth = sbp->sb_width; + mp->m_dalign = sbp->sb_unit; + mp->m_swidth = sbp->sb_width; } return 0; @@ -648,12 +701,12 @@ xfs_mountfs( } /* - * Check if sb_agblocks is aligned at stripe boundary - * If sb_agblocks is NOT aligned turn off m_dalign since - * allocator alignment is within an ag, therefore ag has - * to be aligned at stripe boundary. + * If we were given new sunit/swidth options, do some basic validation + * checks and convert the incore dalign and swidth values to the + * same units (FSB) that everything else uses. This /must/ happen + * before computing the inode geometry. */ - error = xfs_update_alignment(mp); + error = xfs_validate_new_dalign(mp); if (error) goto out; @@ -664,6 +717,17 @@ xfs_mountfs( xfs_rmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); + /* + * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks + * is NOT aligned turn off m_dalign since allocator alignment is within + * an ag, therefore ag has to be aligned at stripe boundary. Note that + * we must compute the free space and rmap btree geometry before doing + * this. + */ + error = xfs_update_alignment(mp); + if (error) + goto out; + /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index c13bb3655e48..a86be7f807ee 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3573,6 +3573,27 @@ DEFINE_KMEM_EVENT(kmem_alloc_large); DEFINE_KMEM_EVENT(kmem_realloc); DEFINE_KMEM_EVENT(kmem_zone_alloc); +TRACE_EVENT(xfs_check_new_dalign, + TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), + TP_ARGS(mp, new_dalign, calc_rootino), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, new_dalign) + __field(xfs_ino_t, sb_rootino) + __field(xfs_ino_t, calc_rootino) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->new_dalign = new_dalign; + __entry->sb_rootino = mp->m_sb.sb_rootino; + __entry->calc_rootino = calc_rootino; + ), + TP_printk("dev %d:%d new_dalign %d sb_rootino %llu calc_rootino %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->new_dalign, __entry->sb_rootino, + __entry->calc_rootino) +) + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH |