diff options
Diffstat (limited to 'fs')
171 files changed, 7306 insertions, 3457 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 5a0db6dec8d1..aaee1e6584e6 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -40,6 +40,9 @@ */ #define P9_LOCK_TIMEOUT (30*HZ) +/* flags for v9fs_stat2inode() & v9fs_stat2inode_dotl() */ +#define V9FS_STAT2INODE_KEEP_ISIZE 1 + extern struct file_system_type v9fs_fs_type; extern const struct address_space_operations v9fs_addr_operations; extern const struct file_operations v9fs_file_operations; @@ -61,8 +64,10 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); -void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); -void v9fs_stat2inode_dotl(struct p9_stat_dotl *, struct inode *); +void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, + struct super_block *sb, unsigned int flags); +void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, + unsigned int flags); int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); @@ -83,4 +88,18 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode) } int v9fs_open_to_dotl_flags(int flags); + +static inline void v9fs_i_size_write(struct inode *inode, loff_t i_size) +{ + /* + * 32-bit need the lock, concurrent updates could break the + * sequences and make i_size_read() loop forever. + * 64-bit updates are atomic and can skip the locking. + */ + if (sizeof(i_size) > sizeof(long)) + spin_lock(&inode->i_lock); + i_size_write(inode, i_size); + if (sizeof(i_size) > sizeof(long)) + spin_unlock(&inode->i_lock); +} #endif diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index a25efa782fcc..9a1125305d84 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -446,7 +446,11 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) i_size = i_size_read(inode); if (iocb->ki_pos > i_size) { inode_add_bytes(inode, iocb->ki_pos - i_size); - i_size_write(inode, iocb->ki_pos); + /* + * Need to serialize against i_size_write() in + * v9fs_stat2inode() + */ + v9fs_i_size_write(inode, iocb->ki_pos); } return retval; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 85ff859d3af5..72b779bc0942 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -538,7 +538,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, if (retval) goto error; - v9fs_stat2inode(st, inode, sb); + v9fs_stat2inode(st, inode, sb, 0); v9fs_cache_inode_get_cookie(inode); unlock_new_inode(inode); return inode; @@ -1092,7 +1092,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb); + v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); generic_fillattr(d_inode(dentry), stat); p9stat_free(st); @@ -1170,12 +1170,13 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) * @stat: Plan 9 metadata (mistat) structure * @inode: inode to populate * @sb: superblock of filesystem + * @flags: control flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE) * */ void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, - struct super_block *sb) + struct super_block *sb, unsigned int flags) { umode_t mode; char ext[32]; @@ -1216,10 +1217,11 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, mode = p9mode2perm(v9ses, stat); mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - i_size_write(inode, stat->length); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) + v9fs_i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ - inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; + inode->i_blocks = (stat->length + 512 - 1) >> 9; v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR; } @@ -1416,9 +1418,9 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) { int umode; dev_t rdev; - loff_t i_size; struct p9_wstat *st; struct v9fs_session_info *v9ses; + unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_stat(fid); @@ -1431,16 +1433,13 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) goto out; - spin_lock(&inode->i_lock); /* * We don't want to refresh inode->i_size, * because we may have cached data */ - i_size = inode->i_size; - v9fs_stat2inode(st, inode, inode->i_sb); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - inode->i_size = i_size; - spin_unlock(&inode->i_lock); + flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + V9FS_STAT2INODE_KEEP_ISIZE : 0; + v9fs_stat2inode(st, inode, inode->i_sb, flags); out: p9stat_free(st); kfree(st); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 4823e1c46999..a950a927a626 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -143,7 +143,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, if (retval) goto error; - v9fs_stat2inode_dotl(st, inode); + v9fs_stat2inode_dotl(st, inode, 0); v9fs_cache_inode_get_cookie(inode); retval = v9fs_get_acl(inode, fid); if (retval) @@ -496,7 +496,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode_dotl(st, d_inode(dentry)); + v9fs_stat2inode_dotl(st, d_inode(dentry), 0); generic_fillattr(d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -607,11 +607,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) * v9fs_stat2inode_dotl - populate an inode structure with stat info * @stat: stat structure * @inode: inode to populate + * @flags: ctrl flags (e.g. V9FS_STAT2INODE_KEEP_ISIZE) * */ void -v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) +v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, + unsigned int flags) { umode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); @@ -631,7 +633,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; - i_size_write(inode, stat->st_size); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE)) + v9fs_i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; } else { if (stat->st_result_mask & P9_STATS_ATIME) { @@ -661,8 +664,9 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) } if (stat->st_result_mask & P9_STATS_RDEV) inode->i_rdev = new_decode_dev(stat->st_rdev); - if (stat->st_result_mask & P9_STATS_SIZE) - i_size_write(inode, stat->st_size); + if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && + stat->st_result_mask & P9_STATS_SIZE) + v9fs_i_size_write(inode, stat->st_size); if (stat->st_result_mask & P9_STATS_BLOCKS) inode->i_blocks = stat->st_blocks; } @@ -928,9 +932,9 @@ v9fs_vfs_get_link_dotl(struct dentry *dentry, int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) { - loff_t i_size; struct p9_stat_dotl *st; struct v9fs_session_info *v9ses; + unsigned int flags; v9ses = v9fs_inode2v9ses(inode); st = p9_client_getattr_dotl(fid, P9_STATS_ALL); @@ -942,16 +946,13 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) goto out; - spin_lock(&inode->i_lock); /* * We don't want to refresh inode->i_size, * because we may have cached data */ - i_size = inode->i_size; - v9fs_stat2inode_dotl(st, inode); - if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - inode->i_size = i_size; - spin_unlock(&inode->i_lock); + flags = (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) ? + V9FS_STAT2INODE_KEEP_ISIZE : 0; + v9fs_stat2inode_dotl(st, inode, flags); out: kfree(st); return 0; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 48ce50484e80..d13d35cf69c0 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -92,7 +92,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, return ret; if (v9ses->cache) - sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; sb->s_flags |= SB_ACTIVE | SB_DIRSYNC; if (!v9ses->cache) @@ -172,7 +172,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode_dotl(st, d_inode(root)); + v9fs_stat2inode_dotl(st, d_inode(root), 0); kfree(st); } else { struct p9_wstat *st = NULL; @@ -183,7 +183,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode(st, d_inode(root), sb); + v9fs_stat2inode(st, d_inode(root), sb, 0); p9stat_free(st); kfree(st); diff --git a/fs/Kconfig b/fs/Kconfig index 2557506051a3..3e6d3101f3ff 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -8,6 +8,13 @@ menu "File systems" config DCACHE_WORD_ACCESS bool +config VALIDATE_FS_PARSER + bool "Validate filesystem parameter description" + default y + help + Enable this to perform validation of the parameter description for a + filesystem when it is registered. + if BLOCK config FS_IOMAP diff --git a/fs/Makefile b/fs/Makefile index 7bff9abecfa4..427fec226fae 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -13,7 +13,7 @@ obj-y := open.o read_write.o file_table.o super.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o splice.o sync.o utimes.o d_path.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ - fs_types.o + fs_types.o fs_context.o fs_parser.o ifeq ($(CONFIG_BLOCK),y) obj-y += buffer.o block_dev.o direct-io.o mpage.o diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 8871b9e8645f..bb1f244b2b3a 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -36,15 +36,14 @@ struct pagevec; struct afs_call; -struct afs_mount_params { - bool rwpath; /* T if the parent should be considered R/W */ +struct afs_fs_context { bool force; /* T to force cell type */ bool autocell; /* T if set auto mount operation */ bool dyn_root; /* T if dynamic root */ + bool no_cell; /* T if the source is "none" (for dynroot) */ afs_voltype_t type; /* type of volume requested */ - int volnamesz; /* size of volume name */ + unsigned int volnamesz; /* size of volume name */ const char *volname; /* name of volume to mount */ - struct net *net_ns; /* Network namespace in effect */ struct afs_net *net; /* the AFS net namespace stuff */ struct afs_cell *cell; /* cell in which to find volume */ struct afs_volume *volume; /* volume record */ @@ -1274,7 +1273,7 @@ static inline struct afs_volume *__afs_get_volume(struct afs_volume *volume) return volume; } -extern struct afs_volume *afs_create_volume(struct afs_mount_params *); +extern struct afs_volume *afs_create_volume(struct afs_fs_context *); extern void afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); extern void afs_put_volume(struct afs_cell *, struct afs_volume *); diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 2e51c6994148..eecd8b699186 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -17,6 +17,7 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/gfp.h> +#include <linux/fs_context.h> #include "internal.h" @@ -47,6 +48,8 @@ static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out); static unsigned long afs_mntpt_expiry_timeout = 10 * 60; +static const char afs_root_volume[] = "root.cell"; + /* * no valid lookup procedure on this sort of dir */ @@ -68,108 +71,112 @@ static int afs_mntpt_open(struct inode *inode, struct file *file) } /* - * create a vfsmount to be automounted + * Set the parameters for the proposed superblock. */ -static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) +static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) { - struct afs_super_info *as; - struct vfsmount *mnt; - struct afs_vnode *vnode; - struct page *page; - char *devname, *options; - bool rwpath = false; + struct afs_fs_context *ctx = fc->fs_private; + struct afs_super_info *src_as = AFS_FS_S(mntpt->d_sb); + struct afs_vnode *vnode = AFS_FS_I(d_inode(mntpt)); + struct afs_cell *cell; + const char *p; int ret; - _enter("{%pd}", mntpt); - - BUG_ON(!d_inode(mntpt)); - - ret = -ENOMEM; - devname = (char *) get_zeroed_page(GFP_KERNEL); - if (!devname) - goto error_no_devname; - - options = (char *) get_zeroed_page(GFP_KERNEL); - if (!options) - goto error_no_options; + if (fc->net_ns != src_as->net_ns) { + put_net(fc->net_ns); + fc->net_ns = get_net(src_as->net_ns); + } - vnode = AFS_FS_I(d_inode(mntpt)); + if (src_as->volume && src_as->volume->type == AFSVL_RWVOL) { + ctx->type = AFSVL_RWVOL; + ctx->force = true; + } + if (ctx->cell) { + afs_put_cell(ctx->net, ctx->cell); + ctx->cell = NULL; + } if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { /* if the directory is a pseudo directory, use the d_name */ - static const char afs_root_cell[] = ":root.cell."; unsigned size = mntpt->d_name.len; - ret = -ENOENT; - if (size < 2 || size > AFS_MAXCELLNAME) - goto error_no_page; + if (size < 2) + return -ENOENT; + p = mntpt->d_name.name; if (mntpt->d_name.name[0] == '.') { - devname[0] = '%'; - memcpy(devname + 1, mntpt->d_name.name + 1, size - 1); - memcpy(devname + size, afs_root_cell, - sizeof(afs_root_cell)); - rwpath = true; - } else { - devname[0] = '#'; - memcpy(devname + 1, mntpt->d_name.name, size); - memcpy(devname + size + 1, afs_root_cell, - sizeof(afs_root_cell)); + size--; + p++; + ctx->type = AFSVL_RWVOL; + ctx->force = true; } + if (size > AFS_MAXCELLNAME) + return -ENAMETOOLONG; + + cell = afs_lookup_cell(ctx->net, p, size, NULL, false); + if (IS_ERR(cell)) { + pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt); + return PTR_ERR(cell); + } + ctx->cell = cell; + + ctx->volname = afs_root_volume; + ctx->volnamesz = sizeof(afs_root_volume) - 1; } else { /* read the contents of the AFS special symlink */ + struct page *page; loff_t size = i_size_read(d_inode(mntpt)); char *buf; - ret = -EINVAL; + if (src_as->cell) + ctx->cell = afs_get_cell(src_as->cell); + if (size > PAGE_SIZE - 1) - goto error_no_page; + return -EINVAL; page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); - if (IS_ERR(page)) { - ret = PTR_ERR(page); - goto error_no_page; - } + if (IS_ERR(page)) + return PTR_ERR(page); if (PageError(page)) { ret = afs_bad(AFS_FS_I(d_inode(mntpt)), afs_file_error_mntpt); - goto error; + put_page(page); + return ret; } - buf = kmap_atomic(page); - memcpy(devname, buf, size); - kunmap_atomic(buf); + buf = kmap(page); + ret = vfs_parse_fs_string(fc, "source", buf, size); + kunmap(page); put_page(page); - page = NULL; + if (ret < 0) + return ret; } - /* work out what options we want */ - as = AFS_FS_S(mntpt->d_sb); - if (as->cell) { - memcpy(options, "cell=", 5); - strcpy(options + 5, as->cell->name); - if ((as->volume && as->volume->type == AFSVL_RWVOL) || rwpath) - strcat(options, ",rwpath"); - } + return 0; +} - /* try and do the mount */ - _debug("--- attempting mount %s -o %s ---", devname, options); - mnt = vfs_submount(mntpt, &afs_fs_type, devname, options); - _debug("--- mount result %p ---", mnt); +/* + * create a vfsmount to be automounted + */ +static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) +{ + struct fs_context *fc; + struct vfsmount *mnt; + int ret; - free_page((unsigned long) devname); - free_page((unsigned long) options); - _leave(" = %p", mnt); - return mnt; + BUG_ON(!d_inode(mntpt)); -error: - put_page(page); -error_no_page: - free_page((unsigned long) options); -error_no_options: - free_page((unsigned long) devname); -error_no_devname: - _leave(" = %d", ret); - return ERR_PTR(ret); + fc = fs_context_for_submount(&afs_fs_type, mntpt); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + ret = afs_mntpt_set_params(fc, mntpt); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + + put_fs_context(fc); + return mnt; } /* diff --git a/fs/afs/super.c b/fs/afs/super.c index dcd07fe99871..5adf012b8e27 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -1,6 +1,6 @@ /* AFS superblock handling * - * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved. + * Copyright (c) 2002, 2007, 2018 Red Hat, Inc. All rights reserved. * * This software may be freely redistributed under the terms of the * GNU General Public License. @@ -21,7 +21,7 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/pagemap.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/statfs.h> #include <linux/sched.h> #include <linux/nsproxy.h> @@ -30,21 +30,22 @@ #include "internal.h" static void afs_i_init_once(void *foo); -static struct dentry *afs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data); static void afs_kill_super(struct super_block *sb); static struct inode *afs_alloc_inode(struct super_block *sb); static void afs_destroy_inode(struct inode *inode); static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); static int afs_show_devname(struct seq_file *m, struct dentry *root); static int afs_show_options(struct seq_file *m, struct dentry *root); +static int afs_init_fs_context(struct fs_context *fc); +static const struct fs_parameter_description afs_fs_parameters; struct file_system_type afs_fs_type = { - .owner = THIS_MODULE, - .name = "afs", - .mount = afs_mount, - .kill_sb = afs_kill_super, - .fs_flags = 0, + .owner = THIS_MODULE, + .name = "afs", + .init_fs_context = afs_init_fs_context, + .parameters = &afs_fs_parameters, + .kill_sb = afs_kill_super, + .fs_flags = 0, }; MODULE_ALIAS_FS("afs"); @@ -63,22 +64,22 @@ static const struct super_operations afs_super_ops = { static struct kmem_cache *afs_inode_cachep; static atomic_t afs_count_active_inodes; -enum { - afs_no_opt, - afs_opt_cell, - afs_opt_dyn, - afs_opt_rwpath, - afs_opt_vol, - afs_opt_autocell, +enum afs_param { + Opt_autocell, + Opt_dyn, + Opt_source, }; -static const match_table_t afs_options_list = { - { afs_opt_cell, "cell=%s" }, - { afs_opt_dyn, "dyn" }, - { afs_opt_rwpath, "rwpath" }, - { afs_opt_vol, "vol=%s" }, - { afs_opt_autocell, "autocell" }, - { afs_no_opt, NULL }, +static const struct fs_parameter_spec afs_param_specs[] = { + fsparam_flag ("autocell", Opt_autocell), + fsparam_flag ("dyn", Opt_dyn), + fsparam_string("source", Opt_source), + {} +}; + +static const struct fs_parameter_description afs_fs_parameters = { + .name = "kAFS", + .specs = afs_param_specs, }; /* @@ -190,84 +191,23 @@ static int afs_show_options(struct seq_file *m, struct dentry *root) } /* - * parse the mount options - * - this function has been shamelessly adapted from the ext3 fs which - * shamelessly adapted it from the msdos fs - */ -static int afs_parse_options(struct afs_mount_params *params, - char *options, const char **devname) -{ - struct afs_cell *cell; - substring_t args[MAX_OPT_ARGS]; - char *p; - int token; - - _enter("%s", options); - - options[PAGE_SIZE - 1] = 0; - - while ((p = strsep(&options, ","))) { - if (!*p) - continue; - - token = match_token(p, afs_options_list, args); - switch (token) { - case afs_opt_cell: - rcu_read_lock(); - cell = afs_lookup_cell_rcu(params->net, - args[0].from, - args[0].to - args[0].from); - rcu_read_unlock(); - if (IS_ERR(cell)) - return PTR_ERR(cell); - afs_put_cell(params->net, params->cell); - params->cell = cell; - break; - - case afs_opt_rwpath: - params->rwpath = true; - break; - - case afs_opt_vol: - *devname = args[0].from; - break; - - case afs_opt_autocell: - params->autocell = true; - break; - - case afs_opt_dyn: - params->dyn_root = true; - break; - - default: - printk(KERN_ERR "kAFS:" - " Unknown or invalid mount option: '%s'\n", p); - return -EINVAL; - } - } - - _leave(" = 0"); - return 0; -} - -/* - * parse a device name to get cell name, volume name, volume type and R/W - * selector - * - this can be one of the following: + * Parse the source name to get cell name, volume name, volume type and R/W + * selector. + * + * This can be one of the following: * "%[cell:]volume[.]" R/W volume - * "#[cell:]volume[.]" R/O or R/W volume (rwpath=0), - * or R/W (rwpath=1) volume + * "#[cell:]volume[.]" R/O or R/W volume (R/O parent), + * or R/W (R/W parent) volume * "%[cell:]volume.readonly" R/O volume * "#[cell:]volume.readonly" R/O volume * "%[cell:]volume.backup" Backup volume * "#[cell:]volume.backup" Backup volume */ -static int afs_parse_device_name(struct afs_mount_params *params, - const char *name) +static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) { + struct afs_fs_context *ctx = fc->fs_private; struct afs_cell *cell; - const char *cellname, *suffix; + const char *cellname, *suffix, *name = param->string; int cellnamesz; _enter(",%s", name); @@ -278,69 +218,149 @@ static int afs_parse_device_name(struct afs_mount_params *params, } if ((name[0] != '%' && name[0] != '#') || !name[1]) { + /* To use dynroot, we don't want to have to provide a source */ + if (strcmp(name, "none") == 0) { + ctx->no_cell = true; + return 0; + } printk(KERN_ERR "kAFS: unparsable volume name\n"); return -EINVAL; } /* determine the type of volume we're looking for */ - params->type = AFSVL_ROVOL; - params->force = false; - if (params->rwpath || name[0] == '%') { - params->type = AFSVL_RWVOL; - params->force = true; + if (name[0] == '%') { + ctx->type = AFSVL_RWVOL; + ctx->force = true; } name++; /* split the cell name out if there is one */ - params->volname = strchr(name, ':'); - if (params->volname) { + ctx->volname = strchr(name, ':'); + if (ctx->volname) { cellname = name; - cellnamesz = params->volname - name; - params->volname++; + cellnamesz = ctx->volname - name; + ctx->volname++; } else { - params->volname = name; + ctx->volname = name; cellname = NULL; cellnamesz = 0; } /* the volume type is further affected by a possible suffix */ - suffix = strrchr(params->volname, '.'); + suffix = strrchr(ctx->volname, '.'); if (suffix) { if (strcmp(suffix, ".readonly") == 0) { - params->type = AFSVL_ROVOL; - params->force = true; + ctx->type = AFSVL_ROVOL; + ctx->force = true; } else if (strcmp(suffix, ".backup") == 0) { - params->type = AFSVL_BACKVOL; - params->force = true; + ctx->type = AFSVL_BACKVOL; + ctx->force = true; } else if (suffix[1] == 0) { } else { suffix = NULL; } } - params->volnamesz = suffix ? - suffix - params->volname : strlen(params->volname); + ctx->volnamesz = suffix ? + suffix - ctx->volname : strlen(ctx->volname); _debug("cell %*.*s [%p]", - cellnamesz, cellnamesz, cellname ?: "", params->cell); + cellnamesz, cellnamesz, cellname ?: "", ctx->cell); /* lookup the cell record */ - if (cellname || !params->cell) { - cell = afs_lookup_cell(params->net, cellname, cellnamesz, + if (cellname) { + cell = afs_lookup_cell(ctx->net, cellname, cellnamesz, NULL, false); if (IS_ERR(cell)) { - printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n", + pr_err("kAFS: unable to lookup cell '%*.*s'\n", cellnamesz, cellnamesz, cellname ?: ""); return PTR_ERR(cell); } - afs_put_cell(params->net, params->cell); - params->cell = cell; + afs_put_cell(ctx->net, ctx->cell); + ctx->cell = cell; } _debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s", - params->cell->name, params->cell, - params->volnamesz, params->volnamesz, params->volname, - suffix ?: "-", params->type, params->force ? " FORCE" : ""); + ctx->cell->name, ctx->cell, + ctx->volnamesz, ctx->volnamesz, ctx->volname, + suffix ?: "-", ctx->type, ctx->force ? " FORCE" : ""); + + fc->source = param->string; + param->string = NULL; + return 0; +} + +/* + * Parse a single mount parameter. + */ +static int afs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct fs_parse_result result; + struct afs_fs_context *ctx = fc->fs_private; + int opt; + + opt = fs_parse(fc, &afs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_source: + return afs_parse_source(fc, param); + + case Opt_autocell: + ctx->autocell = true; + break; + + case Opt_dyn: + ctx->dyn_root = true; + break; + + default: + return -EINVAL; + } + + _leave(" = 0"); + return 0; +} + +/* + * Validate the options, get the cell key and look up the volume. + */ +static int afs_validate_fc(struct fs_context *fc) +{ + struct afs_fs_context *ctx = fc->fs_private; + struct afs_volume *volume; + struct key *key; + + if (!ctx->dyn_root) { + if (ctx->no_cell) { + pr_warn("kAFS: Can only specify source 'none' with -o dyn\n"); + return -EINVAL; + } + + if (!ctx->cell) { + pr_warn("kAFS: No cell specified\n"); + return -EDESTADDRREQ; + } + + /* We try to do the mount securely. */ + key = afs_request_key(ctx->cell); + if (IS_ERR(key)) + return PTR_ERR(key); + + ctx->key = key; + + if (ctx->volume) { + afs_put_volume(ctx->cell, ctx->volume); + ctx->volume = NULL; + } + + volume = afs_create_volume(ctx); + if (IS_ERR(volume)) + return PTR_ERR(volume); + + ctx->volume = volume; + } return 0; } @@ -348,39 +368,34 @@ static int afs_parse_device_name(struct afs_mount_params *params, /* * check a superblock to see if it's the one we're looking for */ -static int afs_test_super(struct super_block *sb, void *data) +static int afs_test_super(struct super_block *sb, struct fs_context *fc) { - struct afs_super_info *as1 = data; + struct afs_fs_context *ctx = fc->fs_private; struct afs_super_info *as = AFS_FS_S(sb); - return (as->net_ns == as1->net_ns && + return (as->net_ns == fc->net_ns && as->volume && - as->volume->vid == as1->volume->vid && + as->volume->vid == ctx->volume->vid && !as->dyn_root); } -static int afs_dynroot_test_super(struct super_block *sb, void *data) +static int afs_dynroot_test_super(struct super_block *sb, struct fs_context *fc) { - struct afs_super_info *as1 = data; struct afs_super_info *as = AFS_FS_S(sb); - return (as->net_ns == as1->net_ns && + return (as->net_ns == fc->net_ns && as->dyn_root); } -static int afs_set_super(struct super_block *sb, void *data) +static int afs_set_super(struct super_block *sb, struct fs_context *fc) { - struct afs_super_info *as = data; - - sb->s_fs_info = as; return set_anon_super(sb, NULL); } /* * fill in the superblock */ -static int afs_fill_super(struct super_block *sb, - struct afs_mount_params *params) +static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) { struct afs_super_info *as = AFS_FS_S(sb); struct afs_fid fid; @@ -399,7 +414,7 @@ static int afs_fill_super(struct super_block *sb, ret = super_setup_bdi(sb); if (ret) return ret; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; /* allocate the root inode and dentry */ if (as->dyn_root) { @@ -412,13 +427,13 @@ static int afs_fill_super(struct super_block *sb, fid.vnode = 1; fid.vnode_hi = 0; fid.unique = 1; - inode = afs_iget(sb, params->key, &fid, NULL, NULL, NULL); + inode = afs_iget(sb, ctx->key, &fid, NULL, NULL, NULL); } if (IS_ERR(inode)) return PTR_ERR(inode); - if (params->autocell || params->dyn_root) + if (ctx->autocell || as->dyn_root) set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); ret = -ENOMEM; @@ -443,17 +458,20 @@ error: return ret; } -static struct afs_super_info *afs_alloc_sbi(struct afs_mount_params *params) +static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) { + struct afs_fs_context *ctx = fc->fs_private; struct afs_super_info *as; as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL); if (as) { - as->net_ns = get_net(params->net_ns); - if (params->dyn_root) + as->net_ns = get_net(fc->net_ns); + if (ctx->dyn_root) { as->dyn_root = true; - else - as->cell = afs_get_cell(params->cell); + } else { + as->cell = afs_get_cell(ctx->cell); + as->volume = __afs_get_volume(ctx->volume); + } } return as; } @@ -475,7 +493,7 @@ static void afs_kill_super(struct super_block *sb) if (as->dyn_root) afs_dynroot_depopulate(sb); - + /* Clear the callback interests (which will do ilookup5) before * deactivating the superblock. */ @@ -488,111 +506,103 @@ static void afs_kill_super(struct super_block *sb) } /* - * get an AFS superblock + * Get an AFS superblock and root directory. */ -static struct dentry *afs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *options) +static int afs_get_tree(struct fs_context *fc) { - struct afs_mount_params params; + struct afs_fs_context *ctx = fc->fs_private; struct super_block *sb; - struct afs_volume *candidate; - struct key *key; struct afs_super_info *as; int ret; - _enter(",,%s,%p", dev_name, options); - - memset(¶ms, 0, sizeof(params)); - - ret = -EINVAL; - if (current->nsproxy->net_ns != &init_net) + ret = afs_validate_fc(fc); + if (ret) goto error; - params.net_ns = current->nsproxy->net_ns; - params.net = afs_net(params.net_ns); - - /* parse the options and device name */ - if (options) { - ret = afs_parse_options(¶ms, options, &dev_name); - if (ret < 0) - goto error; - } - - if (!params.dyn_root) { - ret = afs_parse_device_name(¶ms, dev_name); - if (ret < 0) - goto error; - /* try and do the mount securely */ - key = afs_request_key(params.cell); - if (IS_ERR(key)) { - _leave(" = %ld [key]", PTR_ERR(key)); - ret = PTR_ERR(key); - goto error; - } - params.key = key; - } + _enter(""); /* allocate a superblock info record */ ret = -ENOMEM; - as = afs_alloc_sbi(¶ms); + as = afs_alloc_sbi(fc); if (!as) - goto error_key; - - if (!params.dyn_root) { - /* Assume we're going to need a volume record; at the very - * least we can use it to update the volume record if we have - * one already. This checks that the volume exists within the - * cell. - */ - candidate = afs_create_volume(¶ms); - if (IS_ERR(candidate)) { - ret = PTR_ERR(candidate); - goto error_as; - } - - as->volume = candidate; - } + goto error; + fc->s_fs_info = as; /* allocate a deviceless superblock */ - sb = sget(fs_type, - as->dyn_root ? afs_dynroot_test_super : afs_test_super, - afs_set_super, flags, as); + sb = sget_fc(fc, + as->dyn_root ? afs_dynroot_test_super : afs_test_super, + afs_set_super); if (IS_ERR(sb)) { ret = PTR_ERR(sb); - goto error_as; + goto error; } if (!sb->s_root) { /* initial superblock/root creation */ _debug("create"); - ret = afs_fill_super(sb, ¶ms); + ret = afs_fill_super(sb, ctx); if (ret < 0) goto error_sb; - as = NULL; sb->s_flags |= SB_ACTIVE; } else { _debug("reuse"); ASSERTCMP(sb->s_flags, &, SB_ACTIVE); - afs_destroy_sbi(as); - as = NULL; } - afs_put_cell(params.net, params.cell); - key_put(params.key); + fc->root = dget(sb->s_root); _leave(" = 0 [%p]", sb); - return dget(sb->s_root); + return 0; error_sb: deactivate_locked_super(sb); - goto error_key; -error_as: - afs_destroy_sbi(as); -error_key: - key_put(params.key); error: - afs_put_cell(params.net, params.cell); _leave(" = %d", ret); - return ERR_PTR(ret); + return ret; +} + +static void afs_free_fc(struct fs_context *fc) +{ + struct afs_fs_context *ctx = fc->fs_private; + + afs_destroy_sbi(fc->s_fs_info); + afs_put_volume(ctx->cell, ctx->volume); + afs_put_cell(ctx->net, ctx->cell); + key_put(ctx->key); + kfree(ctx); +} + +static const struct fs_context_operations afs_context_ops = { + .free = afs_free_fc, + .parse_param = afs_parse_param, + .get_tree = afs_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int afs_init_fs_context(struct fs_context *fc) +{ + struct afs_fs_context *ctx; + struct afs_cell *cell; + + ctx = kzalloc(sizeof(struct afs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->type = AFSVL_ROVOL; + ctx->net = afs_net(fc->net_ns); + + /* Default to the workstation cell. */ + rcu_read_lock(); + cell = afs_lookup_cell_rcu(ctx->net, NULL, 0); + rcu_read_unlock(); + if (IS_ERR(cell)) + cell = NULL; + ctx->cell = cell; + + fc->fs_private = ctx; + fc->ops = &afs_context_ops; + return 0; } /* diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 00975ed3640f..f6eba2def0a1 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -21,7 +21,7 @@ static const char *const afs_voltypes[] = { "R/W", "R/O", "BAK" }; /* * Allocate a volume record and load it up from a vldb record. */ -static struct afs_volume *afs_alloc_volume(struct afs_mount_params *params, +static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, struct afs_vldb_entry *vldb, unsigned long type_mask) { @@ -113,7 +113,7 @@ static struct afs_vldb_entry *afs_vl_lookup_vldb(struct afs_cell *cell, * - Rule 3: If parent volume is R/W, then only mount R/W volume unless * explicitly told otherwise */ -struct afs_volume *afs_create_volume(struct afs_mount_params *params) +struct afs_volume *afs_create_volume(struct afs_fs_context *params) { struct afs_vldb_entry *vldb; struct afs_volume *volume; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 129d26226e70..b3642367a595 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1210,6 +1210,8 @@ enum { * Set for the subvolume tree owning the reloc tree. */ BTRFS_ROOT_DEAD_RELOC_TREE, + /* Mark dead root stored on device whose cleanup needs to be resumed */ + BTRFS_ROOT_DEAD_TREE, }; /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f0cdb53f3e2d..6fe9197f6ee4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2958,7 +2958,7 @@ int open_ctree(struct super_block *sb, sb->s_bdi->congested_fn = btrfs_congested_fn; sb->s_bdi->congested_data = fs_info; sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 994f0cc41799..1d49694e6ae3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8764,6 +8764,8 @@ struct walk_control { u64 refs[BTRFS_MAX_LEVEL]; u64 flags[BTRFS_MAX_LEVEL]; struct btrfs_key update_progress; + struct btrfs_key drop_progress; + int drop_level; int stage; int level; int shared_level; @@ -8771,6 +8773,7 @@ struct walk_control { int keep_locks; int reada_slot; int reada_count; + int restarted; }; #define DROP_REFERENCE 1 @@ -8934,6 +8937,33 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, } /* + * This is used to verify a ref exists for this root to deal with a bug where we + * would have a drop_progress key that hadn't been updated properly. + */ +static int check_ref_exists(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 parent, + int level) +{ + struct btrfs_path *path; + struct btrfs_extent_inline_ref *iref; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = lookup_extent_backref(trans, path, &iref, bytenr, + root->fs_info->nodesize, parent, + root->root_key.objectid, level, 0); + btrfs_free_path(path); + if (ret == -ENOENT) + return 0; + if (ret < 0) + return ret; + return 1; +} + +/* * helper to process tree block pointer. * * when wc->stage == DROP_REFERENCE, this function checks @@ -9088,6 +9118,23 @@ skip: } /* + * If we had a drop_progress we need to verify the refs are set + * as expected. If we find our ref then we know that from here + * on out everything should be correct, and we can clear the + * ->restarted flag. + */ + if (wc->restarted) { + ret = check_ref_exists(trans, root, bytenr, parent, + level - 1); + if (ret < 0) + goto out_unlock; + if (ret == 0) + goto no_delete; + ret = 0; + wc->restarted = 0; + } + + /* * Reloc tree doesn't contribute to qgroup numbers, and we have * already accounted them at merge time (replace_path), * thus we could skip expensive subtree trace here. @@ -9102,13 +9149,23 @@ skip: ret); } } + + /* + * We need to update the next key in our walk control so we can + * update the drop_progress key accordingly. We don't care if + * find_next_key doesn't find a key because that means we're at + * the end and are going to clean up now. + */ + wc->drop_level = level; + find_next_key(path, level, &wc->drop_progress); + ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize, parent, root->root_key.objectid, level - 1, 0); if (ret) goto out_unlock; } - +no_delete: *lookup_info = 1; ret = 1; @@ -9425,6 +9482,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } } + wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state); wc->level = level; wc->shared_level = -1; wc->stage = DROP_REFERENCE; @@ -9452,12 +9510,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } if (wc->stage == DROP_REFERENCE) { - level = wc->level; - btrfs_node_key(path->nodes[level], - &root_item->drop_progress, - path->slots[level]); - root_item->drop_level = level; - } + wc->drop_level = wc->level; + btrfs_node_key_to_cpu(path->nodes[wc->drop_level], + &wc->drop_progress, + path->slots[wc->drop_level]); + } + btrfs_cpu_key_to_disk(&root_item->drop_progress, + &wc->drop_progress); + root_item->drop_level = wc->drop_level; BUG_ON(wc->level == 0); if (btrfs_should_end_transaction(trans) || diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ab705183d749..ca8b8e785cf3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2995,11 +2995,11 @@ static int __do_readpage(struct extent_io_tree *tree, */ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && prev_em_start && *prev_em_start != (u64)-1 && - *prev_em_start != em->orig_start) + *prev_em_start != em->start) force_bio_submit = true; if (prev_em_start) - *prev_em_start = em->orig_start; + *prev_em_start = em->start; free_extent_map(em); em = NULL; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 494f0f10d70e..ec2d8919e7fb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3207,21 +3207,6 @@ out: return ret; } -static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) -{ - inode_unlock(inode1); - inode_unlock(inode2); -} - -static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) -{ - if (inode1 < inode2) - swap(inode1, inode2); - - inode_lock_nested(inode1, I_MUTEX_PARENT); - inode_lock_nested(inode2, I_MUTEX_CHILD); -} - static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, struct inode *inode2, u64 loff2, u64 len) { @@ -3956,7 +3941,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (same_inode) inode_lock(inode_in); else - btrfs_double_inode_lock(inode_in, inode_out); + lock_two_nondirectories(inode_in, inode_out); /* don't make the dst file partly checksummed */ if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != @@ -4013,7 +3998,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, if (same_inode) inode_unlock(inode_in); else - btrfs_double_inode_unlock(inode_in, inode_out); + unlock_two_nondirectories(inode_in, inode_out); return ret; } @@ -4043,7 +4028,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, if (same_inode) inode_unlock(src_inode); else - btrfs_double_inode_unlock(src_inode, dst_inode); + unlock_two_nondirectories(src_inode, dst_inode); return ret < 0 ? ret : len; } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c1cd5558a646..eb680b715dd6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -894,6 +894,12 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) if (fs_info->quota_root) goto out; + fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); + if (!fs_info->qgroup_ulist) { + ret = -ENOMEM; + goto out; + } + /* * 1 for quota root item * 1 for BTRFS_QGROUP_STATUS item @@ -909,13 +915,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) goto out; } - fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); - if (!fs_info->qgroup_ulist) { - ret = -ENOMEM; - btrfs_abort_transaction(trans, ret); - goto out; - } - /* * initially create the quota tree */ diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 0d2b957ca3a3..893d12fbfda0 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -263,8 +263,10 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) if (root) { WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)); - if (btrfs_root_refs(&root->root_item) == 0) + if (btrfs_root_refs(&root->root_item) == 0) { + set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); btrfs_add_dead_root(root); + } continue; } @@ -310,8 +312,10 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) break; } - if (btrfs_root_refs(&root->root_item) == 0) + if (btrfs_root_refs(&root->root_item) == 0) { + set_bit(BTRFS_ROOT_DEAD_TREE, &root->state); btrfs_add_dead_root(root); + } } btrfs_free_path(path); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 3e418a3aeb11..6b9e29d050f3 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -195,8 +195,7 @@ static void zstd_cleanup_workspace_manager(void) struct workspace *workspace; int i; - del_timer(&wsm.timer); - + spin_lock(&wsm.lock); for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { while (!list_empty(&wsm.idle_ws[i])) { workspace = container_of(wsm.idle_ws[i].next, @@ -206,6 +205,9 @@ static void zstd_cleanup_workspace_manager(void) wsm.ops->free_workspace(&workspace->list); } } + spin_unlock(&wsm.lock); + + del_timer_sync(&wsm.timer); } /* diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index bba28a5034ba..36a8dc699448 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -148,11 +148,17 @@ void ceph_caps_finalize(struct ceph_mds_client *mdsc) spin_unlock(&mdsc->caps_list_lock); } -void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) +void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, + struct ceph_mount_options *fsopt) { spin_lock(&mdsc->caps_list_lock); - mdsc->caps_min_count += delta; - BUG_ON(mdsc->caps_min_count < 0); + mdsc->caps_min_count = fsopt->max_readdir; + if (mdsc->caps_min_count < 1024) + mdsc->caps_min_count = 1024; + mdsc->caps_use_max = fsopt->caps_max; + if (mdsc->caps_use_max > 0 && + mdsc->caps_use_max < mdsc->caps_min_count) + mdsc->caps_use_max = mdsc->caps_min_count; spin_unlock(&mdsc->caps_list_lock); } @@ -272,6 +278,7 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, if (!err) { BUG_ON(have + alloc != need); ctx->count = need; + ctx->used = 0; } spin_lock(&mdsc->caps_list_lock); @@ -295,13 +302,24 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, } void ceph_unreserve_caps(struct ceph_mds_client *mdsc, - struct ceph_cap_reservation *ctx) + struct ceph_cap_reservation *ctx) { + bool reclaim = false; + if (!ctx->count) + return; + dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); spin_lock(&mdsc->caps_list_lock); __ceph_unreserve_caps(mdsc, ctx->count); ctx->count = 0; + + if (mdsc->caps_use_max > 0 && + mdsc->caps_use_count > mdsc->caps_use_max) + reclaim = true; spin_unlock(&mdsc->caps_list_lock); + + if (reclaim) + ceph_reclaim_caps_nr(mdsc, ctx->used); } struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, @@ -346,6 +364,7 @@ struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, BUG_ON(list_empty(&mdsc->caps_list)); ctx->count--; + ctx->used++; mdsc->caps_reserve_count--; mdsc->caps_use_count++; @@ -500,12 +519,12 @@ static void __insert_cap_node(struct ceph_inode_info *ci, static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { - struct ceph_mount_options *ma = mdsc->fsc->mount_options; + struct ceph_mount_options *opt = mdsc->fsc->mount_options; ci->i_hold_caps_min = round_jiffies(jiffies + - ma->caps_wanted_delay_min * HZ); + opt->caps_wanted_delay_min * HZ); ci->i_hold_caps_max = round_jiffies(jiffies + - ma->caps_wanted_delay_max * HZ); + opt->caps_wanted_delay_max * HZ); dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode, ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies); } @@ -657,6 +676,10 @@ void ceph_add_cap(struct inode *inode, session->s_nr_caps++; spin_unlock(&session->s_cap_lock); } else { + spin_lock(&session->s_cap_lock); + list_move_tail(&cap->session_caps, &session->s_caps); + spin_unlock(&session->s_cap_lock); + if (cap->cap_gen < session->s_cap_gen) cap->issued = cap->implemented = CEPH_CAP_PIN; @@ -1081,9 +1104,7 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { cap->queue_release = 1; if (removed) { - list_add_tail(&cap->session_caps, - &session->s_cap_releases); - session->s_num_cap_releases++; + __ceph_queue_cap_release(session, cap); removed = 0; } } else { @@ -1245,7 +1266,7 @@ static int send_cap_msg(struct cap_msg_args *arg) * Queue cap releases when an inode is dropped from our cache. Since * inode is about to be destroyed, there is no need for i_ceph_lock. */ -void ceph_queue_caps_release(struct inode *inode) +void __ceph_remove_caps(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); struct rb_node *p; @@ -2393,6 +2414,12 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, if ((cap->issued & ci->i_flushing_caps) != ci->i_flushing_caps) { ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + /* encode_caps_cb() also will reset these sequence + * numbers. make sure sequence numbers in cap flush + * message match later reconnect message */ + cap->seq = 0; + cap->issue_seq = 0; + cap->mseq = 0; __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } else { @@ -3880,12 +3907,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, cap->seq = seq; cap->issue_seq = seq; spin_lock(&session->s_cap_lock); - list_add_tail(&cap->session_caps, - &session->s_cap_releases); - session->s_num_cap_releases++; + __ceph_queue_cap_release(session, cap); spin_unlock(&session->s_cap_lock); } - goto flush_cap_releases; + goto done; } /* these will work even if we don't have a cap yet */ @@ -3955,7 +3980,12 @@ void ceph_handle_caps(struct ceph_mds_session *session, ceph_cap_op_name(op)); } - goto done; +done: + mutex_unlock(&session->s_mutex); +done_unlocked: + iput(inode); + ceph_put_string(extra_info.pool_ns); + return; flush_cap_releases: /* @@ -3963,14 +3993,8 @@ flush_cap_releases: * along for the mds (who clearly thinks we still have this * cap). */ - ceph_send_cap_releases(mdsc, session); - -done: - mutex_unlock(&session->s_mutex); -done_unlocked: - iput(inode); - ceph_put_string(extra_info.pool_ns); - return; + ceph_flush_cap_releases(mdsc, session); + goto done; bad: pr_err("ceph_handle_caps: corrupt message\n"); diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index abdf98deeec4..98365e74cb4a 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -139,23 +139,6 @@ static int caps_show(struct seq_file *s, void *p) return 0; } -static int dentry_lru_show(struct seq_file *s, void *ptr) -{ - struct ceph_fs_client *fsc = s->private; - struct ceph_mds_client *mdsc = fsc->mdsc; - struct ceph_dentry_info *di; - - spin_lock(&mdsc->dentry_lru_lock); - list_for_each_entry(di, &mdsc->dentry_lru, lru) { - struct dentry *dentry = di->dentry; - seq_printf(s, "%p %p\t%pd\n", - di, dentry, dentry); - } - spin_unlock(&mdsc->dentry_lru_lock); - - return 0; -} - static int mds_sessions_show(struct seq_file *s, void *ptr) { struct ceph_fs_client *fsc = s->private; @@ -195,7 +178,6 @@ static int mds_sessions_show(struct seq_file *s, void *ptr) CEPH_DEFINE_SHOW_FUNC(mdsmap_show) CEPH_DEFINE_SHOW_FUNC(mdsc_show) CEPH_DEFINE_SHOW_FUNC(caps_show) -CEPH_DEFINE_SHOW_FUNC(dentry_lru_show) CEPH_DEFINE_SHOW_FUNC(mds_sessions_show) @@ -231,7 +213,6 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) debugfs_remove(fsc->debugfs_mds_sessions); debugfs_remove(fsc->debugfs_caps); debugfs_remove(fsc->debugfs_mdsc); - debugfs_remove(fsc->debugfs_dentry_lru); } int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) @@ -291,14 +272,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) if (!fsc->debugfs_caps) goto out; - fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", - 0400, - fsc->client->debugfs_dir, - fsc, - &dentry_lru_show_fops); - if (!fsc->debugfs_dentry_lru) - goto out; - return 0; out: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 82928cea0209..a8f429882249 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -29,6 +29,9 @@ const struct dentry_operations ceph_dentry_ops; +static bool __dentry_lease_is_valid(struct ceph_dentry_info *di); +static int __dir_lease_try_check(const struct dentry *dentry); + /* * Initialize ceph dentry state. */ @@ -44,7 +47,7 @@ static int ceph_d_init(struct dentry *dentry) di->lease_session = NULL; di->time = jiffies; dentry->d_fsdata = di; - ceph_dentry_lru_add(dentry); + INIT_LIST_HEAD(&di->lease_list); return 0; } @@ -241,6 +244,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, goto out; } if (fpos_cmp(ctx->pos, di->offset) <= 0) { + __ceph_dentry_dir_lease_touch(di); emit_dentry = true; } spin_unlock(&dentry->d_lock); @@ -1125,13 +1129,277 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, } /* + * Move dentry to tail of mdsc->dentry_leases list when lease is updated. + * Leases at front of the list will expire first. (Assume all leases have + * similar duration) + * + * Called under dentry->d_lock. + */ +void __ceph_dentry_lease_touch(struct ceph_dentry_info *di) +{ + struct dentry *dn = di->dentry; + struct ceph_mds_client *mdsc; + + dout("dentry_lease_touch %p %p '%pd'\n", di, dn, dn); + + di->flags |= CEPH_DENTRY_LEASE_LIST; + if (di->flags & CEPH_DENTRY_SHRINK_LIST) { + di->flags |= CEPH_DENTRY_REFERENCED; + return; + } + + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_list_lock); + list_move_tail(&di->lease_list, &mdsc->dentry_leases); + spin_unlock(&mdsc->dentry_list_lock); +} + +static void __dentry_dir_lease_touch(struct ceph_mds_client* mdsc, + struct ceph_dentry_info *di) +{ + di->flags &= ~(CEPH_DENTRY_LEASE_LIST | CEPH_DENTRY_REFERENCED); + di->lease_gen = 0; + di->time = jiffies; + list_move_tail(&di->lease_list, &mdsc->dentry_dir_leases); +} + +/* + * When dir lease is used, add dentry to tail of mdsc->dentry_dir_leases + * list if it's not in the list, otherwise set 'referenced' flag. + * + * Called under dentry->d_lock. + */ +void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di) +{ + struct dentry *dn = di->dentry; + struct ceph_mds_client *mdsc; + + dout("dentry_dir_lease_touch %p %p '%pd' (offset %lld)\n", + di, dn, dn, di->offset); + + if (!list_empty(&di->lease_list)) { + if (di->flags & CEPH_DENTRY_LEASE_LIST) { + /* don't remove dentry from dentry lease list + * if its lease is valid */ + if (__dentry_lease_is_valid(di)) + return; + } else { + di->flags |= CEPH_DENTRY_REFERENCED; + return; + } + } + + if (di->flags & CEPH_DENTRY_SHRINK_LIST) { + di->flags |= CEPH_DENTRY_REFERENCED; + di->flags &= ~CEPH_DENTRY_LEASE_LIST; + return; + } + + mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; + spin_lock(&mdsc->dentry_list_lock); + __dentry_dir_lease_touch(mdsc, di), + spin_unlock(&mdsc->dentry_list_lock); +} + +static void __dentry_lease_unlist(struct ceph_dentry_info *di) +{ + struct ceph_mds_client *mdsc; + if (di->flags & CEPH_DENTRY_SHRINK_LIST) + return; + if (list_empty(&di->lease_list)) + return; + + mdsc = ceph_sb_to_client(di->dentry->d_sb)->mdsc; + spin_lock(&mdsc->dentry_list_lock); + list_del_init(&di->lease_list); + spin_unlock(&mdsc->dentry_list_lock); +} + +enum { + KEEP = 0, + DELETE = 1, + TOUCH = 2, + STOP = 4, +}; + +struct ceph_lease_walk_control { + bool dir_lease; + bool expire_dir_lease; + unsigned long nr_to_scan; + unsigned long dir_lease_ttl; +}; + +static unsigned long +__dentry_leases_walk(struct ceph_mds_client *mdsc, + struct ceph_lease_walk_control *lwc, + int (*check)(struct dentry*, void*)) +{ + struct ceph_dentry_info *di, *tmp; + struct dentry *dentry, *last = NULL; + struct list_head* list; + LIST_HEAD(dispose); + unsigned long freed = 0; + int ret = 0; + + list = lwc->dir_lease ? &mdsc->dentry_dir_leases : &mdsc->dentry_leases; + spin_lock(&mdsc->dentry_list_lock); + list_for_each_entry_safe(di, tmp, list, lease_list) { + if (!lwc->nr_to_scan) + break; + --lwc->nr_to_scan; + + dentry = di->dentry; + if (last == dentry) + break; + + if (!spin_trylock(&dentry->d_lock)) + continue; + + if (dentry->d_lockref.count < 0) { + list_del_init(&di->lease_list); + goto next; + } + + ret = check(dentry, lwc); + if (ret & TOUCH) { + /* move it into tail of dir lease list */ + __dentry_dir_lease_touch(mdsc, di); + if (!last) + last = dentry; + } + if (ret & DELETE) { + /* stale lease */ + di->flags &= ~CEPH_DENTRY_REFERENCED; + if (dentry->d_lockref.count > 0) { + /* update_dentry_lease() will re-add + * it to lease list, or + * ceph_d_delete() will return 1 when + * last reference is dropped */ + list_del_init(&di->lease_list); + } else { + di->flags |= CEPH_DENTRY_SHRINK_LIST; + list_move_tail(&di->lease_list, &dispose); + dget_dlock(dentry); + } + } +next: + spin_unlock(&dentry->d_lock); + if (ret & STOP) + break; + } + spin_unlock(&mdsc->dentry_list_lock); + + while (!list_empty(&dispose)) { + di = list_first_entry(&dispose, struct ceph_dentry_info, + lease_list); + dentry = di->dentry; + spin_lock(&dentry->d_lock); + + list_del_init(&di->lease_list); + di->flags &= ~CEPH_DENTRY_SHRINK_LIST; + if (di->flags & CEPH_DENTRY_REFERENCED) { + spin_lock(&mdsc->dentry_list_lock); + if (di->flags & CEPH_DENTRY_LEASE_LIST) { + list_add_tail(&di->lease_list, + &mdsc->dentry_leases); + } else { + __dentry_dir_lease_touch(mdsc, di); + } + spin_unlock(&mdsc->dentry_list_lock); + } else { + freed++; + } + + spin_unlock(&dentry->d_lock); + /* ceph_d_delete() does the trick */ + dput(dentry); + } + return freed; +} + +static int __dentry_lease_check(struct dentry *dentry, void *arg) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + int ret; + + if (__dentry_lease_is_valid(di)) + return STOP; + ret = __dir_lease_try_check(dentry); + if (ret == -EBUSY) + return KEEP; + if (ret > 0) + return TOUCH; + return DELETE; +} + +static int __dir_lease_check(struct dentry *dentry, void *arg) +{ + struct ceph_lease_walk_control *lwc = arg; + struct ceph_dentry_info *di = ceph_dentry(dentry); + + int ret = __dir_lease_try_check(dentry); + if (ret == -EBUSY) + return KEEP; + if (ret > 0) { + if (time_before(jiffies, di->time + lwc->dir_lease_ttl)) + return STOP; + /* Move dentry to tail of dir lease list if we don't want + * to delete it. So dentries in the list are checked in a + * round robin manner */ + if (!lwc->expire_dir_lease) + return TOUCH; + if (dentry->d_lockref.count > 0 || + (di->flags & CEPH_DENTRY_REFERENCED)) + return TOUCH; + /* invalidate dir lease */ + di->lease_shared_gen = 0; + } + return DELETE; +} + +int ceph_trim_dentries(struct ceph_mds_client *mdsc) +{ + struct ceph_lease_walk_control lwc; + unsigned long count; + unsigned long freed; + + spin_lock(&mdsc->caps_list_lock); + if (mdsc->caps_use_max > 0 && + mdsc->caps_use_count > mdsc->caps_use_max) + count = mdsc->caps_use_count - mdsc->caps_use_max; + else + count = 0; + spin_unlock(&mdsc->caps_list_lock); + + lwc.dir_lease = false; + lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2; + freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check); + if (!lwc.nr_to_scan) /* more invalid leases */ + return -EAGAIN; + + if (lwc.nr_to_scan < CEPH_CAPS_PER_RELEASE) + lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE; + + lwc.dir_lease = true; + lwc.expire_dir_lease = freed < count; + lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ; + freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check); + if (!lwc.nr_to_scan) /* more to check */ + return -EAGAIN; + + return freed > 0 ? 1 : 0; +} + +/* * Ensure a dentry lease will no longer revalidate. */ void ceph_invalidate_dentry_lease(struct dentry *dentry) { + struct ceph_dentry_info *di = ceph_dentry(dentry); spin_lock(&dentry->d_lock); - ceph_dentry(dentry)->time = jiffies; - ceph_dentry(dentry)->lease_shared_gen = 0; + di->time = jiffies; + di->lease_shared_gen = 0; + __dentry_lease_unlist(di); spin_unlock(&dentry->d_lock); } @@ -1139,45 +1407,59 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry) * Check if dentry lease is valid. If not, delete the lease. Try to * renew if the least is more than half up. */ +static bool __dentry_lease_is_valid(struct ceph_dentry_info *di) +{ + struct ceph_mds_session *session; + + if (!di->lease_gen) + return false; + + session = di->lease_session; + if (session) { + u32 gen; + unsigned long ttl; + + spin_lock(&session->s_gen_ttl_lock); + gen = session->s_cap_gen; + ttl = session->s_cap_ttl; + spin_unlock(&session->s_gen_ttl_lock); + + if (di->lease_gen == gen && + time_before(jiffies, ttl) && + time_before(jiffies, di->time)) + return true; + } + di->lease_gen = 0; + return false; +} + static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags, struct inode *dir) { struct ceph_dentry_info *di; - struct ceph_mds_session *s; - int valid = 0; - u32 gen; - unsigned long ttl; struct ceph_mds_session *session = NULL; u32 seq = 0; + int valid = 0; spin_lock(&dentry->d_lock); di = ceph_dentry(dentry); - if (di && di->lease_session) { - s = di->lease_session; - spin_lock(&s->s_gen_ttl_lock); - gen = s->s_cap_gen; - ttl = s->s_cap_ttl; - spin_unlock(&s->s_gen_ttl_lock); + if (di && __dentry_lease_is_valid(di)) { + valid = 1; - if (di->lease_gen == gen && - time_before(jiffies, di->time) && - time_before(jiffies, ttl)) { - valid = 1; - if (di->lease_renew_after && - time_after(jiffies, di->lease_renew_after)) { - /* - * We should renew. If we're in RCU walk mode - * though, we can't do that so just return - * -ECHILD. - */ - if (flags & LOOKUP_RCU) { - valid = -ECHILD; - } else { - session = ceph_get_mds_session(s); - seq = di->lease_seq; - di->lease_renew_after = 0; - di->lease_renew_from = jiffies; - } + if (di->lease_renew_after && + time_after(jiffies, di->lease_renew_after)) { + /* + * We should renew. If we're in RCU walk mode + * though, we can't do that so just return + * -ECHILD. + */ + if (flags & LOOKUP_RCU) { + valid = -ECHILD; + } else { + session = ceph_get_mds_session(di->lease_session); + seq = di->lease_seq; + di->lease_renew_after = 0; + di->lease_renew_from = jiffies; } } } @@ -1193,6 +1475,38 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags, } /* + * Called under dentry->d_lock. + */ +static int __dir_lease_try_check(const struct dentry *dentry) +{ + struct ceph_dentry_info *di = ceph_dentry(dentry); + struct inode *dir; + struct ceph_inode_info *ci; + int valid = 0; + + if (!di->lease_shared_gen) + return 0; + if (IS_ROOT(dentry)) + return 0; + + dir = d_inode(dentry->d_parent); + ci = ceph_inode(dir); + + if (spin_trylock(&ci->i_ceph_lock)) { + if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen && + __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 0)) + valid = 1; + spin_unlock(&ci->i_ceph_lock); + } else { + valid = -EBUSY; + } + + if (!valid) + di->lease_shared_gen = 0; + return valid; +} + +/* * Check if directory-wide content lease/cap is valid. */ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) @@ -1205,6 +1519,8 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) if (atomic_read(&ci->i_shared_gen) == di->lease_shared_gen) valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1); spin_unlock(&ci->i_ceph_lock); + if (valid) + __ceph_dentry_dir_lease_touch(di); dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n", dir, (unsigned)atomic_read(&ci->i_shared_gen), dentry, (unsigned)di->lease_shared_gen, valid); @@ -1297,11 +1613,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) } dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); - if (valid) { - ceph_dentry_lru_touch(dentry); - } else { + if (!valid) ceph_dir_clear_complete(dir); - } if (!(flags & LOOKUP_RCU)) dput(parent); @@ -1309,6 +1622,31 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) } /* + * Delete unused dentry that doesn't have valid lease + * + * Called under dentry->d_lock. + */ +static int ceph_d_delete(const struct dentry *dentry) +{ + struct ceph_dentry_info *di; + + /* won't release caps */ + if (d_really_is_negative(dentry)) + return 0; + if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) + return 0; + /* vaild lease? */ + di = ceph_dentry(dentry); + if (di) { + if (__dentry_lease_is_valid(di)) + return 0; + if (__dir_lease_try_check(dentry)) + return 0; + } + return 1; +} + +/* * Release our ceph_dentry_info. */ static void ceph_d_release(struct dentry *dentry) @@ -1316,9 +1654,9 @@ static void ceph_d_release(struct dentry *dentry) struct ceph_dentry_info *di = ceph_dentry(dentry); dout("d_release %p\n", dentry); - ceph_dentry_lru_del(dentry); spin_lock(&dentry->d_lock); + __dentry_lease_unlist(di); dentry->d_fsdata = NULL; spin_unlock(&dentry->d_lock); @@ -1419,49 +1757,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, return size - left; } -/* - * We maintain a private dentry LRU. - * - * FIXME: this needs to be changed to a per-mds lru to be useful. - */ -void ceph_dentry_lru_add(struct dentry *dn) -{ - struct ceph_dentry_info *di = ceph_dentry(dn); - struct ceph_mds_client *mdsc; - - dout("dentry_lru_add %p %p '%pd'\n", di, dn, dn); - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_add_tail(&di->lru, &mdsc->dentry_lru); - mdsc->num_dentry++; - spin_unlock(&mdsc->dentry_lru_lock); -} - -void ceph_dentry_lru_touch(struct dentry *dn) -{ - struct ceph_dentry_info *di = ceph_dentry(dn); - struct ceph_mds_client *mdsc; - - dout("dentry_lru_touch %p %p '%pd' (offset %lld)\n", di, dn, dn, - di->offset); - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_move_tail(&di->lru, &mdsc->dentry_lru); - spin_unlock(&mdsc->dentry_lru_lock); -} -void ceph_dentry_lru_del(struct dentry *dn) -{ - struct ceph_dentry_info *di = ceph_dentry(dn); - struct ceph_mds_client *mdsc; - - dout("dentry_lru_del %p %p '%pd'\n", di, dn, dn); - mdsc = ceph_sb_to_client(dn->d_sb)->mdsc; - spin_lock(&mdsc->dentry_lru_lock); - list_del_init(&di->lru); - mdsc->num_dentry--; - spin_unlock(&mdsc->dentry_lru_lock); -} /* * Return name hash for a given dentry. This is dependent on @@ -1531,6 +1827,7 @@ const struct inode_operations ceph_snapdir_iops = { const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, + .d_delete = ceph_d_delete, .d_release = ceph_d_release, .d_prune = ceph_d_prune, .d_init = ceph_d_init, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 189df668b6a0..9f53c3d99304 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -590,7 +590,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, * but it will at least behave sensibly when they are * in sequence. */ - ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len); + ret = filemap_write_and_wait_range(inode->i_mapping, + off, off + len - 1); if (ret < 0) return ret; @@ -929,14 +930,15 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, (write ? "write" : "read"), file, pos, (unsigned)count, snapc, snapc->seq); - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); + ret = filemap_write_and_wait_range(inode->i_mapping, + pos, pos + count - 1); if (ret < 0) return ret; if (write) { int ret2 = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, - (pos + count) >> PAGE_SHIFT); + (pos + count - 1) >> PAGE_SHIFT); if (ret2 < 0) dout("invalidate_inode_pages2_range returned %d\n", ret2); @@ -1132,13 +1134,14 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, dout("sync_write on file %p %lld~%u snapc %p seq %lld\n", file, pos, (unsigned)count, snapc, snapc->seq); - ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); + ret = filemap_write_and_wait_range(inode->i_mapping, + pos, pos + count - 1); if (ret < 0) return ret; ret = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, - (pos + count) >> PAGE_SHIFT); + (pos + count - 1) >> PAGE_SHIFT); if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 9d1f34d46627..e3346628efe2 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -497,7 +497,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_wrbuffer_ref = 0; ci->i_wrbuffer_ref_head = 0; atomic_set(&ci->i_filelock_ref, 0); - atomic_set(&ci->i_shared_gen, 0); + atomic_set(&ci->i_shared_gen, 1); ci->i_rdcache_gen = 0; ci->i_rdcache_revoking = 0; @@ -537,7 +537,7 @@ void ceph_destroy_inode(struct inode *inode) ceph_fscache_unregister_inode_cookie(ci); - ceph_queue_caps_release(inode); + __ceph_remove_caps(inode); if (__ceph_has_any_quota(ci)) ceph_adjust_quota_realms_count(inode, false); @@ -548,17 +548,22 @@ void ceph_destroy_inode(struct inode *inode) */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; - struct ceph_snap_realm *realm = ci->i_snap_realm; - - dout(" dropping residual ref to snap realm %p\n", realm); - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - ci->i_snap_realm = NULL; - if (realm->ino == ci->i_vino.ino) - realm->inode = NULL; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_put_snap_realm(mdsc, realm); + ceph_inode_to_client(inode)->mdsc; + if (ceph_snap(inode) == CEPH_NOSNAP) { + struct ceph_snap_realm *realm = ci->i_snap_realm; + dout(" dropping residual ref to snap realm %p\n", + realm); + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm = NULL; + if (realm->ino == ci->i_vino.ino) + realm->inode = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, realm); + } else { + ceph_put_snapid_map(mdsc, ci->i_snapid_map); + ci->i_snap_realm = NULL; + } } kfree(ci->i_symlink); @@ -776,6 +781,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data, iinfo->pool_ns_len); + if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map) + ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode)); + spin_lock(&ci->i_ceph_lock); /* @@ -869,6 +877,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ci->i_rbytes = le64_to_cpu(info->rbytes); ci->i_rfiles = le64_to_cpu(info->rfiles); ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); + ci->i_dir_pin = iinfo->dir_pin; ceph_decode_timespec64(&ci->i_rctime, &info->rctime); } } @@ -899,6 +908,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, case S_IFBLK: case S_IFCHR: case S_IFSOCK: + inode->i_blkbits = PAGE_SHIFT; init_special_inode(inode, inode->i_mode, inode->i_rdev); inode->i_op = &ceph_file_iops; break; @@ -1066,9 +1076,10 @@ static void update_dentry_lease(struct dentry *dentry, goto out_unlock; di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen); - - if (duration == 0) + if (duration == 0) { + __ceph_dentry_dir_lease_touch(di); goto out_unlock; + } if (di->lease_gen == session->s_cap_gen && time_before(ttl, di->time)) @@ -1079,8 +1090,6 @@ static void update_dentry_lease(struct dentry *dentry, di->lease_session = NULL; } - ceph_dentry_lru_touch(dentry); - if (!di->lease_session) di->lease_session = ceph_get_mds_session(session); di->lease_gen = session->s_cap_gen; @@ -1088,6 +1097,8 @@ static void update_dentry_lease(struct dentry *dentry, di->lease_renew_after = half_ttl; di->lease_renew_from = 0; di->time = ttl; + + __ceph_dentry_lease_touch(di); out_unlock: spin_unlock(&dentry->d_lock); if (old_lease_session) @@ -2259,10 +2270,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat, if (!err) { generic_fillattr(inode, stat); stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); - if (ceph_snap(inode) != CEPH_NOSNAP) - stat->dev = ceph_snap(inode); + if (ceph_snap(inode) == CEPH_NOSNAP) + stat->dev = inode->i_sb->s_dev; else - stat->dev = 0; + stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0; + if (S_ISDIR(inode->i_mode)) { if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 163fc74bf221..21c33ed048ed 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -20,6 +20,8 @@ #include <linux/ceph/auth.h> #include <linux/ceph/debugfs.h> +#define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE) + /* * A cluster of MDS (metadata server) daemons is responsible for * managing the file system namespace (the directory hierarchy and @@ -46,13 +48,17 @@ */ struct ceph_reconnect_state { - int nr_caps; + struct ceph_mds_session *session; + int nr_caps, nr_realms; struct ceph_pagelist *pagelist; unsigned msg_version; + bool allow_multi; }; static void __wake_requests(struct ceph_mds_client *mdsc, struct list_head *head); +static void ceph_cap_release_work(struct work_struct *work); +static void ceph_cap_reclaim_work(struct work_struct *work); static const struct ceph_connection_operations mds_con_ops; @@ -61,6 +67,29 @@ static const struct ceph_connection_operations mds_con_ops; * mds reply parsing */ +static int parse_reply_info_quota(void **p, void *end, + struct ceph_mds_reply_info_in *info) +{ + u8 struct_v, struct_compat; + u32 struct_len; + + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only + * understand encoding with struct_compat == 1. */ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; + ceph_decode_64_safe(p, end, info->max_bytes, bad); + ceph_decode_64_safe(p, end, info->max_files, bad); + *p = end; + return 0; +bad: + return -EIO; +} + /* * parse individual inode info */ @@ -68,8 +97,24 @@ static int parse_reply_info_in(void **p, void *end, struct ceph_mds_reply_info_in *info, u64 features) { - int err = -EIO; + int err = 0; + u8 struct_v = 0; + if (features == (u64)-1) { + u32 struct_len; + u8 struct_compat; + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only understand + * encoding with struct_compat == 1. */ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; + } + + ceph_decode_need(p, end, sizeof(struct ceph_mds_reply_inode), bad); info->in = *p; *p += sizeof(struct ceph_mds_reply_inode) + sizeof(*info->in->fragtree.splits) * @@ -87,49 +132,136 @@ static int parse_reply_info_in(void **p, void *end, info->xattr_data = *p; *p += info->xattr_len; - if (features & CEPH_FEATURE_MDS_INLINE_DATA) { + if (features == (u64)-1) { + /* inline data */ ceph_decode_64_safe(p, end, info->inline_version, bad); ceph_decode_32_safe(p, end, info->inline_len, bad); ceph_decode_need(p, end, info->inline_len, bad); info->inline_data = *p; *p += info->inline_len; - } else - info->inline_version = CEPH_INLINE_NONE; + /* quota */ + err = parse_reply_info_quota(p, end, info); + if (err < 0) + goto out_bad; + /* pool namespace */ + ceph_decode_32_safe(p, end, info->pool_ns_len, bad); + if (info->pool_ns_len > 0) { + ceph_decode_need(p, end, info->pool_ns_len, bad); + info->pool_ns_data = *p; + *p += info->pool_ns_len; + } + /* btime, change_attr */ + { + struct ceph_timespec btime; + u64 change_attr; + ceph_decode_need(p, end, sizeof(btime), bad); + ceph_decode_copy(p, &btime, sizeof(btime)); + ceph_decode_64_safe(p, end, change_attr, bad); + } + + /* dir pin */ + if (struct_v >= 2) { + ceph_decode_32_safe(p, end, info->dir_pin, bad); + } else { + info->dir_pin = -ENODATA; + } + + *p = end; + } else { + if (features & CEPH_FEATURE_MDS_INLINE_DATA) { + ceph_decode_64_safe(p, end, info->inline_version, bad); + ceph_decode_32_safe(p, end, info->inline_len, bad); + ceph_decode_need(p, end, info->inline_len, bad); + info->inline_data = *p; + *p += info->inline_len; + } else + info->inline_version = CEPH_INLINE_NONE; + + if (features & CEPH_FEATURE_MDS_QUOTA) { + err = parse_reply_info_quota(p, end, info); + if (err < 0) + goto out_bad; + } else { + info->max_bytes = 0; + info->max_files = 0; + } + + info->pool_ns_len = 0; + info->pool_ns_data = NULL; + if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { + ceph_decode_32_safe(p, end, info->pool_ns_len, bad); + if (info->pool_ns_len > 0) { + ceph_decode_need(p, end, info->pool_ns_len, bad); + info->pool_ns_data = *p; + *p += info->pool_ns_len; + } + } - if (features & CEPH_FEATURE_MDS_QUOTA) { + info->dir_pin = -ENODATA; + } + return 0; +bad: + err = -EIO; +out_bad: + return err; +} + +static int parse_reply_info_dir(void **p, void *end, + struct ceph_mds_reply_dirfrag **dirfrag, + u64 features) +{ + if (features == (u64)-1) { u8 struct_v, struct_compat; u32 struct_len; - - /* - * both struct_v and struct_compat are expected to be >= 1 - */ ceph_decode_8_safe(p, end, struct_v, bad); ceph_decode_8_safe(p, end, struct_compat, bad); - if (!struct_v || !struct_compat) + /* struct_v is expected to be >= 1. we only understand + * encoding whose struct_compat == 1. */ + if (!struct_v || struct_compat != 1) goto bad; ceph_decode_32_safe(p, end, struct_len, bad); ceph_decode_need(p, end, struct_len, bad); - ceph_decode_64_safe(p, end, info->max_bytes, bad); - ceph_decode_64_safe(p, end, info->max_files, bad); - } else { - info->max_bytes = 0; - info->max_files = 0; + end = *p + struct_len; } - info->pool_ns_len = 0; - info->pool_ns_data = NULL; - if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { - ceph_decode_32_safe(p, end, info->pool_ns_len, bad); - if (info->pool_ns_len > 0) { - ceph_decode_need(p, end, info->pool_ns_len, bad); - info->pool_ns_data = *p; - *p += info->pool_ns_len; - } + ceph_decode_need(p, end, sizeof(**dirfrag), bad); + *dirfrag = *p; + *p += sizeof(**dirfrag) + sizeof(u32) * le32_to_cpu((*dirfrag)->ndist); + if (unlikely(*p > end)) + goto bad; + if (features == (u64)-1) + *p = end; + return 0; +bad: + return -EIO; +} + +static int parse_reply_info_lease(void **p, void *end, + struct ceph_mds_reply_lease **lease, + u64 features) +{ + if (features == (u64)-1) { + u8 struct_v, struct_compat; + u32 struct_len; + ceph_decode_8_safe(p, end, struct_v, bad); + ceph_decode_8_safe(p, end, struct_compat, bad); + /* struct_v is expected to be >= 1. we only understand + * encoding whose struct_compat == 1. */ + if (!struct_v || struct_compat != 1) + goto bad; + ceph_decode_32_safe(p, end, struct_len, bad); + ceph_decode_need(p, end, struct_len, bad); + end = *p + struct_len; } + ceph_decode_need(p, end, sizeof(**lease), bad); + *lease = *p; + *p += sizeof(**lease); + if (features == (u64)-1) + *p = end; return 0; bad: - return err; + return -EIO; } /* @@ -147,20 +279,18 @@ static int parse_reply_info_trace(void **p, void *end, if (err < 0) goto out_bad; - if (unlikely(*p + sizeof(*info->dirfrag) > end)) - goto bad; - info->dirfrag = *p; - *p += sizeof(*info->dirfrag) + - sizeof(u32)*le32_to_cpu(info->dirfrag->ndist); - if (unlikely(*p > end)) - goto bad; + err = parse_reply_info_dir(p, end, &info->dirfrag, features); + if (err < 0) + goto out_bad; ceph_decode_32_safe(p, end, info->dname_len, bad); ceph_decode_need(p, end, info->dname_len, bad); info->dname = *p; *p += info->dname_len; - info->dlease = *p; - *p += sizeof(*info->dlease); + + err = parse_reply_info_lease(p, end, &info->dlease, features); + if (err < 0) + goto out_bad; } if (info->head->is_target) { @@ -183,20 +313,16 @@ out_bad: /* * parse readdir results */ -static int parse_reply_info_dir(void **p, void *end, +static int parse_reply_info_readdir(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { u32 num, i = 0; int err; - info->dir_dir = *p; - if (*p + sizeof(*info->dir_dir) > end) - goto bad; - *p += sizeof(*info->dir_dir) + - sizeof(u32)*le32_to_cpu(info->dir_dir->ndist); - if (*p > end) - goto bad; + err = parse_reply_info_dir(p, end, &info->dir_dir, features); + if (err < 0) + goto out_bad; ceph_decode_need(p, end, sizeof(num) + 2, bad); num = ceph_decode_32(p); @@ -222,15 +348,16 @@ static int parse_reply_info_dir(void **p, void *end, while (num) { struct ceph_mds_reply_dir_entry *rde = info->dir_entries + i; /* dentry */ - ceph_decode_need(p, end, sizeof(u32)*2, bad); - rde->name_len = ceph_decode_32(p); + ceph_decode_32_safe(p, end, rde->name_len, bad); ceph_decode_need(p, end, rde->name_len, bad); rde->name = *p; *p += rde->name_len; dout("parsed dir dname '%.*s'\n", rde->name_len, rde->name); - rde->lease = *p; - *p += sizeof(struct ceph_mds_reply_lease); + /* dentry lease */ + err = parse_reply_info_lease(p, end, &rde->lease, features); + if (err) + goto out_bad; /* inode */ err = parse_reply_info_in(p, end, &rde->inode, features); if (err < 0) @@ -281,7 +408,8 @@ static int parse_reply_info_create(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { - if (features & CEPH_FEATURE_REPLY_CREATE_INODE) { + if (features == (u64)-1 || + (features & CEPH_FEATURE_REPLY_CREATE_INODE)) { if (*p == end) { info->has_create_ino = false; } else { @@ -310,7 +438,7 @@ static int parse_reply_info_extra(void **p, void *end, if (op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) - return parse_reply_info_dir(p, end, info, features); + return parse_reply_info_readdir(p, end, info, features); else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features); else @@ -494,7 +622,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); spin_lock_init(&s->s_gen_ttl_lock); - s->s_cap_gen = 0; + s->s_cap_gen = 1; s->s_cap_ttl = jiffies - 1; spin_lock_init(&s->s_cap_lock); @@ -510,6 +638,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_cap_reconnect = 0; s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); + INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work); + INIT_LIST_HEAD(&s->s_cap_flushing); mdsc->sessions[mds] = s; @@ -535,6 +665,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc, dout("__unregister_session mds%d %p\n", s->s_mds, s); BUG_ON(mdsc->sessions[s->s_mds] != s); mdsc->sessions[s->s_mds] = NULL; + s->s_state = 0; ceph_con_close(&s->s_con); ceph_put_mds_session(s); atomic_dec(&mdsc->num_sessions); @@ -1197,13 +1328,10 @@ static int iterate_session_caps(struct ceph_mds_session *session, cap->session = NULL; list_del_init(&cap->session_caps); session->s_nr_caps--; - if (cap->queue_release) { - list_add_tail(&cap->session_caps, - &session->s_cap_releases); - session->s_num_cap_releases++; - } else { + if (cap->queue_release) + __ceph_queue_cap_release(session, cap); + else old_cap = cap; /* put_cap it w/o locks held */ - } } if (ret < 0) goto out; @@ -1638,7 +1766,7 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, session->s_trim_caps = 0; } - ceph_send_cap_releases(mdsc, session); + ceph_flush_cap_releases(mdsc, session); return 0; } @@ -1681,8 +1809,8 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, /* * called under s_mutex */ -void ceph_send_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +static void ceph_send_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) { struct ceph_msg *msg = NULL; struct ceph_mds_cap_release *head; @@ -1774,6 +1902,81 @@ out_err: spin_unlock(&session->s_cap_lock); } +static void ceph_cap_release_work(struct work_struct *work) +{ + struct ceph_mds_session *session = + container_of(work, struct ceph_mds_session, s_cap_release_work); + + mutex_lock(&session->s_mutex); + if (session->s_state == CEPH_MDS_SESSION_OPEN || + session->s_state == CEPH_MDS_SESSION_HUNG) + ceph_send_cap_releases(session->s_mdsc, session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); +} + +void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + if (mdsc->stopping) + return; + + get_session(session); + if (queue_work(mdsc->fsc->cap_wq, + &session->s_cap_release_work)) { + dout("cap release work queued\n"); + } else { + ceph_put_mds_session(session); + dout("failed to queue cap release work\n"); + } +} + +/* + * caller holds session->s_cap_lock + */ +void __ceph_queue_cap_release(struct ceph_mds_session *session, + struct ceph_cap *cap) +{ + list_add_tail(&cap->session_caps, &session->s_cap_releases); + session->s_num_cap_releases++; + + if (!(session->s_num_cap_releases % CEPH_CAPS_PER_RELEASE)) + ceph_flush_cap_releases(session->s_mdsc, session); +} + +static void ceph_cap_reclaim_work(struct work_struct *work) +{ + struct ceph_mds_client *mdsc = + container_of(work, struct ceph_mds_client, cap_reclaim_work); + int ret = ceph_trim_dentries(mdsc); + if (ret == -EAGAIN) + ceph_queue_cap_reclaim_work(mdsc); +} + +void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc) +{ + if (mdsc->stopping) + return; + + if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_reclaim_work)) { + dout("caps reclaim work queued\n"); + } else { + dout("failed to queue caps release work\n"); + } +} + +void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr) +{ + int val; + if (!nr) + return; + val = atomic_add_return(nr, &mdsc->cap_reclaim_pending); + if (!(val % CEPH_CAPS_PER_RELEASE)) { + atomic_set(&mdsc->cap_reclaim_pending, 0); + ceph_queue_cap_reclaim_work(mdsc); + } +} + /* * requests */ @@ -2653,7 +2856,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) dout("handle_reply tid %lld result %d\n", tid, result); rinfo = &req->r_reply_info; - err = parse_reply_info(msg, rinfo, session->s_con.peer_features); + if (test_bit(CEPHFS_FEATURE_REPLY_ENCODING, &session->s_features)) + err = parse_reply_info(msg, rinfo, (u64)-1); + else + err = parse_reply_info(msg, rinfo, session->s_con.peer_features); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); @@ -2684,7 +2890,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || req->r_op == CEPH_MDS_OP_LSSNAP)) ceph_readdir_prepopulate(req, req->r_session); - ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } current->journal_info = NULL; mutex_unlock(&req->r_fill_mutex); @@ -2693,12 +2898,18 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) if (realm) ceph_put_snap_realm(mdsc, realm); - if (err == 0 && req->r_target_inode && - test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { - struct ceph_inode_info *ci = ceph_inode(req->r_target_inode); - spin_lock(&ci->i_unsafe_lock); - list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops); - spin_unlock(&ci->i_unsafe_lock); + if (err == 0) { + if (req->r_target_inode && + test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { + struct ceph_inode_info *ci = + ceph_inode(req->r_target_inode); + spin_lock(&ci->i_unsafe_lock); + list_add_tail(&req->r_unsafe_target_item, + &ci->i_unsafe_iops); + spin_unlock(&ci->i_unsafe_lock); + } + + ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } out_err: mutex_lock(&mdsc->mutex); @@ -2777,6 +2988,25 @@ bad: pr_err("mdsc_handle_forward decode error err=%d\n", err); } +static int __decode_and_drop_session_metadata(void **p, void *end) +{ + /* map<string,string> */ + u32 n; + ceph_decode_32_safe(p, end, n, bad); + while (n-- > 0) { + u32 len; + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_need(p, end, len, bad); + *p += len; + ceph_decode_32_safe(p, end, len, bad); + ceph_decode_need(p, end, len, bad); + *p += len; + } + return 0; +bad: + return -1; +} + /* * handle a mds session control message */ @@ -2784,18 +3014,36 @@ static void handle_session(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; + int mds = session->s_mds; + int msg_version = le16_to_cpu(msg->hdr.version); + void *p = msg->front.iov_base; + void *end = p + msg->front.iov_len; + struct ceph_mds_session_head *h; u32 op; u64 seq; - int mds = session->s_mds; - struct ceph_mds_session_head *h = msg->front.iov_base; + unsigned long features = 0; int wake = 0; /* decode */ - if (msg->front.iov_len < sizeof(*h)) - goto bad; + ceph_decode_need(&p, end, sizeof(*h), bad); + h = p; + p += sizeof(*h); + op = le32_to_cpu(h->op); seq = le64_to_cpu(h->seq); + if (msg_version >= 3) { + u32 len; + /* version >= 2, metadata */ + if (__decode_and_drop_session_metadata(&p, end) < 0) + goto bad; + /* version >= 3, feature bits */ + ceph_decode_32_safe(&p, end, len, bad); + ceph_decode_need(&p, end, len, bad); + memcpy(&features, p, min_t(size_t, len, sizeof(features))); + p += len; + } + mutex_lock(&mdsc->mutex); if (op == CEPH_SESSION_CLOSE) { get_session(session); @@ -2821,6 +3069,7 @@ static void handle_session(struct ceph_mds_session *session, if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info("mds%d reconnect success\n", session->s_mds); session->s_state = CEPH_MDS_SESSION_OPEN; + session->s_features = features; renewed_caps(mdsc, session, 0); wake = 1; if (mdsc->stopping) @@ -2947,6 +3196,82 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, mutex_unlock(&mdsc->mutex); } +static int send_reconnect_partial(struct ceph_reconnect_state *recon_state) +{ + struct ceph_msg *reply; + struct ceph_pagelist *_pagelist; + struct page *page; + __le32 *addr; + int err = -ENOMEM; + + if (!recon_state->allow_multi) + return -ENOSPC; + + /* can't handle message that contains both caps and realm */ + BUG_ON(!recon_state->nr_caps == !recon_state->nr_realms); + + /* pre-allocate new pagelist */ + _pagelist = ceph_pagelist_alloc(GFP_NOFS); + if (!_pagelist) + return -ENOMEM; + + reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); + if (!reply) + goto fail_msg; + + /* placeholder for nr_caps */ + err = ceph_pagelist_encode_32(_pagelist, 0); + if (err < 0) + goto fail; + + if (recon_state->nr_caps) { + /* currently encoding caps */ + err = ceph_pagelist_encode_32(recon_state->pagelist, 0); + if (err) + goto fail; + } else { + /* placeholder for nr_realms (currently encoding relams) */ + err = ceph_pagelist_encode_32(_pagelist, 0); + if (err < 0) + goto fail; + } + + err = ceph_pagelist_encode_8(recon_state->pagelist, 1); + if (err) + goto fail; + + page = list_first_entry(&recon_state->pagelist->head, struct page, lru); + addr = kmap_atomic(page); + if (recon_state->nr_caps) { + /* currently encoding caps */ + *addr = cpu_to_le32(recon_state->nr_caps); + } else { + /* currently encoding relams */ + *(addr + 1) = cpu_to_le32(recon_state->nr_realms); + } + kunmap_atomic(addr); + + reply->hdr.version = cpu_to_le16(5); + reply->hdr.compat_version = cpu_to_le16(4); + + reply->hdr.data_len = cpu_to_le32(recon_state->pagelist->length); + ceph_msg_data_add_pagelist(reply, recon_state->pagelist); + + ceph_con_send(&recon_state->session->s_con, reply); + ceph_pagelist_release(recon_state->pagelist); + + recon_state->pagelist = _pagelist; + recon_state->nr_caps = 0; + recon_state->nr_realms = 0; + recon_state->msg_version = 5; + return 0; +fail: + ceph_msg_put(reply); +fail_msg: + ceph_pagelist_release(_pagelist); + return err; +} + /* * Encode information about a cap for a reconnect with the MDS. */ @@ -2966,9 +3291,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, dout(" adding %p ino %llx.%llx cap %p %lld %s\n", inode, ceph_vinop(inode), cap, cap->cap_id, ceph_cap_string(cap->issued)); - err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); - if (err) - return err; spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ @@ -3008,7 +3330,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->msg_version >= 2) { int num_fcntl_locks, num_flock_locks; struct ceph_filelock *flocks = NULL; - size_t struct_len, total_len = 0; + size_t struct_len, total_len = sizeof(u64); u8 struct_v = 0; encode_again: @@ -3043,7 +3365,7 @@ encode_again: if (recon_state->msg_version >= 3) { /* version, compat_version and struct_len */ - total_len = 2 * sizeof(u8) + sizeof(u32); + total_len += 2 * sizeof(u8) + sizeof(u32); struct_v = 2; } /* @@ -3060,12 +3382,19 @@ encode_again: struct_len += sizeof(u64); /* snap_follows */ total_len += struct_len; - err = ceph_pagelist_reserve(pagelist, total_len); - if (err) { - kfree(flocks); - goto out_err; + + if (pagelist->length + total_len > RECONNECT_MAX_SIZE) { + err = send_reconnect_partial(recon_state); + if (err) + goto out_freeflocks; + pagelist = recon_state->pagelist; } + err = ceph_pagelist_reserve(pagelist, total_len); + if (err) + goto out_freeflocks; + + ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); if (recon_state->msg_version >= 3) { ceph_pagelist_encode_8(pagelist, struct_v); ceph_pagelist_encode_8(pagelist, 1); @@ -3077,7 +3406,7 @@ encode_again: num_fcntl_locks, num_flock_locks); if (struct_v >= 2) ceph_pagelist_encode_64(pagelist, snap_follows); - +out_freeflocks: kfree(flocks); } else { u64 pathbase = 0; @@ -3098,20 +3427,81 @@ encode_again: } err = ceph_pagelist_reserve(pagelist, - pathlen + sizeof(u32) + sizeof(rec.v1)); + sizeof(u64) + sizeof(u32) + + pathlen + sizeof(rec.v1)); if (err) { - kfree(path); - goto out_err; + goto out_freepath; } + ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); ceph_pagelist_encode_string(pagelist, path, pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); - +out_freepath: kfree(path); } - recon_state->nr_caps++; out_err: + if (err >= 0) + recon_state->nr_caps++; + return err; +} + +static int encode_snap_realms(struct ceph_mds_client *mdsc, + struct ceph_reconnect_state *recon_state) +{ + struct rb_node *p; + struct ceph_pagelist *pagelist = recon_state->pagelist; + int err = 0; + + if (recon_state->msg_version >= 4) { + err = ceph_pagelist_encode_32(pagelist, mdsc->num_snap_realms); + if (err < 0) + goto fail; + } + + /* + * snaprealms. we provide mds with the ino, seq (version), and + * parent for all of our realms. If the mds has any newer info, + * it will tell us. + */ + for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { + struct ceph_snap_realm *realm = + rb_entry(p, struct ceph_snap_realm, node); + struct ceph_mds_snaprealm_reconnect sr_rec; + + if (recon_state->msg_version >= 4) { + size_t need = sizeof(u8) * 2 + sizeof(u32) + + sizeof(sr_rec); + + if (pagelist->length + need > RECONNECT_MAX_SIZE) { + err = send_reconnect_partial(recon_state); + if (err) + goto fail; + pagelist = recon_state->pagelist; + } + + err = ceph_pagelist_reserve(pagelist, need); + if (err) + goto fail; + + ceph_pagelist_encode_8(pagelist, 1); + ceph_pagelist_encode_8(pagelist, 1); + ceph_pagelist_encode_32(pagelist, sizeof(sr_rec)); + } + + dout(" adding snap realm %llx seq %lld parent %llx\n", + realm->ino, realm->seq, realm->parent_ino); + sr_rec.ino = cpu_to_le64(realm->ino); + sr_rec.seq = cpu_to_le64(realm->seq); + sr_rec.parent = cpu_to_le64(realm->parent_ino); + + err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); + if (err) + goto fail; + + recon_state->nr_realms++; + } +fail: return err; } @@ -3132,18 +3522,17 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_msg *reply; - struct rb_node *p; int mds = session->s_mds; int err = -ENOMEM; - int s_nr_caps; - struct ceph_pagelist *pagelist; - struct ceph_reconnect_state recon_state; + struct ceph_reconnect_state recon_state = { + .session = session, + }; LIST_HEAD(dispose); pr_info("mds%d reconnect start\n", mds); - pagelist = ceph_pagelist_alloc(GFP_NOFS); - if (!pagelist) + recon_state.pagelist = ceph_pagelist_alloc(GFP_NOFS); + if (!recon_state.pagelist) goto fail_nopagelist; reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); @@ -3187,63 +3576,90 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, /* replay unsafe requests */ replay_unsafe_requests(mdsc, session); + ceph_early_kick_flushing_caps(mdsc, session); + down_read(&mdsc->snap_rwsem); - /* traverse this session's caps */ - s_nr_caps = session->s_nr_caps; - err = ceph_pagelist_encode_32(pagelist, s_nr_caps); + /* placeholder for nr_caps */ + err = ceph_pagelist_encode_32(recon_state.pagelist, 0); if (err) goto fail; - recon_state.nr_caps = 0; - recon_state.pagelist = pagelist; - if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) + if (test_bit(CEPHFS_FEATURE_MULTI_RECONNECT, &session->s_features)) { recon_state.msg_version = 3; - else + recon_state.allow_multi = true; + } else if (session->s_con.peer_features & CEPH_FEATURE_MDSENC) { + recon_state.msg_version = 3; + } else { recon_state.msg_version = 2; + } + /* trsaverse this session's caps */ err = iterate_session_caps(session, encode_caps_cb, &recon_state); - if (err < 0) - goto fail; spin_lock(&session->s_cap_lock); session->s_cap_reconnect = 0; spin_unlock(&session->s_cap_lock); - /* - * snaprealms. we provide mds with the ino, seq (version), and - * parent for all of our realms. If the mds has any newer info, - * it will tell us. - */ - for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) { - struct ceph_snap_realm *realm = - rb_entry(p, struct ceph_snap_realm, node); - struct ceph_mds_snaprealm_reconnect sr_rec; + if (err < 0) + goto fail; - dout(" adding snap realm %llx seq %lld parent %llx\n", - realm->ino, realm->seq, realm->parent_ino); - sr_rec.ino = cpu_to_le64(realm->ino); - sr_rec.seq = cpu_to_le64(realm->seq); - sr_rec.parent = cpu_to_le64(realm->parent_ino); - err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); - if (err) - goto fail; + /* check if all realms can be encoded into current message */ + if (mdsc->num_snap_realms) { + size_t total_len = + recon_state.pagelist->length + + mdsc->num_snap_realms * + sizeof(struct ceph_mds_snaprealm_reconnect); + if (recon_state.msg_version >= 4) { + /* number of realms */ + total_len += sizeof(u32); + /* version, compat_version and struct_len */ + total_len += mdsc->num_snap_realms * + (2 * sizeof(u8) + sizeof(u32)); + } + if (total_len > RECONNECT_MAX_SIZE) { + if (!recon_state.allow_multi) { + err = -ENOSPC; + goto fail; + } + if (recon_state.nr_caps) { + err = send_reconnect_partial(&recon_state); + if (err) + goto fail; + } + recon_state.msg_version = 5; + } } - reply->hdr.version = cpu_to_le16(recon_state.msg_version); + err = encode_snap_realms(mdsc, &recon_state); + if (err < 0) + goto fail; - /* raced with cap release? */ - if (s_nr_caps != recon_state.nr_caps) { - struct page *page = list_first_entry(&pagelist->head, - struct page, lru); + if (recon_state.msg_version >= 5) { + err = ceph_pagelist_encode_8(recon_state.pagelist, 0); + if (err < 0) + goto fail; + } + + if (recon_state.nr_caps || recon_state.nr_realms) { + struct page *page = + list_first_entry(&recon_state.pagelist->head, + struct page, lru); __le32 *addr = kmap_atomic(page); - *addr = cpu_to_le32(recon_state.nr_caps); + if (recon_state.nr_caps) { + WARN_ON(recon_state.nr_realms != mdsc->num_snap_realms); + *addr = cpu_to_le32(recon_state.nr_caps); + } else if (recon_state.msg_version >= 4) { + *(addr + 1) = cpu_to_le32(recon_state.nr_realms); + } kunmap_atomic(addr); } - reply->hdr.data_len = cpu_to_le32(pagelist->length); - ceph_msg_data_add_pagelist(reply, pagelist); + reply->hdr.version = cpu_to_le16(recon_state.msg_version); + if (recon_state.msg_version >= 4) + reply->hdr.compat_version = cpu_to_le16(4); - ceph_early_kick_flushing_caps(mdsc, session); + reply->hdr.data_len = cpu_to_le32(recon_state.pagelist->length); + ceph_msg_data_add_pagelist(reply, recon_state.pagelist); ceph_con_send(&session->s_con, reply); @@ -3254,7 +3670,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, mutex_unlock(&mdsc->mutex); up_read(&mdsc->snap_rwsem); - ceph_pagelist_release(pagelist); + ceph_pagelist_release(recon_state.pagelist); return; fail: @@ -3262,7 +3678,7 @@ fail: up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); fail_nomsg: - ceph_pagelist_release(pagelist); + ceph_pagelist_release(recon_state.pagelist); fail_nopagelist: pr_err("error %d preparing reconnect for mds%d\n", err, mds); return; @@ -3580,7 +3996,6 @@ static void delayed_work(struct work_struct *work) int renew_caps; dout("mdsc delayed_work\n"); - ceph_check_delayed_caps(mdsc); mutex_lock(&mdsc->mutex); renew_interval = mdsc->mdsmap->m_session_timeout >> 2; @@ -3628,6 +4043,12 @@ static void delayed_work(struct work_struct *work) } mutex_unlock(&mdsc->mutex); + ceph_check_delayed_caps(mdsc); + + ceph_queue_cap_reclaim_work(mdsc); + + ceph_trim_snapid_map(mdsc); + schedule_delayed(mdsc); } @@ -3660,6 +4081,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; INIT_LIST_HEAD(&mdsc->snap_empty); + mdsc->num_snap_realms = 0; spin_lock_init(&mdsc->snap_empty_lock); mdsc->last_tid = 0; mdsc->oldest_tid = 0; @@ -3677,11 +4099,19 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) mdsc->num_cap_flushing = 0; spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); - spin_lock_init(&mdsc->dentry_lru_lock); - INIT_LIST_HEAD(&mdsc->dentry_lru); + INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work); + atomic_set(&mdsc->cap_reclaim_pending, 0); + + spin_lock_init(&mdsc->dentry_list_lock); + INIT_LIST_HEAD(&mdsc->dentry_leases); + INIT_LIST_HEAD(&mdsc->dentry_dir_leases); ceph_caps_init(mdsc); - ceph_adjust_min_caps(mdsc, fsc->min_caps); + ceph_adjust_caps_max_min(mdsc, fsc->mount_options); + + spin_lock_init(&mdsc->snapid_map_lock); + mdsc->snapid_map_tree = RB_ROOT; + INIT_LIST_HEAD(&mdsc->snapid_map_lru); init_rwsem(&mdsc->pool_perm_rwsem); mdsc->pool_perm_tree = RB_ROOT; @@ -3876,8 +4306,10 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) WARN_ON(!list_empty(&mdsc->cap_delay_list)); mutex_unlock(&mdsc->mutex); + ceph_cleanup_snapid_map(mdsc); ceph_cleanup_empty_realms(mdsc); + cancel_work_sync(&mdsc->cap_reclaim_work); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ dout("stopped\n"); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 729da155ebf0..50385a481fdb 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -21,11 +21,14 @@ #define CEPHFS_FEATURE_REPLY_ENCODING 9 #define CEPHFS_FEATURE_RECLAIM_CLIENT 10 #define CEPHFS_FEATURE_LAZY_CAP_WANTED 11 +#define CEPHFS_FEATURE_MULTI_RECONNECT 12 #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ 0, 1, 2, 3, 4, 5, 6, 7, \ CEPHFS_FEATURE_MIMIC, \ + CEPHFS_FEATURE_REPLY_ENCODING, \ CEPHFS_FEATURE_LAZY_CAP_WANTED, \ + CEPHFS_FEATURE_MULTI_RECONNECT, \ } #define CEPHFS_FEATURES_CLIENT_REQUIRED {} @@ -65,6 +68,7 @@ struct ceph_mds_reply_info_in { char *pool_ns_data; u64 max_bytes; u64 max_files; + s32 dir_pin; }; struct ceph_mds_reply_dir_entry { @@ -152,6 +156,7 @@ struct ceph_mds_session { int s_mds; int s_state; unsigned long s_ttl; /* time until mds kills us */ + unsigned long s_features; u64 s_seq; /* incoming msg seq # */ struct mutex s_mutex; /* serialize session messages */ @@ -167,19 +172,20 @@ struct ceph_mds_session { /* protected by s_cap_lock */ spinlock_t s_cap_lock; struct list_head s_caps; /* all caps issued by this session */ + struct ceph_cap *s_cap_iterator; int s_nr_caps, s_trim_caps; int s_num_cap_releases; int s_cap_reconnect; int s_readonly; struct list_head s_cap_releases; /* waiting cap_release messages */ - struct ceph_cap *s_cap_iterator; + struct work_struct s_cap_release_work; /* protected by mutex */ struct list_head s_cap_flushing; /* inodes w/ flushing caps */ unsigned long s_renew_requested; /* last time we sent a renew req */ u64 s_renew_seq; - refcount_t s_ref; + refcount_t s_ref; struct list_head s_waiting; /* waiting requests */ struct list_head s_unsafe; /* unsafe requests */ }; @@ -310,6 +316,15 @@ struct ceph_pool_perm { char pool_ns[]; }; +struct ceph_snapid_map { + struct rb_node node; + struct list_head lru; + atomic_t ref; + u64 snap; + dev_t dev; + unsigned long last_used; +}; + /* * mds client state */ @@ -341,6 +356,7 @@ struct ceph_mds_client { struct rw_semaphore snap_rwsem; struct rb_root snap_realms; struct list_head snap_empty; + int num_snap_realms; spinlock_t snap_empty_lock; /* protect snap_empty */ u64 last_tid; /* most recent mds request */ @@ -362,6 +378,9 @@ struct ceph_mds_client { spinlock_t cap_dirty_lock; /* protects above items */ wait_queue_head_t cap_flushing_wq; + struct work_struct cap_reclaim_work; + atomic_t cap_reclaim_pending; + /* * Cap reservations * @@ -378,13 +397,18 @@ struct ceph_mds_client { unreserved) */ int caps_total_count; /* total caps allocated */ int caps_use_count; /* in use */ + int caps_use_max; /* max used caps */ int caps_reserve_count; /* unused, reserved */ int caps_avail_count; /* unused, unreserved */ int caps_min_count; /* keep at least this many (unreserved) */ - spinlock_t dentry_lru_lock; - struct list_head dentry_lru; - int num_dentry; + spinlock_t dentry_list_lock; + struct list_head dentry_leases; /* fifo list */ + struct list_head dentry_dir_leases; /* lru list */ + + spinlock_t snapid_map_lock; + struct rb_root snapid_map_tree; + struct list_head snapid_map_lru; struct rw_semaphore pool_perm_rwsem; struct rb_root pool_perm_tree; @@ -438,9 +462,12 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) kref_put(&req->r_kref, ceph_mdsc_release_request); } -extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session); - +extern void __ceph_queue_cap_release(struct ceph_mds_session *session, + struct ceph_cap *cap); +extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); +extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); +extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index f74193da0e09..89aa37fa0f84 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -3,12 +3,13 @@ #include <linux/sort.h> #include <linux/slab.h> - #include "super.h" #include "mds_client.h" - #include <linux/ceph/decode.h> +/* unused map expires after 5 minutes */ +#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ) + /* * Snapshots in ceph are driven in large part by cooperation from the * client. In contrast to local file systems or file servers that @@ -124,6 +125,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm( INIT_LIST_HEAD(&realm->inodes_with_caps); spin_lock_init(&realm->inodes_with_caps_lock); __insert_snap_realm(&mdsc->snap_realms, realm); + mdsc->num_snap_realms++; + dout("create_snap_realm %llx %p\n", realm->ino, realm); return realm; } @@ -175,6 +178,7 @@ static void __destroy_snap_realm(struct ceph_mds_client *mdsc, dout("__destroy_snap_realm %p %llx\n", realm, realm->ino); rb_erase(&realm->node, &mdsc->snap_realms); + mdsc->num_snap_realms--; if (realm->parent) { list_del_init(&realm->child_item); @@ -986,3 +990,154 @@ out: up_write(&mdsc->snap_rwsem); return; } + +struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, + u64 snap) +{ + struct ceph_snapid_map *sm, *exist; + struct rb_node **p, *parent; + int ret; + + exist = NULL; + spin_lock(&mdsc->snapid_map_lock); + p = &mdsc->snapid_map_tree.rb_node; + while (*p) { + exist = rb_entry(*p, struct ceph_snapid_map, node); + if (snap > exist->snap) { + p = &(*p)->rb_left; + } else if (snap < exist->snap) { + p = &(*p)->rb_right; + } else { + if (atomic_inc_return(&exist->ref) == 1) + list_del_init(&exist->lru); + break; + } + exist = NULL; + } + spin_unlock(&mdsc->snapid_map_lock); + if (exist) { + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + return exist; + } + + sm = kmalloc(sizeof(*sm), GFP_NOFS); + if (!sm) + return NULL; + + ret = get_anon_bdev(&sm->dev); + if (ret < 0) { + kfree(sm); + return NULL; + } + + INIT_LIST_HEAD(&sm->lru); + atomic_set(&sm->ref, 1); + sm->snap = snap; + + exist = NULL; + parent = NULL; + p = &mdsc->snapid_map_tree.rb_node; + spin_lock(&mdsc->snapid_map_lock); + while (*p) { + parent = *p; + exist = rb_entry(*p, struct ceph_snapid_map, node); + if (snap > exist->snap) + p = &(*p)->rb_left; + else if (snap < exist->snap) + p = &(*p)->rb_right; + else + break; + exist = NULL; + } + if (exist) { + if (atomic_inc_return(&exist->ref) == 1) + list_del_init(&exist->lru); + } else { + rb_link_node(&sm->node, parent, p); + rb_insert_color(&sm->node, &mdsc->snapid_map_tree); + } + spin_unlock(&mdsc->snapid_map_lock); + if (exist) { + free_anon_bdev(sm->dev); + kfree(sm); + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + return exist; + } + + dout("create snapid map %llx -> %x\n", sm->snap, sm->dev); + return sm; +} + +void ceph_put_snapid_map(struct ceph_mds_client* mdsc, + struct ceph_snapid_map *sm) +{ + if (!sm) + return; + if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) { + if (!RB_EMPTY_NODE(&sm->node)) { + sm->last_used = jiffies; + list_add_tail(&sm->lru, &mdsc->snapid_map_lru); + spin_unlock(&mdsc->snapid_map_lock); + } else { + /* already cleaned up by + * ceph_cleanup_snapid_map() */ + spin_unlock(&mdsc->snapid_map_lock); + kfree(sm); + } + } +} + +void ceph_trim_snapid_map(struct ceph_mds_client *mdsc) +{ + struct ceph_snapid_map *sm; + unsigned long now; + LIST_HEAD(to_free); + + spin_lock(&mdsc->snapid_map_lock); + now = jiffies; + + while (!list_empty(&mdsc->snapid_map_lru)) { + sm = list_first_entry(&mdsc->snapid_map_lru, + struct ceph_snapid_map, lru); + if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now)) + break; + + rb_erase(&sm->node, &mdsc->snapid_map_tree); + list_move(&sm->lru, &to_free); + } + spin_unlock(&mdsc->snapid_map_lock); + + while (!list_empty(&to_free)) { + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); + list_del(&sm->lru); + dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev); + free_anon_bdev(sm->dev); + kfree(sm); + } +} + +void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc) +{ + struct ceph_snapid_map *sm; + struct rb_node *p; + LIST_HEAD(to_free); + + spin_lock(&mdsc->snapid_map_lock); + while ((p = rb_first(&mdsc->snapid_map_tree))) { + sm = rb_entry(p, struct ceph_snapid_map, node); + rb_erase(p, &mdsc->snapid_map_tree); + RB_CLEAR_NODE(p); + list_move(&sm->lru, &to_free); + } + spin_unlock(&mdsc->snapid_map_lock); + + while (!list_empty(&to_free)) { + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); + list_del(&sm->lru); + free_anon_bdev(sm->dev); + if (WARN_ON_ONCE(atomic_read(&sm->ref))) { + pr_err("snapid map %llx -> %x still in use\n", + sm->snap, sm->dev); + } + } +} diff --git a/fs/ceph/super.c b/fs/ceph/super.c index da2cd8e89062..6d5bb2f74612 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -133,6 +133,7 @@ enum { Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, + Opt_caps_max, Opt_readdir_max_entries, Opt_readdir_max_bytes, Opt_congestion_kb, @@ -175,6 +176,7 @@ static match_table_t fsopt_tokens = { {Opt_rasize, "rasize=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, + {Opt_caps_max, "caps_max=%d"}, {Opt_readdir_max_entries, "readdir_max_entries=%d"}, {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, @@ -286,6 +288,11 @@ static int parse_fsopt_token(char *c, void *private) return -EINVAL; fsopt->caps_wanted_delay_max = intval; break; + case Opt_caps_max: + if (intval < 0) + return -EINVAL; + fsopt->caps_max = intval; + break; case Opt_readdir_max_entries: if (intval < 1) return -EINVAL; @@ -576,6 +583,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",rasize=%d", fsopt->rasize); if (fsopt->congestion_kb != default_congestion_kb()) seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); + if (fsopt->caps_max) + seq_printf(m, ",caps_max=%d", fsopt->caps_max); if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) seq_printf(m, ",caps_wanted_delay_min=%d", fsopt->caps_wanted_delay_min); @@ -671,6 +680,9 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); if (!fsc->trunc_wq) goto fail_pg_inv_wq; + fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1); + if (!fsc->cap_wq) + goto fail_trunc_wq; /* set up mempools */ err = -ENOMEM; @@ -678,13 +690,12 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, size = sizeof (struct page *) * (page_count ? page_count : 1); fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size); if (!fsc->wb_pagevec_pool) - goto fail_trunc_wq; - - /* caps */ - fsc->min_caps = fsopt->max_readdir; + goto fail_cap_wq; return fsc; +fail_cap_wq: + destroy_workqueue(fsc->cap_wq); fail_trunc_wq: destroy_workqueue(fsc->trunc_wq); fail_pg_inv_wq: @@ -706,6 +717,7 @@ static void flush_fs_workqueues(struct ceph_fs_client *fsc) flush_workqueue(fsc->wb_wq); flush_workqueue(fsc->pg_inv_wq); flush_workqueue(fsc->trunc_wq); + flush_workqueue(fsc->cap_wq); } static void destroy_fs_client(struct ceph_fs_client *fsc) @@ -715,6 +727,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) destroy_workqueue(fsc->wb_wq); destroy_workqueue(fsc->pg_inv_wq); destroy_workqueue(fsc->trunc_wq); + destroy_workqueue(fsc->cap_wq); mempool_destroy(fsc->wb_pagevec_pool); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index dfb64a5211b6..16c03188578e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -79,6 +79,7 @@ struct ceph_mount_options { int rasize; /* max readahead */ int congestion_kb; /* max writeback in flight */ int caps_wanted_delay_min, caps_wanted_delay_max; + int caps_max; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ @@ -100,17 +101,18 @@ struct ceph_fs_client { struct ceph_client *client; unsigned long mount_state; - int min_caps; /* min caps i added */ loff_t max_file_size; struct ceph_mds_client *mdsc; /* writeback */ mempool_t *wb_pagevec_pool; + atomic_long_t writeback_count; + struct workqueue_struct *wb_wq; struct workqueue_struct *pg_inv_wq; struct workqueue_struct *trunc_wq; - atomic_long_t writeback_count; + struct workqueue_struct *cap_wq; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_dentry_lru, *debugfs_caps; @@ -260,17 +262,22 @@ struct ceph_inode_xattr { * Ceph dentry state */ struct ceph_dentry_info { + struct dentry *dentry; struct ceph_mds_session *lease_session; + struct list_head lease_list; + unsigned flags; int lease_shared_gen; u32 lease_gen; u32 lease_seq; unsigned long lease_renew_after, lease_renew_from; - struct list_head lru; - struct dentry *dentry; unsigned long time; u64 offset; }; +#define CEPH_DENTRY_REFERENCED 1 +#define CEPH_DENTRY_LEASE_LIST 2 +#define CEPH_DENTRY_SHRINK_LIST 4 + struct ceph_inode_xattrs_info { /* * (still encoded) xattr blob. we avoid the overhead of parsing @@ -318,6 +325,8 @@ struct ceph_inode_info { /* quotas */ u64 i_max_bytes, i_max_files; + s32 i_dir_pin; + struct rb_root i_fragtree; int i_fragtree_nsplits; struct mutex i_fragtree_mutex; @@ -370,7 +379,10 @@ struct ceph_inode_info { struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ spinlock_t i_unsafe_lock; - struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ + union { + struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ + struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */ + }; int i_snap_realm_counter; /* snap realm (if caps) */ struct list_head i_snap_realm_item; struct list_head i_snap_flush_item; @@ -587,7 +599,7 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, struct ceph_inode_frag *pfrag, int *found); -static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) +static inline struct ceph_dentry_info *ceph_dentry(const struct dentry *dentry) { return (struct ceph_dentry_info *)dentry->d_fsdata; } @@ -656,7 +668,8 @@ extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check); extern void ceph_caps_init(struct ceph_mds_client *mdsc); extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); -extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); +extern void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, + struct ceph_mount_options *fsopt); extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc, @@ -837,6 +850,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); +extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, + u64 snap); +extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc, + struct ceph_snapid_map *sm); +extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc); +extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc); + + /* * a cap_snap is "pending" if it is still awaiting an in-progress * sync write (that may/may not still update size, mtime, etc.). @@ -975,11 +996,11 @@ extern void ceph_add_cap(struct inode *inode, unsigned cap, unsigned seq, u64 realmino, int flags, struct ceph_cap **new_cap); extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); +extern void __ceph_remove_caps(struct inode* inode); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); extern int ceph_is_any_caps(struct inode *inode); -extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync); @@ -1049,10 +1070,10 @@ extern int ceph_handle_snapdir(struct ceph_mds_request *req, extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); -extern void ceph_dentry_lru_add(struct dentry *dn); -extern void ceph_dentry_lru_touch(struct dentry *dn); -extern void ceph_dentry_lru_del(struct dentry *dn); +extern void __ceph_dentry_lease_touch(struct ceph_dentry_info *di); +extern void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern int ceph_trim_dentries(struct ceph_mds_client *mdsc); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 316f6ad10644..0cc42c8879e9 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -228,8 +228,19 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, ci->i_rctime.tv_nsec); } -/* quotas */ +/* dir pin */ +static bool ceph_vxattrcb_dir_pin_exists(struct ceph_inode_info *ci) +{ + return ci->i_dir_pin != -ENODATA; +} + +static size_t ceph_vxattrcb_dir_pin(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return snprintf(val, size, "%d", (int)ci->i_dir_pin); +} +/* quotas */ static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci) { bool ret = false; @@ -315,6 +326,13 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_RSTAT_FIELD(dir, rbytes), XATTR_RSTAT_FIELD(dir, rctime), { + .name = "ceph.dir.pin", + .name_size = sizeof("ceph.dir_pin"), + .getxattr_cb = ceph_vxattrcb_dir_pin, + .exists_cb = ceph_vxattrcb_dir_pin_exists, + .flags = VXATTR_FLAG_HIDDEN, + }, + { .name = "ceph.quota", .name_size = sizeof("ceph.quota"), .getxattr_cb = ceph_vxattrcb_quota, diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index e92a2fee3c57..13c1288b04a7 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -115,7 +115,12 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) seq_puts(m, " type: CDROM "); else seq_printf(m, " type: %d ", dev_type); - if (tcon->seal) + + seq_printf(m, "Serial Number: 0x%x", tcon->vol_serial_number); + + if ((tcon->seal) || + (tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) || + (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA)) seq_printf(m, " Encrypted"); if (tcon->nocase) seq_printf(m, " nocase"); @@ -371,6 +376,10 @@ skip_rdma: atomic_read(&server->in_send), atomic_read(&server->num_waiters)); #endif + if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) + seq_puts(m, " encrypted"); + if (ses->sign) + seq_puts(m, " signed"); seq_puts(m, "\n\tShares:"); j = 0; diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index d8bce2f862de..086ddc5108af 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -43,6 +43,9 @@ struct smb_snapshot_array { /* snapshots[]; */ } __packed; +/* query_info flags */ +#define PASSTHRU_QUERY_INFO 0x00000000 +#define PASSTHRU_FSCTL 0x00000001 struct smb_query_info { __u32 info_type; __u32 file_info_class; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f293e052e351..38feae812b47 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -479,6 +479,14 @@ struct smb_version_operations { struct cifs_tcon *tcon, __le16 *path, int is_dir, unsigned long p); + /* make unix special files (block, char, fifo, socket) */ + int (*make_node)(unsigned int xid, + struct inode *inode, + struct dentry *dentry, + struct cifs_tcon *tcon, + char *full_path, + umode_t mode, + dev_t device_number); }; struct smb_version_values { @@ -735,13 +743,13 @@ in_flight(struct TCP_Server_Info *server) } static inline bool -has_credits(struct TCP_Server_Info *server, int *credits) +has_credits(struct TCP_Server_Info *server, int *credits, int num_credits) { int num; spin_lock(&server->req_lock); num = *credits; spin_unlock(&server->req_lock); - return num > 0; + return num >= num_credits; } static inline void @@ -962,11 +970,14 @@ cap_unix(struct cifs_ses *ses) struct cached_fid { bool is_valid:1; /* Do we have a useable root fid */ + bool file_all_info_is_valid:1; + struct kref refcount; struct cifs_fid *fid; struct mutex fid_mutex; struct cifs_tcon *tcon; struct work_struct lease_break; + struct smb2_file_all_info file_all_info; }; /* @@ -1735,6 +1746,7 @@ require use of the stronger protocol */ * GlobalMid_Lock protects: * list operations on pending_mid_q and oplockQ * updates to XID counters, multiplex id and SMB sequence numbers + * list operations on global DnotifyReqList * tcp_ses_lock protects: * list operations on tcp and SMB session lists * tcon->open_file_lock protects the list of open files hanging off the tcon diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index b95db2b593cb..a8e9738db691 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1191,10 +1191,6 @@ next_pdu: continue; } - if (server->large_buf) - buf = server->bigbuf; - - server->lstrp = jiffies; for (i = 0; i < num_mids; i++) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 907e85d65bb4..f26a48dd2e39 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -621,20 +621,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, { int rc = -EPERM; unsigned int xid; - int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; - struct cifs_io_parms io_parms; char *full_path = NULL; - struct inode *newinode = NULL; - __u32 oplock = 0; - struct cifs_fid fid; - struct cifs_open_parms oparms; - FILE_ALL_INFO *buf = NULL; - unsigned int bytes_written; - struct win_dev *pdev; - struct kvec iov[2]; if (!old_valid_dev(device_number)) return -EINVAL; @@ -654,103 +644,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, goto mknod_out; } - if (tcon->unix_ext) { - struct cifs_unix_set_info_args args = { - .mode = mode & ~current_umask(), - .ctime = NO_CHANGE_64, - .atime = NO_CHANGE_64, - .mtime = NO_CHANGE_64, - .device = device_number, - }; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - args.uid = current_fsuid(); - args.gid = current_fsgid(); - } else { - args.uid = INVALID_UID; /* no change */ - args.gid = INVALID_GID; /* no change */ - } - rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); - if (rc) - goto mknod_out; - - rc = cifs_get_inode_info_unix(&newinode, full_path, - inode->i_sb, xid); - - if (rc == 0) - d_instantiate(direntry, newinode); - goto mknod_out; - } - - if (!S_ISCHR(mode) && !S_ISBLK(mode)) - goto mknod_out; - - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) - goto mknod_out; - - - cifs_dbg(FYI, "sfu compat create special file\n"); - - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - rc = -ENOMEM; - goto mknod_out; - } - - if (backup_cred(cifs_sb)) - create_options |= CREATE_OPEN_BACKUP_INTENT; - - oparms.tcon = tcon; - oparms.cifs_sb = cifs_sb; - oparms.desired_access = GENERIC_WRITE; - oparms.create_options = create_options; - oparms.disposition = FILE_CREATE; - oparms.path = full_path; - oparms.fid = &fid; - oparms.reconnect = false; - - if (tcon->ses->server->oplocks) - oplock = REQ_OPLOCK; - else - oplock = 0; - rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); - if (rc) - goto mknod_out; - - /* - * BB Do not bother to decode buf since no local inode yet to put - * timestamps in, but we can reuse it safely. - */ - - pdev = (struct win_dev *)buf; - io_parms.pid = current->tgid; - io_parms.tcon = tcon; - io_parms.offset = 0; - io_parms.length = sizeof(struct win_dev); - iov[1].iov_base = buf; - iov[1].iov_len = sizeof(struct win_dev); - if (S_ISCHR(mode)) { - memcpy(pdev->type, "IntxCHR", 8); - pdev->major = cpu_to_le64(MAJOR(device_number)); - pdev->minor = cpu_to_le64(MINOR(device_number)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); - } else if (S_ISBLK(mode)) { - memcpy(pdev->type, "IntxBLK", 8); - pdev->major = cpu_to_le64(MAJOR(device_number)); - pdev->minor = cpu_to_le64(MINOR(device_number)); - rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, - &bytes_written, iov, 1); - } - tcon->ses->server->ops->close(xid, tcon, &fid); - d_drop(direntry); - - /* FIXME: add code here to set EAs */ + rc = tcon->ses->server->ops->make_node(xid, inode, direntry, tcon, + full_path, mode, + device_number); mknod_out: kfree(full_path); - kfree(buf); free_xid(xid); cifs_put_tlink(tlink); return rc; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 4c144c1f50eb..2a6d20c0ce02 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1645,8 +1645,20 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, rc = server->ops->mand_unlock_range(cfile, flock, xid); out: - if (flock->fl_flags & FL_POSIX && !rc) + if (flock->fl_flags & FL_POSIX) { + /* + * If this is a request to remove all locks because we + * are closing the file, it doesn't matter if the + * unlocking failed as both cifs.ko and the SMB server + * remove the lock on file close + */ + if (rc) { + cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc); + if (!(flock->fl_flags & FL_CLOSE)) + return rc; + } rc = locks_lock_file_wait(file, flock); + } return rc; } diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index f0ce27c3c6e4..c711f1f39bf2 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -1027,6 +1027,131 @@ cifs_can_echo(struct TCP_Server_Info *server) return false; } +static int +cifs_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + char *full_path, umode_t mode, dev_t dev) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct inode *newinode = NULL; + int rc = -EPERM; + int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; + FILE_ALL_INFO *buf = NULL; + struct cifs_io_parms io_parms; + __u32 oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + unsigned int bytes_written; + struct win_dev *pdev; + struct kvec iov[2]; + + if (tcon->unix_ext) { + /* + * SMB1 Unix Extensions: requires server support but + * works with all special files + */ + struct cifs_unix_set_info_args args = { + .mode = mode & ~current_umask(), + .ctime = NO_CHANGE_64, + .atime = NO_CHANGE_64, + .mtime = NO_CHANGE_64, + .device = dev, + }; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = current_fsuid(); + args.gid = current_fsgid(); + } else { + args.uid = INVALID_UID; /* no change */ + args.gid = INVALID_GID; /* no change */ + } + rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, + cifs_sb->local_nls, + cifs_remap(cifs_sb)); + if (rc) + goto out; + + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + + if (rc == 0) + d_instantiate(dentry, newinode); + goto out; + } + + /* + * SMB1 SFU emulation: should work with all servers, but only + * support block and char device (no socket & fifo) + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + goto out; + + if (!S_ISCHR(mode) && !S_ISBLK(mode)) + goto out; + + cifs_dbg(FYI, "sfu compat create special file\n"); + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto out; + } + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + if (tcon->ses->server->oplocks) + oplock = REQ_OPLOCK; + else + oplock = 0; + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); + if (rc) + goto out; + + /* + * BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely. + */ + + pdev = (struct win_dev *)buf; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = sizeof(struct win_dev); + iov[1].iov_base = buf; + iov[1].iov_len = sizeof(struct win_dev); + if (S_ISCHR(mode)) { + memcpy(pdev->type, "IntxCHR", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } else if (S_ISBLK(mode)) { + memcpy(pdev->type, "IntxBLK", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } + tcon->ses->server->ops->close(xid, tcon, &fid); + d_drop(dentry); + + /* FIXME: add code here to set EAs */ +out: + kfree(buf); + return rc; +} + + + struct smb_version_operations smb1_operations = { .send_cancel = send_nt_cancel, .compare_fids = cifs_compare_fids, @@ -1110,6 +1235,7 @@ struct smb_version_operations smb1_operations = { .get_acl_by_fid = get_cifs_acl_by_fid, .set_acl = set_cifs_acl, #endif /* CIFS_ACL */ + .make_node = cifs_make_node, }; struct smb_version_values smb1_values = { diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 01a76bccdb8d..278405d26c47 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -37,6 +37,16 @@ #include "smb2pdu.h" #include "smb2proto.h" +static void +free_set_inf_compound(struct smb_rqst *rqst) +{ + if (rqst[1].rq_iov) + SMB2_set_info_free(&rqst[1]); + if (rqst[2].rq_iov) + SMB2_close_free(&rqst[2]); +} + + static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, @@ -112,14 +122,18 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, PATH_MAX * 2, 0, NULL); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid, + full_path); break; case SMB2_OP_DELETE: + trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_MKDIR: /* * Directories are created through parameters in the * SMB2_open() call. */ + trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_RMDIR: memset(&si_iov, 0, sizeof(si_iov)); @@ -135,6 +149,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_SET_EOF: memset(&si_iov, 0, sizeof(si_iov)); @@ -150,6 +165,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_SET_INFO: memset(&si_iov, 0, sizeof(si_iov)); @@ -166,6 +182,8 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid, + full_path); break; case SMB2_OP_RENAME: memset(&si_iov, 0, sizeof(si_iov)); @@ -190,6 +208,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path); break; case SMB2_OP_HARDLINK: memset(&si_iov, 0, sizeof(si_iov)); @@ -214,6 +233,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_O_INFO_FILE, 0, data, size); smb2_set_next_command(tcon, &rqst[num_rqst]); smb2_set_related(&rqst[num_rqst++]); + trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path); break; default: cifs_dbg(VFS, "Invalid command\n"); @@ -252,21 +272,65 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, SMB2_query_info_free(&rqst[1]); if (rqst[2].rq_iov) SMB2_close_free(&rqst[2]); + if (rc) + trace_smb3_query_info_compound_err(xid, ses->Suid, + tcon->tid, rc); + else + trace_smb3_query_info_compound_done(xid, ses->Suid, + tcon->tid); break; case SMB2_OP_DELETE: + if (rc) + trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_delete_done(xid, ses->Suid, tcon->tid); + if (rqst[1].rq_iov) + SMB2_close_free(&rqst[1]); + break; case SMB2_OP_MKDIR: + if (rc) + trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid); if (rqst[1].rq_iov) SMB2_close_free(&rqst[1]); break; case SMB2_OP_HARDLINK: + if (rc) + trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_RENAME: + if (rc) + trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_rename_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_RMDIR: + if (rc) + trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_SET_EOF: + if (rc) + trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc); + else + trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid); + free_set_inf_compound(rqst); + break; case SMB2_OP_SET_INFO: - if (rqst[1].rq_iov) - SMB2_set_info_free(&rqst[1]); - if (rqst[2].rq_iov) - SMB2_close_free(&rqst[2]); + if (rc) + trace_smb3_set_info_compound_err(xid, ses->Suid, + tcon->tid, rc); + else + trace_smb3_set_info_compound_done(xid, ses->Suid, + tcon->tid); + free_set_inf_compound(rqst); break; } free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); @@ -309,12 +373,17 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, rc = open_shroot(xid, tcon, &fid); if (rc) goto out; - rc = SMB2_query_info(xid, tcon, fid.persistent_fid, - fid.volatile_fid, smb2_data); + + if (tcon->crfid.file_all_info_is_valid) { + move_smb2_info_to_cifs(data, + &tcon->crfid.file_all_info); + } else { + rc = SMB2_query_info(xid, tcon, fid.persistent_fid, + fid.volatile_fid, smb2_data); + if (!rc) + move_smb2_info_to_cifs(data, smb2_data); + } close_shroot(&tcon->crfid); - if (rc) - goto out; - move_smb2_info_to_cifs(data, smb2_data); goto out; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 085e91436da7..1022a3771e14 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -185,7 +185,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, spin_unlock(&server->req_lock); cifs_num_waiters_inc(server); rc = wait_event_killable(server->request_q, - has_credits(server, &server->credits)); + has_credits(server, &server->credits, 1)); cifs_num_waiters_dec(server); if (rc) return rc; @@ -619,6 +619,7 @@ smb2_close_cached_fid(struct kref *ref) SMB2_close(0, cfid->tcon, cfid->fid->persistent_fid, cfid->fid->volatile_fid); cfid->is_valid = false; + cfid->file_all_info_is_valid = false; } } @@ -643,9 +644,18 @@ smb2_cached_lease_break(struct work_struct *work) */ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) { - struct cifs_open_parms oparams; - int rc; - __le16 srch_path = 0; /* Null - since an open of top of share */ + struct cifs_ses *ses = tcon->ses; + struct TCP_Server_Info *server = ses->server; + struct cifs_open_parms oparms; + struct smb2_create_rsp *o_rsp = NULL; + struct smb2_query_info_rsp *qi_rsp = NULL; + int resp_buftype[2]; + struct smb_rqst rqst[2]; + struct kvec rsp_iov[2]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct kvec qi_iov[1]; + int rc, flags = 0; + __le16 utf16_path = 0; /* Null - since an open of top of share */ u8 oplock = SMB2_OPLOCK_LEVEL_II; mutex_lock(&tcon->crfid.fid_mutex); @@ -657,22 +667,89 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) return 0; } - oparams.tcon = tcon; - oparams.create_options = 0; - oparams.desired_access = FILE_READ_ATTRIBUTES; - oparams.disposition = FILE_OPEN; - oparams.fid = pfid; - oparams.reconnect = false; - - rc = SMB2_open(xid, &oparams, &srch_path, &oplock, NULL, NULL, NULL); - if (rc == 0) { - memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); - tcon->crfid.tcon = tcon; - tcon->crfid.is_valid = true; - kref_init(&tcon->crfid.refcount); - kref_get(&tcon->crfid.refcount); - } + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + /* Open */ + memset(&open_iov, 0, sizeof(open_iov)); + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; + + oparms.tcon = tcon; + oparms.create_options = 0; + oparms.desired_access = FILE_READ_ATTRIBUTES; + oparms.disposition = FILE_OPEN; + oparms.fid = pfid; + oparms.reconnect = false; + + rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, &utf16_path); + if (rc) + goto oshr_exit; + smb2_set_next_command(tcon, &rqst[0]); + + memset(&qi_iov, 0, sizeof(qi_iov)); + rqst[1].rq_iov = qi_iov; + rqst[1].rq_nvec = 1; + + rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, + COMPOUND_FID, FILE_ALL_INFORMATION, + SMB2_O_INFO_FILE, 0, + sizeof(struct smb2_file_all_info) + + PATH_MAX * 2, 0, NULL); + if (rc) + goto oshr_exit; + + smb2_set_related(&rqst[1]); + + rc = compound_send_recv(xid, ses, flags, 2, rqst, + resp_buftype, rsp_iov); + if (rc) + goto oshr_exit; + + o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; + oparms.fid->persistent_fid = o_rsp->PersistentFileId; + oparms.fid->volatile_fid = o_rsp->VolatileFileId; +#ifdef CONFIG_CIFS_DEBUG2 + oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId); +#endif /* CIFS_DEBUG2 */ + + if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) + oplock = smb2_parse_lease_state(server, o_rsp, + &oparms.fid->epoch, + oparms.fid->lease_key); + else + goto oshr_exit; + + + memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); + tcon->crfid.tcon = tcon; + tcon->crfid.is_valid = true; + kref_init(&tcon->crfid.refcount); + kref_get(&tcon->crfid.refcount); + + + qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; + if (le32_to_cpu(qi_rsp->OutputBufferLength) < sizeof(struct smb2_file_all_info)) + goto oshr_exit; + rc = smb2_validate_and_copy_iov( + le16_to_cpu(qi_rsp->OutputBufferOffset), + sizeof(struct smb2_file_all_info), + &rsp_iov[1], sizeof(struct smb2_file_all_info), + (char *)&tcon->crfid.file_all_info); + if (rc) + goto oshr_exit; + tcon->crfid.file_all_info_is_valid = 1; + + oshr_exit: mutex_unlock(&tcon->crfid.fid_mutex); + SMB2_open_free(&rqst[0]); + SMB2_query_info_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); return rc; } @@ -1253,7 +1330,8 @@ smb2_ioctl_query_info(const unsigned int xid, struct smb_query_info __user *pqi; int rc = 0; int flags = 0; - struct smb2_query_info_rsp *rsp = NULL; + struct smb2_query_info_rsp *qi_rsp = NULL; + struct smb2_ioctl_rsp *io_rsp = NULL; void *buffer = NULL; struct smb_rqst rqst[3]; int resp_buftype[3]; @@ -1263,6 +1341,7 @@ smb2_ioctl_query_info(const unsigned int xid, u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_fid fid; struct kvec qi_iov[1]; + struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; struct kvec close_iov[1]; memset(rqst, 0, sizeof(rqst)); @@ -1313,15 +1392,35 @@ smb2_ioctl_query_info(const unsigned int xid, smb2_set_next_command(tcon, &rqst[0]); /* Query */ - memset(&qi_iov, 0, sizeof(qi_iov)); - rqst[1].rq_iov = qi_iov; - rqst[1].rq_nvec = 1; - - rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, COMPOUND_FID, - qi.file_info_class, qi.info_type, - qi.additional_information, + if (qi.flags & PASSTHRU_FSCTL) { + /* Can eventually relax perm check since server enforces too */ + if (!capable(CAP_SYS_ADMIN)) + rc = -EPERM; + else { + memset(&io_iov, 0, sizeof(io_iov)); + rqst[1].rq_iov = io_iov; + rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE; + + rc = SMB2_ioctl_init(tcon, &rqst[1], + COMPOUND_FID, COMPOUND_FID, + qi.info_type, true, NULL, + 0); + } + } else if (qi.flags == PASSTHRU_QUERY_INFO) { + memset(&qi_iov, 0, sizeof(qi_iov)); + rqst[1].rq_iov = qi_iov; + rqst[1].rq_nvec = 1; + + rc = SMB2_query_info_init(tcon, &rqst[1], COMPOUND_FID, + COMPOUND_FID, qi.file_info_class, + qi.info_type, qi.additional_information, qi.input_buffer_length, qi.output_buffer_length, buffer); + } else { /* unknown flags */ + cifs_dbg(VFS, "invalid passthru query flags: 0x%x\n", qi.flags); + rc = -EINVAL; + } + if (rc) goto iqinf_exit; smb2_set_next_command(tcon, &rqst[1]); @@ -1341,24 +1440,44 @@ smb2_ioctl_query_info(const unsigned int xid, resp_buftype, rsp_iov); if (rc) goto iqinf_exit; - pqi = (struct smb_query_info __user *)arg; - rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; - if (le32_to_cpu(rsp->OutputBufferLength) < qi.input_buffer_length) - qi.input_buffer_length = le32_to_cpu(rsp->OutputBufferLength); - if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, - sizeof(qi.input_buffer_length))) { - rc = -EFAULT; - goto iqinf_exit; - } - if (copy_to_user(pqi + 1, rsp->Buffer, qi.input_buffer_length)) { - rc = -EFAULT; - goto iqinf_exit; + if (qi.flags & PASSTHRU_FSCTL) { + pqi = (struct smb_query_info __user *)arg; + io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base; + if (le32_to_cpu(io_rsp->OutputCount) < qi.input_buffer_length) + qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount); + if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto iqinf_exit; + } + if (copy_to_user(pqi + 1, &io_rsp[1], qi.input_buffer_length)) { + rc = -EFAULT; + goto iqinf_exit; + } + } else { + pqi = (struct smb_query_info __user *)arg; + qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base; + if (le32_to_cpu(qi_rsp->OutputBufferLength) < qi.input_buffer_length) + qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength); + if (copy_to_user(&pqi->input_buffer_length, &qi.input_buffer_length, + sizeof(qi.input_buffer_length))) { + rc = -EFAULT; + goto iqinf_exit; + } + if (copy_to_user(pqi + 1, qi_rsp->Buffer, qi.input_buffer_length)) { + rc = -EFAULT; + goto iqinf_exit; + } } iqinf_exit: kfree(buffer); SMB2_open_free(&rqst[0]); - SMB2_query_info_free(&rqst[1]); + if (qi.flags & PASSTHRU_FSCTL) + SMB2_ioctl_free(&rqst[1]); + else + SMB2_query_info_free(&rqst[1]); + SMB2_close_free(&rqst[2]); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); @@ -2472,22 +2591,38 @@ get_smb2_acl(struct cifs_sb_info *cifs_sb, static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, loff_t offset, loff_t len, bool keep_size) { + struct cifs_ses *ses = tcon->ses; struct inode *inode; struct cifsInodeInfo *cifsi; struct cifsFileInfo *cfile = file->private_data; struct file_zero_data_information fsctl_buf; + struct smb_rqst rqst[2]; + int resp_buftype[2]; + struct kvec rsp_iov[2]; + struct kvec io_iov[SMB2_IOCTL_IOV_SIZE]; + struct kvec si_iov[1]; + unsigned int size[1]; + void *data[1]; long rc; unsigned int xid; + int num = 0, flags = 0; + __le64 eof; xid = get_xid(); inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); + trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len); + + /* if file not oplocked can't be sure whether asking to extend size */ if (!CIFS_CACHE_READ(cifsi)) if (keep_size == false) { rc = -EOPNOTSUPP; + trace_smb3_zero_err(xid, cfile->fid.persistent_fid, + tcon->tid, ses->Suid, offset, len, rc); free_xid(xid); return rc; } @@ -2498,33 +2633,73 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, */ if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)) { rc = -EOPNOTSUPP; + trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len, rc); free_xid(xid); return rc; } - /* - * need to make sure we are not asked to extend the file since the SMB3 - * fsctl does not change the file size. In the future we could change - * this to zero the first part of the range then set the file size - * which for a non sparse file would zero the newly extended range - */ - if (keep_size == false) - if (i_size_read(inode) < offset + len) { - rc = -EOPNOTSUPP; - free_xid(xid); - return rc; - } - cifs_dbg(FYI, "offset %lld len %lld", offset, len); fsctl_buf.FileOffset = cpu_to_le64(offset); fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len); - rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, - true /* is_fctl */, (char *)&fsctl_buf, - sizeof(struct file_zero_data_information), NULL, NULL); + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + + memset(&io_iov, 0, sizeof(io_iov)); + rqst[num].rq_iov = io_iov; + rqst[num].rq_nvec = SMB2_IOCTL_IOV_SIZE; + rc = SMB2_ioctl_init(tcon, &rqst[num++], cfile->fid.persistent_fid, + cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, + true /* is_fctl */, (char *)&fsctl_buf, + sizeof(struct file_zero_data_information)); + if (rc) + goto zero_range_exit; + + /* + * do we also need to change the size of the file? + */ + if (keep_size == false && i_size_read(inode) < offset + len) { + smb2_set_next_command(tcon, &rqst[0]); + + memset(&si_iov, 0, sizeof(si_iov)); + rqst[num].rq_iov = si_iov; + rqst[num].rq_nvec = 1; + + eof = cpu_to_le64(offset + len); + size[0] = 8; /* sizeof __le64 */ + data[0] = &eof; + + rc = SMB2_set_info_init(tcon, &rqst[num++], + cfile->fid.persistent_fid, + cfile->fid.persistent_fid, + current->tgid, + FILE_END_OF_FILE_INFORMATION, + SMB2_O_INFO_FILE, 0, data, size); + smb2_set_related(&rqst[1]); + } + + rc = compound_send_recv(xid, ses, flags, num, rqst, + resp_buftype, rsp_iov); + + zero_range_exit: + SMB2_ioctl_free(&rqst[0]); + SMB2_set_info_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_xid(xid); + if (rc) + trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len, rc); + else + trace_smb3_zero_done(xid, cfile->fid.persistent_fid, tcon->tid, + ses->Suid, offset, len); return rc; } @@ -2573,15 +2748,20 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, struct cifsFileInfo *cfile = file->private_data; long rc = -EOPNOTSUPP; unsigned int xid; + __le64 eof; xid = get_xid(); inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); + trace_smb3_falloc_enter(xid, cfile->fid.persistent_fid, tcon->tid, + tcon->ses->Suid, off, len); /* if file not oplocked can't be sure whether asking to extend size */ if (!CIFS_CACHE_READ(cifsi)) if (keep_size == false) { + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len, rc); free_xid(xid); return rc; } @@ -2601,6 +2781,12 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, /* BB: in future add else clause to extend file */ else rc = -EOPNOTSUPP; + if (rc) + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len, rc); + else + trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len); free_xid(xid); return rc; } @@ -2616,14 +2802,31 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, */ if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) { rc = -EOPNOTSUPP; + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, + tcon->tid, tcon->ses->Suid, off, len, rc); free_xid(xid); return rc; } - rc = smb2_set_sparse(xid, tcon, cfile, inode, false); + smb2_set_sparse(xid, tcon, cfile, inode, false); + rc = 0; + } else { + smb2_set_sparse(xid, tcon, cfile, inode, false); + rc = 0; + if (i_size_read(inode) < off + len) { + eof = cpu_to_le64(off + len); + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, + &eof); + } } - /* BB: else ... in future add code to extend file and set sparse */ + if (rc) + trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, tcon->tid, + tcon->ses->Suid, off, len, rc); + else + trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, tcon->tid, + tcon->ses->Suid, off, len); free_xid(xid); return rc; @@ -3604,6 +3807,104 @@ smb2_next_header(char *buf) return le32_to_cpu(hdr->NextCommand); } +static int +smb2_make_node(unsigned int xid, struct inode *inode, + struct dentry *dentry, struct cifs_tcon *tcon, + char *full_path, umode_t mode, dev_t dev) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + int rc = -EPERM; + int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL; + FILE_ALL_INFO *buf = NULL; + struct cifs_io_parms io_parms; + __u32 oplock = 0; + struct cifs_fid fid; + struct cifs_open_parms oparms; + unsigned int bytes_written; + struct win_dev *pdev; + struct kvec iov[2]; + + /* + * Check if mounted with mount parm 'sfu' mount parm. + * SFU emulation should work with all servers, but only + * supports block and char device (no socket & fifo), + * and was used by default in earlier versions of Windows + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + goto out; + + /* + * TODO: Add ability to create instead via reparse point. Windows (e.g. + * their current NFS server) uses this approach to expose special files + * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions + */ + + if (!S_ISCHR(mode) && !S_ISBLK(mode)) + goto out; + + cifs_dbg(FYI, "sfu compat create special file\n"); + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + rc = -ENOMEM; + goto out; + } + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.path = full_path; + oparms.fid = &fid; + oparms.reconnect = false; + + if (tcon->ses->server->oplocks) + oplock = REQ_OPLOCK; + else + oplock = 0; + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); + if (rc) + goto out; + + /* + * BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely. + */ + + pdev = (struct win_dev *)buf; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = sizeof(struct win_dev); + iov[1].iov_base = buf; + iov[1].iov_len = sizeof(struct win_dev); + if (S_ISCHR(mode)) { + memcpy(pdev->type, "IntxCHR", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } else if (S_ISBLK(mode)) { + memcpy(pdev->type, "IntxBLK", 8); + pdev->major = cpu_to_le64(MAJOR(dev)); + pdev->minor = cpu_to_le64(MINOR(dev)); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); + } + tcon->ses->server->ops->close(xid, tcon, &fid); + d_drop(dentry); + + /* FIXME: add code here to set EAs */ +out: + kfree(buf); + return rc; +} + + struct smb_version_operations smb20_operations = { .compare_fids = smb2_compare_fids, .setup_request = smb2_setup_request, @@ -3698,6 +3999,7 @@ struct smb_version_operations smb20_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_operations smb21_operations = { @@ -3796,6 +4098,7 @@ struct smb_version_operations smb21_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_operations smb30_operations = { @@ -3903,6 +4206,7 @@ struct smb_version_operations smb30_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_operations smb311_operations = { @@ -4011,6 +4315,7 @@ struct smb_version_operations smb311_operations = { #endif /* CIFS_ACL */ .next_header = smb2_next_header, .ioctl_query_info = smb2_ioctl_query_info, + .make_node = smb2_make_node, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 60fbe306f604..c399e09b76e6 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1797,9 +1797,10 @@ create_reconnect_durable_buf(struct cifs_fid *fid) return buf; } -static __u8 -parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp, - unsigned int *epoch, char *lease_key) +__u8 +smb2_parse_lease_state(struct TCP_Server_Info *server, + struct smb2_create_rsp *rsp, + unsigned int *epoch, char *lease_key) { char *data_offset; struct create_context *cc; @@ -2456,8 +2457,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) - *oplock = parse_lease_state(server, rsp, &oparms->fid->epoch, - oparms->fid->lease_key); + *oplock = smb2_parse_lease_state(server, rsp, + &oparms->fid->epoch, + oparms->fid->lease_key); else *oplock = rsp->OplockLevel; creat_exit: @@ -2466,65 +2468,46 @@ creat_exit: return rc; } -/* - * SMB2 IOCTL is used for both IOCTLs and FSCTLs - */ int -SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, - u64 volatile_fid, u32 opcode, bool is_fsctl, - char *in_data, u32 indatalen, - char **out_data, u32 *plen /* returned data len */) +SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, u32 opcode, + bool is_fsctl, char *in_data, u32 indatalen) { - struct smb_rqst rqst; struct smb2_ioctl_req *req; - struct smb2_ioctl_rsp *rsp; - struct cifs_ses *ses; - struct kvec iov[2]; - struct kvec rsp_iov; - int resp_buftype; - int n_iov; - int rc = 0; - int flags = 0; + struct kvec *iov = rqst->rq_iov; unsigned int total_len; - - cifs_dbg(FYI, "SMB2 IOCTL\n"); - - if (out_data != NULL) - *out_data = NULL; - - /* zero out returned data len, in case of error */ - if (plen) - *plen = 0; - - if (tcon) - ses = tcon->ses; - else - return -EIO; - - if (!ses || !(ses->server)) - return -EIO; + int rc; rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len); if (rc) return rc; - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; - req->CtlCode = cpu_to_le32(opcode); req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; + iov[0].iov_base = (char *)req; + /* + * If no input data, the size of ioctl struct in + * protocol spec still includes a 1 byte data buffer, + * but if input data passed to ioctl, we do not + * want to double count this, so we do not send + * the dummy one byte of data in iovec[0] if sending + * input data (in iovec[1]). + */ if (indatalen) { req->InputCount = cpu_to_le32(indatalen); /* do not set InputOffset if no input data */ req->InputOffset = cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer)); + rqst->rq_nvec = 2; + iov[0].iov_len = total_len - 1; iov[1].iov_base = in_data; iov[1].iov_len = indatalen; - n_iov = 2; - } else - n_iov = 1; + } else { + rqst->rq_nvec = 1; + iov[0].iov_len = total_len; + } req->OutputOffset = 0; req->OutputCount = 0; /* MBZ */ @@ -2546,33 +2529,70 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, else req->Flags = 0; - iov[0].iov_base = (char *)req; - - /* - * If no input data, the size of ioctl struct in - * protocol spec still includes a 1 byte data buffer, - * but if input data passed to ioctl, we do not - * want to double count this, so we do not send - * the dummy one byte of data in iovec[0] if sending - * input data (in iovec[1]). - */ - - if (indatalen) { - iov[0].iov_len = total_len - 1; - } else - iov[0].iov_len = total_len; - /* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + return 0; +} + +void +SMB2_ioctl_free(struct smb_rqst *rqst) +{ + if (rqst && rqst->rq_iov) + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ +} + +/* + * SMB2 IOCTL is used for both IOCTLs and FSCTLs + */ +int +SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, + u64 volatile_fid, u32 opcode, bool is_fsctl, + char *in_data, u32 indatalen, + char **out_data, u32 *plen /* returned data len */) +{ + struct smb_rqst rqst; + struct smb2_ioctl_rsp *rsp = NULL; + struct cifs_ses *ses; + struct kvec iov[SMB2_IOCTL_IOV_SIZE]; + struct kvec rsp_iov = {NULL, 0}; + int resp_buftype = CIFS_NO_BUFFER; + int rc = 0; + int flags = 0; + + cifs_dbg(FYI, "SMB2 IOCTL\n"); + + if (out_data != NULL) + *out_data = NULL; + + /* zero out returned data len, in case of error */ + if (plen) + *plen = 0; + + if (tcon) + ses = tcon->ses; + else + return -EIO; + + if (!ses || !(ses->server)) + return -EIO; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + memset(&rqst, 0, sizeof(struct smb_rqst)); + memset(&iov, 0, sizeof(iov)); rqst.rq_iov = iov; - rqst.rq_nvec = n_iov; + rqst.rq_nvec = SMB2_IOCTL_IOV_SIZE; + + rc = SMB2_ioctl_init(tcon, &rqst, persistent_fid, volatile_fid, + opcode, is_fsctl, in_data, indatalen); + if (rc) + goto ioctl_exit; rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base; if (rc != 0) @@ -2622,6 +2642,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, } ioctl_exit: + SMB2_ioctl_free(&rqst); free_rsp_buf(resp_buftype, rsp); return rc; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 0bd4d4802701..ee8977688e21 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -959,6 +959,13 @@ struct duplicate_extents_to_file { __le64 ByteCount; /* Bytes to be copied */ } __packed; +/* + * Maximum number of iovs we need for an ioctl request. + * [0] : struct smb2_ioctl_req + * [1] : in_data + */ +#define SMB2_IOCTL_IOV_SIZE 2 + struct smb2_ioctl_req { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 57 */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 87733b27a65f..3c32d0cfea69 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -144,6 +144,10 @@ extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 opcode, bool is_fsctl, char *in_data, u32 indatalen, char **out_data, u32 *plen /* returned data len */); +extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, u32 opcode, + bool is_fsctl, char *in_data, u32 indatalen); +extern void SMB2_ioctl_free(struct smb_rqst *rqst); extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, @@ -223,6 +227,9 @@ extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *); extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *, enum securityEnum); +extern __u8 smb2_parse_lease_state(struct TCP_Server_Info *server, + struct smb2_create_rsp *rsp, + unsigned int *epoch, char *lease_key); extern int smb3_encryption_required(const struct cifs_tcon *tcon); extern int smb2_validate_iov(unsigned int offset, unsigned int buffer_length, struct kvec *iov, unsigned int min_buf_size); diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h index 3d5f62150de4..447c0c6e4c64 100644 --- a/fs/cifs/smb2status.h +++ b/fs/cifs/smb2status.h @@ -30,9 +30,9 @@ */ #define STATUS_SEVERITY_SUCCESS __constant_cpu_to_le32(0x0000) -#define STATUS_SEVERITY_INFORMATIONAL __constanst_cpu_to_le32(0x0001) -#define STATUS_SEVERITY_WARNING __constanst_cpu_to_le32(0x0002) -#define STATUS_SEVERITY_ERROR __constanst_cpu_to_le32(0x0003) +#define STATUS_SEVERITY_INFORMATIONAL cpu_to_le32(0x0001) +#define STATUS_SEVERITY_WARNING cpu_to_le32(0x0002) +#define STATUS_SEVERITY_ERROR cpu_to_le32(0x0003) struct ntstatus { /* Facility is the high 12 bits of the following field */ diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index d8b049afa606..fa226de48ef3 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -59,6 +59,8 @@ DEFINE_EVENT(smb3_rw_err_class, smb3_##name, \ DEFINE_SMB3_RW_ERR_EVENT(write_err); DEFINE_SMB3_RW_ERR_EVENT(read_err); DEFINE_SMB3_RW_ERR_EVENT(query_dir_err); +DEFINE_SMB3_RW_ERR_EVENT(zero_err); +DEFINE_SMB3_RW_ERR_EVENT(falloc_err); /* For logging successful read or write */ @@ -104,9 +106,13 @@ DEFINE_EVENT(smb3_rw_done_class, smb3_##name, \ DEFINE_SMB3_RW_DONE_EVENT(write_enter); DEFINE_SMB3_RW_DONE_EVENT(read_enter); DEFINE_SMB3_RW_DONE_EVENT(query_dir_enter); +DEFINE_SMB3_RW_DONE_EVENT(zero_enter); +DEFINE_SMB3_RW_DONE_EVENT(falloc_enter); DEFINE_SMB3_RW_DONE_EVENT(write_done); DEFINE_SMB3_RW_DONE_EVENT(read_done); DEFINE_SMB3_RW_DONE_EVENT(query_dir_done); +DEFINE_SMB3_RW_DONE_EVENT(zero_done); +DEFINE_SMB3_RW_DONE_EVENT(falloc_done); /* * For handle based calls other than read and write, and get/set info @@ -242,6 +248,123 @@ DEFINE_SMB3_INF_ERR_EVENT(query_info_err); DEFINE_SMB3_INF_ERR_EVENT(set_info_err); DEFINE_SMB3_INF_ERR_EVENT(fsctl_err); +DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid, + const char *full_path), + TP_ARGS(xid, tid, sesid, full_path), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + __string(path, full_path) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + __assign_str(path, full_path); + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x path=%s", + __entry->xid, __entry->sesid, __entry->tid, + __get_str(path)) +) + +#define DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(name) \ +DEFINE_EVENT(smb3_inf_compound_enter_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid, \ + const char *full_path), \ + TP_ARGS(xid, tid, sesid, full_path)) + +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(query_info_compound_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(hardlink_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter); +DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter); + + +DECLARE_EVENT_CLASS(smb3_inf_compound_done_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid), + TP_ARGS(xid, tid, sesid), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x", + __entry->xid, __entry->sesid, __entry->tid) +) + +#define DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(name) \ +DEFINE_EVENT(smb3_inf_compound_done_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid), \ + TP_ARGS(xid, tid, sesid)) + +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_info_compound_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(hardlink_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done); +DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done); + + +DECLARE_EVENT_CLASS(smb3_inf_compound_err_class, + TP_PROTO(unsigned int xid, + __u32 tid, + __u64 sesid, + int rc), + TP_ARGS(xid, tid, sesid, rc), + TP_STRUCT__entry( + __field(unsigned int, xid) + __field(__u32, tid) + __field(__u64, sesid) + __field(int, rc) + ), + TP_fast_assign( + __entry->xid = xid; + __entry->tid = tid; + __entry->sesid = sesid; + __entry->rc = rc; + ), + TP_printk("xid=%u sid=0x%llx tid=0x%x rc=%d", + __entry->xid, __entry->sesid, __entry->tid, + __entry->rc) +) + +#define DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_inf_compound_err_class, smb3_##name, \ + TP_PROTO(unsigned int xid, \ + __u32 tid, \ + __u64 sesid, \ + int rc), \ + TP_ARGS(xid, tid, sesid, rc)) + +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_info_compound_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(hardlink_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err); +DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err); + /* * For logging SMB3 Status code and Command for responses which return errors */ @@ -713,6 +836,7 @@ DEFINE_EVENT(smb3_credit_class, smb3_##name, \ TP_ARGS(currmid, hostname, credits)) DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits); +DEFINE_SMB3_CREDIT_EVENT(credit_timeout); #endif /* _CIFS_TRACE_H */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 7ce8a585abd6..1de8e996e566 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -486,15 +486,31 @@ smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer, } static int -wait_for_free_credits(struct TCP_Server_Info *server, const int timeout, - int *credits, unsigned int *instance) +wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, + const int timeout, const int flags, + unsigned int *instance) { int rc; + int *credits; + int optype; + long int t; + + if (timeout < 0) + t = MAX_JIFFY_OFFSET; + else + t = msecs_to_jiffies(timeout); + + optype = flags & CIFS_OP_MASK; *instance = 0; + credits = server->ops->get_credits_field(server, optype); + /* Since an echo is already inflight, no need to wait to send another */ + if (*credits <= 0 && optype == CIFS_ECHO_OP) + return -EAGAIN; + spin_lock(&server->req_lock); - if (timeout == CIFS_ASYNC_OP) { + if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) { /* oplock breaks must not be held up */ server->in_flight++; *credits -= 1; @@ -504,14 +520,21 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout, } while (1) { - if (*credits <= 0) { + if (*credits < num_credits) { spin_unlock(&server->req_lock); cifs_num_waiters_inc(server); - rc = wait_event_killable(server->request_q, - has_credits(server, credits)); + rc = wait_event_killable_timeout(server->request_q, + has_credits(server, credits, num_credits), t); cifs_num_waiters_dec(server); - if (rc) - return rc; + if (!rc) { + trace_smb3_credit_timeout(server->CurrentMid, + server->hostname, num_credits); + cifs_dbg(VFS, "wait timed out after %d ms\n", + timeout); + return -ENOTSUPP; + } + if (rc == -ERESTARTSYS) + return -ERESTARTSYS; spin_lock(&server->req_lock); } else { if (server->tcpStatus == CifsExiting) { @@ -520,14 +543,52 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout, } /* + * For normal commands, reserve the last MAX_COMPOUND + * credits to compound requests. + * Otherwise these compounds could be permanently + * starved for credits by single-credit requests. + * + * To prevent spinning CPU, block this thread until + * there are >MAX_COMPOUND credits available. + * But only do this is we already have a lot of + * credits in flight to avoid triggering this check + * for servers that are slow to hand out credits on + * new sessions. + */ + if (!optype && num_credits == 1 && + server->in_flight > 2 * MAX_COMPOUND && + *credits <= MAX_COMPOUND) { + spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); + rc = wait_event_killable_timeout( + server->request_q, + has_credits(server, credits, + MAX_COMPOUND + 1), + t); + cifs_num_waiters_dec(server); + if (!rc) { + trace_smb3_credit_timeout( + server->CurrentMid, + server->hostname, num_credits); + cifs_dbg(VFS, "wait timed out after %d ms\n", + timeout); + return -ENOTSUPP; + } + if (rc == -ERESTARTSYS) + return -ERESTARTSYS; + spin_lock(&server->req_lock); + continue; + } + + /* * Can not count locking commands against total * as they are allowed to block on server. */ /* update # of requests on the wire to server */ - if (timeout != CIFS_BLOCKING_OP) { - *credits -= 1; - server->in_flight++; + if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) { + *credits -= num_credits; + server->in_flight += num_credits; *instance = server->reconnect_instance; } spin_unlock(&server->req_lock); @@ -538,16 +599,36 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int timeout, } static int -wait_for_free_request(struct TCP_Server_Info *server, const int timeout, - const int optype, unsigned int *instance) +wait_for_free_request(struct TCP_Server_Info *server, const int flags, + unsigned int *instance) { - int *val; + return wait_for_free_credits(server, 1, -1, flags, + instance); +} - val = server->ops->get_credits_field(server, optype); - /* Since an echo is already inflight, no need to wait to send another */ - if (*val <= 0 && optype == CIFS_ECHO_OP) - return -EAGAIN; - return wait_for_free_credits(server, timeout, val, instance); +static int +wait_for_compound_request(struct TCP_Server_Info *server, int num, + const int flags, unsigned int *instance) +{ + int *credits; + + credits = server->ops->get_credits_field(server, flags & CIFS_OP_MASK); + + spin_lock(&server->req_lock); + if (*credits < num) { + /* + * Return immediately if not too many requests in flight since + * we will likely be stuck on waiting for credits. + */ + if (server->in_flight < num - *credits) { + spin_unlock(&server->req_lock); + return -ENOTSUPP; + } + } + spin_unlock(&server->req_lock); + + return wait_for_free_credits(server, num, 60000, flags, + instance); } int @@ -646,16 +727,16 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, mid_handle_t *handle, void *cbdata, const int flags, const struct cifs_credits *exist_credits) { - int rc, timeout, optype; + int rc; struct mid_q_entry *mid; struct cifs_credits credits = { .value = 0, .instance = 0 }; unsigned int instance; + int optype; - timeout = flags & CIFS_TIMEOUT_MASK; optype = flags & CIFS_OP_MASK; if ((flags & CIFS_HAS_CREDITS) == 0) { - rc = wait_for_free_request(server, timeout, optype, &instance); + rc = wait_for_free_request(server, flags, &instance); if (rc) return rc; credits.value = 1; @@ -871,18 +952,15 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, const int flags, const int num_rqst, struct smb_rqst *rqst, int *resp_buf_type, struct kvec *resp_iov) { - int i, j, rc = 0; - int timeout, optype; + int i, j, optype, rc = 0; struct mid_q_entry *midQ[MAX_COMPOUND]; bool cancelled_mid[MAX_COMPOUND] = {false}; struct cifs_credits credits[MAX_COMPOUND] = { { .value = 0, .instance = 0 } }; unsigned int instance; - unsigned int first_instance = 0; char *buf; - timeout = flags & CIFS_TIMEOUT_MASK; optype = flags & CIFS_OP_MASK; for (i = 0; i < num_rqst; i++) @@ -896,81 +974,24 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, if (ses->server->tcpStatus == CifsExiting) return -ENOENT; - spin_lock(&ses->server->req_lock); - if (ses->server->credits < num_rqst) { - /* - * Return immediately if not too many requests in flight since - * we will likely be stuck on waiting for credits. - */ - if (ses->server->in_flight < num_rqst - ses->server->credits) { - spin_unlock(&ses->server->req_lock); - return -ENOTSUPP; - } - } else { - /* enough credits to send the whole compounded request */ - ses->server->credits -= num_rqst; - ses->server->in_flight += num_rqst; - first_instance = ses->server->reconnect_instance; - } - spin_unlock(&ses->server->req_lock); - - if (first_instance) { - cifs_dbg(FYI, "Acquired %d credits at once\n", num_rqst); - for (i = 0; i < num_rqst; i++) { - credits[i].value = 1; - credits[i].instance = first_instance; - } - goto setup_rqsts; - } - /* - * There are not enough credits to send the whole compound request but - * there are requests in flight that may bring credits from the server. + * Wait for all the requests to become available. * This approach still leaves the possibility to be stuck waiting for * credits if the server doesn't grant credits to the outstanding - * requests. This should be fixed by returning immediately and letting - * a caller fallback to sequential commands instead of compounding. - * Ensure we obtain 1 credit per request in the compound chain. + * requests and if the client is completely idle, not generating any + * other requests. + * This can be handled by the eventual session reconnect. */ - for (i = 0; i < num_rqst; i++) { - rc = wait_for_free_request(ses->server, timeout, optype, - &instance); - - if (rc == 0) { - credits[i].value = 1; - credits[i].instance = instance; - /* - * All parts of the compound chain must get credits from - * the same session, otherwise we may end up using more - * credits than the server granted. If there were - * reconnects in between, return -EAGAIN and let callers - * handle it. - */ - if (i == 0) - first_instance = instance; - else if (first_instance != instance) { - i++; - rc = -EAGAIN; - } - } + rc = wait_for_compound_request(ses->server, num_rqst, flags, + &instance); + if (rc) + return rc; - if (rc) { - /* - * We haven't sent an SMB packet to the server yet but - * we already obtained credits for i requests in the - * compound chain - need to return those credits back - * for future use. Note that we need to call add_credits - * multiple times to match the way we obtained credits - * in the first place and to account for in flight - * requests correctly. - */ - for (j = 0; j < i; j++) - add_credits(ses->server, &credits[j], optype); - return rc; - } + for (i = 0; i < num_rqst; i++) { + credits[i].value = 1; + credits[i].instance = instance; } -setup_rqsts: /* * Make sure that we sign in the same order that we send on this socket * and avoid races inside tcp sendmsg code that could cause corruption @@ -981,14 +1002,12 @@ setup_rqsts: /* * All the parts of the compound chain belong obtained credits from the - * same session (see the appropriate checks above). In the same time - * there might be reconnects after those checks but before we acquired - * the srv_mutex. We can not use credits obtained from the previous + * same session. We can not use credits obtained from the previous * session to send this request. Check if there were reconnects after * we obtained credits and return -EAGAIN in such cases to let callers * handle it. */ - if (first_instance != ses->server->reconnect_instance) { + if (instance != ses->server->reconnect_instance) { mutex_unlock(&ses->server->srv_mutex); for (j = 0; j < num_rqst; j++) add_credits(ses->server, &credits[j], optype); @@ -1057,7 +1076,7 @@ setup_rqsts: smb311_update_preauth_hash(ses, rqst[0].rq_iov, rqst[0].rq_nvec); - if (timeout == CIFS_ASYNC_OP) + if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) goto out; for (i = 0; i < num_rqst; i++) { @@ -1194,7 +1213,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses, int SendReceive(const unsigned int xid, struct cifs_ses *ses, struct smb_hdr *in_buf, struct smb_hdr *out_buf, - int *pbytes_returned, const int timeout) + int *pbytes_returned, const int flags) { int rc = 0; struct mid_q_entry *midQ; @@ -1225,7 +1244,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, return -EIO; } - rc = wait_for_free_request(ses->server, timeout, 0, &credits.instance); + rc = wait_for_free_request(ses->server, flags, &credits.instance); if (rc) return rc; @@ -1264,7 +1283,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (rc < 0) goto out; - if (timeout == CIFS_ASYNC_OP) + if ((flags & CIFS_TIMEOUT_MASK) == CIFS_ASYNC_OP) goto out; rc = wait_for_response(ses->server, midQ); @@ -1367,8 +1386,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, return -EIO; } - rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, 0, - &instance); + rc = wait_for_free_request(ses->server, CIFS_BLOCKING_OP, &instance); if (rc) return rc; @@ -788,7 +788,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, address = pgoff_address(index, vma); /* - * Note because we provide start/end to follow_pte_pmd it will + * Note because we provide range to follow_pte_pmd it will * call mmu_notifier_invalidate_range_start() on our behalf * before taking any lock. */ @@ -843,9 +843,8 @@ unlock_pte: static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, struct address_space *mapping, void *entry) { - unsigned long pfn; + unsigned long pfn, index, count; long ret = 0; - size_t size; /* * A page got tagged dirty in DAX mapping? Something is seriously @@ -894,17 +893,18 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, xas_unlock_irq(xas); /* - * Even if dax_writeback_mapping_range() was given a wbc->range_start - * in the middle of a PMD, the 'index' we are given will be aligned to - * the start index of the PMD, as will the pfn we pull from 'entry'. + * If dax_writeback_mapping_range() was given a wbc->range_start + * in the middle of a PMD, the 'index' we use needs to be + * aligned to the start of the PMD. * This allows us to flush for PMD_SIZE and not have to worry about * partial PMD writebacks. */ pfn = dax_to_pfn(entry); - size = PAGE_SIZE << dax_entry_order(entry); + count = 1UL << dax_entry_order(entry); + index = xas->xa_index & ~(count - 1); - dax_entry_mkclean(mapping, xas->xa_index, pfn); - dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); + dax_entry_mkclean(mapping, index, pfn); + dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as @@ -917,8 +917,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); dax_wake_entry(xas, entry, false); - trace_dax_writeback_one(mapping->host, xas->xa_index, - size >> PAGE_SHIFT); + trace_dax_writeback_one(mapping->host, index, count); return ret; put_unlocked: @@ -1220,9 +1219,7 @@ static vm_fault_t dax_fault_return(int error) { if (error == 0) return VM_FAULT_NOPAGE; - if (error == -ENOMEM) - return VM_FAULT_OOM; - return VM_FAULT_SIGBUS; + return vmf_error(error); } /* diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c53814539070..553a3f3300ae 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -455,6 +455,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_blocksize_bits = 10; s->s_magic = DEVPTS_SUPER_MAGIC; s->s_op = &devpts_sops; + s->s_d_op = &simple_dentry_operations; s->s_time_gran = 1; error = -ENOMEM; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 031e5a82d556..06f77ca7f36e 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -97,7 +97,7 @@ config EXT4_FS_SECURITY extended attributes for file security labels, say N. config EXT4_DEBUG - bool "EXT4 debugging support" + bool "Ext4 debugging support" depends on EXT4_FS help Enables run-time debugging support for the ext4 filesystem. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5012ddb6daf9..82ffdacdc7fa 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -425,6 +425,9 @@ struct flex_groups { /* Flags that are appropriate for non-directories/regular files. */ #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) +/* The only flags that should be swapped */ +#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) + /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { @@ -1661,6 +1664,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 +extern void ext4_update_dynamic_rev(struct super_block *sb); + #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ @@ -1669,6 +1674,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_compat |= \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } \ @@ -1686,6 +1692,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } \ @@ -1703,6 +1710,7 @@ static inline bool ext4_has_feature_##name(struct super_block *sb) \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ + ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_incompat |= \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } \ @@ -2666,7 +2674,6 @@ do { \ #endif -extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); extern int ext4_update_rocompat_feature(handle_t *handle, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 79d986dbf5af..0f89f5190cd7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2956,14 +2956,17 @@ again: if (err < 0) goto out; - } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) { + } else if (sbi->s_cluster_ratio > 1 && end >= ex_end && + partial.state == initial) { /* - * If there's an extent to the right its first cluster - * contains the immediate right boundary of the - * truncated/punched region. Set partial_cluster to - * its negative value so it won't be freed if shared - * with the current extent. The end < ee_block case - * is handled in ext4_ext_rm_leaf(). + * If we're punching, there's an extent to the right. + * If the partial cluster hasn't been set, set it to + * that extent's first cluster and its state to nofree + * so it won't be freed should it contain blocks to be + * removed. If it's already set (tofree/nofree), we're + * retrying and keep the original partial cluster info + * so a cluster marked tofree as a result of earlier + * extent removal is not lost. */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, @@ -4048,18 +4051,8 @@ out: } else allocated = ret; map->m_flags |= EXT4_MAP_NEW; - /* - * if we allocated more blocks than requested - * we need to make sure we unmap the extra block - * allocated. The actual needed block will get - * unmapped later when we find the buffer_head marked - * new. - */ - if (allocated > map->m_len) { - clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len, - allocated - map->m_len); + if (allocated > map->m_len) allocated = map->m_len; - } map->m_len = allocated; map_out: diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index e22dcfab308b..46b24da33a28 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -231,6 +231,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) break; case DX_HASH_HALF_MD4_UNSIGNED: str2hashbuf = str2hashbuf_unsigned; + /* fall through */ case DX_HASH_HALF_MD4: p = name; while (len > 0) { @@ -244,6 +245,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) break; case DX_HASH_TEA_UNSIGNED: str2hashbuf = str2hashbuf_unsigned; + /* fall through */ case DX_HASH_TEA: p = name; while (len > 0) { diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index bf7fa1507e81..c2225f0d31b5 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1183,18 +1183,21 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } + /* fall through */ case EXT4_IND_BLOCK: nr = i_data[EXT4_DIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } + /* fall through */ case EXT4_DIND_BLOCK: nr = i_data[EXT4_TIND_BLOCK]; if (nr) { ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } + /* fall through */ case EXT4_TIND_BLOCK: ; } @@ -1433,6 +1436,7 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); i_data[EXT4_IND_BLOCK] = 0; } + /* fall through */ case EXT4_IND_BLOCK: if (++n >= n2) return 0; @@ -1441,6 +1445,7 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); i_data[EXT4_DIND_BLOCK] = 0; } + /* fall through */ case EXT4_DIND_BLOCK: if (++n >= n2) return 0; @@ -1449,6 +1454,7 @@ do_indirects: ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); i_data[EXT4_TIND_BLOCK] = 0; } + /* fall through */ case EXT4_TIND_BLOCK: ; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4356ef6d728e..b54b261ded36 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -391,7 +391,7 @@ void ext4_da_update_reserve_space(struct inode *inode, * inode's preallocations. */ if ((ei->i_reserved_data_blocks == 0) && - (atomic_read(&inode->i_writecount) == 0)) + !inode_is_open_for_write(inode)) ext4_discard_preallocations(inode); } @@ -678,8 +678,6 @@ found: if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { - clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, - map->m_len); ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { @@ -1194,7 +1192,6 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { - clean_bdev_bh_alias(bh); if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); @@ -2489,10 +2486,6 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) } BUG_ON(map->m_len == 0); - if (map->m_flags & EXT4_MAP_NEW) { - clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, - map->m_len); - } return 0; } @@ -2835,12 +2828,12 @@ retry: goto unplug; } ret = mpage_prepare_extent_to_map(&mpd); + /* Unlock pages we didn't use */ + mpage_release_unused_pages(&mpd, false); /* Submit prepared bio */ ext4_io_submit(&mpd.io_submit); ext4_put_io_end_defer(mpd.io_submit.io_end); mpd.io_submit.io_end = NULL; - /* Unlock pages we didn't use */ - mpage_release_unused_pages(&mpd, false); if (ret < 0) goto unplug; @@ -2908,10 +2901,11 @@ retry: handle = NULL; mpd.do_map = 0; } - /* Submit prepared bio */ - ext4_io_submit(&mpd.io_submit); /* Unlock pages we didn't use */ mpage_release_unused_pages(&mpd, give_up_on_write); + /* Submit prepared bio */ + ext4_io_submit(&mpd.io_submit); + /* * Drop our io_end reference we got from init. We have * to be careful and use deferred io_end finishing if @@ -5349,7 +5343,6 @@ static int ext4_do_update_inode(handle_t *handle, err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (err) goto out_brelse; - ext4_update_dynamic_rev(sb); ext4_set_feature_large_file(sb); ext4_handle_sync(handle); err = ext4_handle_dirty_super(handle, sb); @@ -6000,7 +5993,7 @@ int ext4_expand_extra_isize(struct inode *inode, ext4_write_lock_xattr(inode, &no_expand); - BUFFER_TRACE(iloc.bh, "get_write_access"); + BUFFER_TRACE(iloc->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, iloc->bh); if (error) { brelse(iloc->bh); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index d26bcac291bb..3c4f8bb59f8a 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -63,18 +63,20 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) loff_t isize; struct ext4_inode_info *ei1; struct ext4_inode_info *ei2; + unsigned long tmp; ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); swap(inode1->i_version, inode2->i_version); - swap(inode1->i_blocks, inode2->i_blocks); - swap(inode1->i_bytes, inode2->i_bytes); swap(inode1->i_atime, inode2->i_atime); swap(inode1->i_mtime, inode2->i_mtime); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); - swap(ei1->i_flags, ei2->i_flags); + tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; + ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) | + (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP); + ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP); swap(ei1->i_disksize, ei2->i_disksize); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); @@ -115,28 +117,42 @@ static long swap_inode_boot_loader(struct super_block *sb, int err; struct inode *inode_bl; struct ext4_inode_info *ei_bl; - - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || - IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || - ext4_has_inline_data(inode)) - return -EINVAL; - - if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || - !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) - return -EPERM; + qsize_t size, size_bl, diff; + blkcnt_t blocks; + unsigned short bytes; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); - filemap_flush(inode->i_mapping); - filemap_flush(inode_bl->i_mapping); - /* Protect orig inodes against a truncate and make sure, * that only 1 swap_inode_boot_loader is running. */ lock_two_nondirectories(inode, inode_bl); + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || + IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || + (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) || + ext4_has_inline_data(inode)) { + err = -EINVAL; + goto journal_err_out; + } + + if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || + !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto journal_err_out; + } + + down_write(&EXT4_I(inode)->i_mmap_sem); + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto err_out; + + err = filemap_write_and_wait(inode_bl->i_mapping); + if (err) + goto err_out; + /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); @@ -147,7 +163,7 @@ static long swap_inode_boot_loader(struct super_block *sb, handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; - goto journal_err_out; + goto err_out; } /* Protect extent tree against block allocations via delalloc */ @@ -170,6 +186,13 @@ static long swap_inode_boot_loader(struct super_block *sb, memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); } + err = dquot_initialize(inode); + if (err) + goto err_out1; + + size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes; + size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes; + diff = size - size_bl; swap_inode_data(inode, inode_bl); inode->i_ctime = inode_bl->i_ctime = current_time(inode); @@ -183,27 +206,51 @@ static long swap_inode_boot_loader(struct super_block *sb, err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { + /* No need to update quota information. */ ext4_warning(inode->i_sb, "couldn't mark inode #%lu dirty (err %d)", inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); - } else { - err = ext4_mark_inode_dirty(handle, inode_bl); - if (err < 0) { - ext4_warning(inode_bl->i_sb, - "couldn't mark inode #%lu dirty (err %d)", - inode_bl->i_ino, err); - /* Revert all changes: */ - swap_inode_data(inode, inode_bl); - ext4_mark_inode_dirty(handle, inode); - ext4_mark_inode_dirty(handle, inode_bl); - } + goto err_out1; + } + + blocks = inode_bl->i_blocks; + bytes = inode_bl->i_bytes; + inode_bl->i_blocks = inode->i_blocks; + inode_bl->i_bytes = inode->i_bytes; + err = ext4_mark_inode_dirty(handle, inode_bl); + if (err < 0) { + /* No need to update quota information. */ + ext4_warning(inode_bl->i_sb, + "couldn't mark inode #%lu dirty (err %d)", + inode_bl->i_ino, err); + goto revert; + } + + /* Bootloader inode should not be counted into quota information. */ + if (diff > 0) + dquot_free_space(inode, diff); + else + err = dquot_alloc_space(inode, -1 * diff); + + if (err < 0) { +revert: + /* Revert all changes: */ + inode_bl->i_blocks = blocks; + inode_bl->i_bytes = bytes; + swap_inode_data(inode, inode_bl); + ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, inode_bl); } + +err_out1: ext4_journal_stop(handle); ext4_double_up_write_data_sem(inode, inode_bl); +err_out: + up_write(&EXT4_I(inode)->i_mmap_sem); journal_err_out: unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e2248083cdca..6fb76d408093 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4176,9 +4176,8 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; - if ((size == isize) && - !ext4_fs_is_busy(sbi) && - (atomic_read(&ac->ac_inode->i_writecount) == 0)) { + if ((size == isize) && !ext4_fs_is_busy(sbi) && + !inode_is_open_for_write(ac->ac_inode)) { ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; } @@ -4258,7 +4257,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, - atomic_read(&ar->inode->i_writecount) ? "" : "non-"); + inode_is_open_for_write(ar->inode) ? "" : "non-"); return 0; } diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 6f5305e9a6ac..3e9298e6a705 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -468,10 +468,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, ext4_io_submit(io); continue; } - if (buffer_new(bh)) { + if (buffer_new(bh)) clear_buffer_new(bh); - clean_bdev_bh_alias(bh); - } set_buffer_async_write(bh); nr_to_submit++; } while ((bh = bh->b_this_page) != head); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 48421de803b7..3d9b18505c0c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1960,7 +1960,8 @@ retry: le16_to_cpu(es->s_reserved_gdt_blocks); n_group = n_desc_blocks * EXT4_DESC_PER_BLOCK(sb); n_blocks_count = (ext4_fsblk_t)n_group * - EXT4_BLOCKS_PER_GROUP(sb); + EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(es->s_first_data_block); n_group--; /* set to last group number */ } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 60da0a6e4d86..f5b828bf1299 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2249,7 +2249,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); - ext4_update_dynamic_rev(sb); if (sbi->s_journal) ext4_set_feature_journal_needs_recovery(sb); diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 5e4e78fc0b3a..616c075da062 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -30,6 +30,7 @@ typedef enum { attr_feature, attr_pointer_ui, attr_pointer_atomic, + attr_journal_task, } attr_id_t; typedef enum { @@ -125,6 +126,14 @@ static ssize_t trigger_test_error(struct ext4_sb_info *sbi, return count; } +static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf) +{ + if (!sbi->s_journal) + return snprintf(buf, PAGE_SIZE, "<none>\n"); + return snprintf(buf, PAGE_SIZE, "%d\n", + task_pid_vnr(sbi->s_journal->j_task)); +} + #define EXT4_ATTR(_name,_mode,_id) \ static struct ext4_attr ext4_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -188,6 +197,7 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); EXT4_ATTR(first_error_time, 0444, first_error_time); EXT4_ATTR(last_error_time, 0444, last_error_time); +EXT4_ATTR(journal_task, 0444, journal_task); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -217,6 +227,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(errors_count), ATTR_LIST(first_error_time), ATTR_LIST(last_error_time), + ATTR_LIST(journal_task), NULL, }; @@ -304,6 +315,8 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return print_tstamp(buf, sbi->s_es, s_first_error_time); case attr_last_error_time: return print_tstamp(buf, sbi->s_es, s_last_error_time); + case attr_journal_task: + return journal_task_show(sbi, buf); } return 0; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 86ed9c686249..dc82e7757f67 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -829,6 +829,7 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { ret = PTR_ERR(bh); + bh = NULL; goto out; } @@ -2903,6 +2904,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, if (error == -EIO) EXT4_ERROR_INODE(inode, "block %llu read error", EXT4_I(inode)->i_file_acl); + bh = NULL; goto cleanup; } error = ext4_xattr_check_block(inode, bh); @@ -3059,6 +3061,7 @@ ext4_xattr_block_cache_find(struct inode *inode, if (IS_ERR(bh)) { if (PTR_ERR(bh) == -ENOMEM) return NULL; + bh = NULL; EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f955cd3e0677..a98e1b02279e 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -306,8 +306,9 @@ static int f2fs_write_meta_pages(struct address_space *mapping, goto skip_write; /* collect a number of dirty meta pages and write together */ - if (wbc->for_kupdate || - get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_META) < + nr_pages_to_skip(sbi, META)) goto skip_write; /* if locked failed, cp will flush dirty pages instead */ @@ -405,7 +406,7 @@ static int f2fs_set_meta_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); return 1; } @@ -956,7 +957,7 @@ void f2fs_update_dirty_page(struct inode *inode, struct page *page) inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); } @@ -1259,10 +1260,17 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) else __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG); + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK)) + __set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + else + __clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); + if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); - else - __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); + /* + * TODO: we count on fsck.f2fs to clear this flag until we figure out + * missing cases which clear it incorrectly. + */ if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 568e1d09eb48..9727944139f2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -301,9 +301,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, for (; start < F2FS_IO_SIZE(sbi); start++) { struct page *page = mempool_alloc(sbi->write_io_dummy, - GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL); + GFP_NOIO | __GFP_NOFAIL); f2fs_bug_on(sbi, !page); + zero_user_segment(page, 0, PAGE_SIZE); SetPagePrivate(page); set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); lock_page(page); @@ -1553,6 +1554,9 @@ static int f2fs_mpage_readpages(struct address_space *mapping, if (last_block > last_block_in_file) last_block = last_block_in_file; + /* just zeroing out page which is beyond EOF */ + if (block_in_file >= last_block) + goto zero_out; /* * Map blocks using the previous result first. */ @@ -1565,16 +1569,11 @@ static int f2fs_mpage_readpages(struct address_space *mapping, * Then do more f2fs_map_blocks() calls until we are * done with this page. */ - map.m_flags = 0; - - if (block_in_file < last_block) { - map.m_lblk = block_in_file; - map.m_len = last_block - block_in_file; + map.m_lblk = block_in_file; + map.m_len = last_block - block_in_file; - if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_DEFAULT)) - goto set_error_page; - } + if (f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT)) + goto set_error_page; got_it: if ((map.m_flags & F2FS_MAP_MAPPED)) { block_nr = map.m_pblk + block_in_file - map.m_lblk; @@ -1589,6 +1588,7 @@ got_it: DATA_GENERIC)) goto set_error_page; } else { +zero_out: zero_user_segment(page, 0, PAGE_SIZE); if (!PageUptodate(page)) SetPageUptodate(page); @@ -1863,8 +1863,13 @@ got_it: if (fio->need_lock == LOCK_REQ) f2fs_unlock_op(fio->sbi); err = f2fs_inplace_write_data(fio); - if (err && PageWriteback(page)) - end_page_writeback(page); + if (err) { + if (f2fs_encrypted_file(inode)) + fscrypt_pullback_bio_page(&fio->encrypted_page, + true); + if (PageWriteback(page)) + end_page_writeback(page); + } trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); return err; @@ -2315,7 +2320,8 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); - f2fs_truncate_blocks(inode, i_size, true, true); + if (!IS_NOQUOTA(inode)) + f2fs_truncate_blocks(inode, i_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -2585,14 +2591,11 @@ static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode, { struct f2fs_private_dio *dio; bool write = (bio_op(bio) == REQ_OP_WRITE); - int err; dio = f2fs_kzalloc(F2FS_I_SB(inode), sizeof(struct f2fs_private_dio), GFP_NOFS); - if (!dio) { - err = -ENOMEM; + if (!dio) goto out; - } dio->inode = inode; dio->orig_end_io = bio->bi_end_io; @@ -2710,12 +2713,10 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, clear_cold_data(page); - /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) return f2fs_drop_inmem_page(inode, page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); } int f2fs_release_page(struct page *page, gfp_t wait) @@ -2729,8 +2730,7 @@ int f2fs_release_page(struct page *page, gfp_t wait) return 0; clear_cold_data(page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); return 1; } @@ -2798,12 +2798,8 @@ int f2fs_migrate_page(struct address_space *mapping, return -EAGAIN; } - /* - * A reference is expected if PagePrivate set when move mapping, - * however F2FS breaks this for maintaining dirty page counts when - * truncating pages. So here adjusting the 'extra_count' make it work. - */ - extra_count = (atomic_written ? 1 : 0) - page_has_private(page); + /* one extra reference was held for atomic_write page */ + extra_count = atomic_written ? 1 : 0; rc = migrate_page_move_mapping(mapping, newpage, page, mode, extra_count); if (rc != MIGRATEPAGE_SUCCESS) { @@ -2824,9 +2820,10 @@ int f2fs_migrate_page(struct address_space *mapping, get_page(newpage); } - if (PagePrivate(page)) - SetPagePrivate(newpage); - set_page_private(newpage, page_private(page)); + if (PagePrivate(page)) { + f2fs_set_page_private(newpage, page_private(page)); + f2fs_clear_page_private(page); + } if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fd7f170e2f2d..99e9a5c37b71 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -96,8 +96,10 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->free_secs = free_sections(sbi); si->prefree_count = prefree_segments(sbi); si->dirty_count = dirty_segments(sbi); - si->node_pages = NODE_MAPPING(sbi)->nrpages; - si->meta_pages = META_MAPPING(sbi)->nrpages; + if (sbi->node_inode) + si->node_pages = NODE_MAPPING(sbi)->nrpages; + if (sbi->meta_inode) + si->meta_pages = META_MAPPING(sbi)->nrpages; si->nats = NM_I(sbi)->nat_cnt; si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); @@ -175,7 +177,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi) static void update_mem_info(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); - unsigned npages; int i; if (si->base_mem) @@ -258,10 +259,14 @@ get_cache: sizeof(struct extent_node); si->page_mem = 0; - npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_SHIFT; - npages = META_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + if (sbi->node_inode) { + unsigned npages = NODE_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } + if (sbi->meta_inode) { + unsigned npages = META_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; + } } static int stat_show(struct seq_file *s, void *v) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 713b36a10a79..59bc46017855 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -728,7 +728,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, !f2fs_truncate_hole(dir, page->index, page->index + 1)) { f2fs_clear_page_cache_dirty_tag(page); clear_page_dirty_for_io(page); - ClearPagePrivate(page); + f2fs_clear_page_private(page); ClearPageUptodate(page); clear_cold_data(page); inode_dec_dirty_pages(dir); @@ -800,6 +800,10 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (de->name_len == 0) { bit_pos++; ctx->pos = start_pos + bit_pos; + printk_ratelimited( + "%s, invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, le32_to_cpu(de->ino)); + set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } @@ -810,7 +814,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, /* check memory boundary before moving forward */ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); - if (unlikely(bit_pos > d->max)) { + if (unlikely(bit_pos > d->max || + le16_to_cpu(de->name_len) > F2FS_NAME_LEN)) { f2fs_msg(sbi->sb, KERN_WARNING, "%s: corrupted namelen=%d, run fsck to fix.", __func__, le16_to_cpu(de->name_len)); @@ -891,7 +896,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = f2fs_get_lock_data_page(inode, n, false); + dentry_page = f2fs_find_data_page(inode, n); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); if (err == -ENOENT) { @@ -909,11 +914,11 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); if (err) { - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); break; } - f2fs_put_page(dentry_page, 1); + f2fs_put_page(dentry_page, 0); } out_free: fscrypt_fname_free_buffer(&fstr); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 1cb0fcc67d2d..caf77fe8ac07 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -506,7 +506,7 @@ static void f2fs_update_extent_tree_range(struct inode *inode, unsigned int end = fofs + len; unsigned int pos = (unsigned int)fofs; bool updated = false; - bool leftmost; + bool leftmost = false; if (!et) return; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7ea5c9cede37..87f75ebd2fd6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -190,6 +190,8 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ +#define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ struct cp_control { int reason; @@ -253,7 +255,7 @@ struct discard_entry { /* max discard pend list number */ #define MAX_PLIST_NUM 512 #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ - (MAX_PLIST_NUM - 1) : (blk_num - 1)) + (MAX_PLIST_NUM - 1) : ((blk_num) - 1)) enum { D_PREP, /* initial */ @@ -309,6 +311,7 @@ struct discard_policy { bool sync; /* submit discard with REQ_SYNC flag */ bool ordered; /* issue discard by lba order */ unsigned int granularity; /* discard granularity */ + int timeout; /* discard timeout for put_super */ }; struct discard_cmd_control { @@ -455,7 +458,6 @@ struct f2fs_flush_device { /* for inline stuff */ #define DEF_INLINE_RESERVED_SIZE 1 -#define DEF_MIN_INLINE_SIZE 1 static inline int get_extra_isize(struct inode *inode); static inline int get_inline_xattr_addrs(struct inode *inode); #define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ @@ -1098,6 +1100,7 @@ enum { SBI_IS_SHUTDOWN, /* shutdown by ioctl */ SBI_IS_RECOVERED, /* recovered orphan/data */ SBI_CP_DISABLED, /* CP was disabled last mount */ + SBI_CP_DISABLED_QUICK, /* CP was disabled quickly */ SBI_QUOTA_NEED_FLUSH, /* need to flush quota info in CP */ SBI_QUOTA_SKIP_FLUSH, /* skip flushing quota in current CP */ SBI_QUOTA_NEED_REPAIR, /* quota file may be corrupted */ @@ -1109,6 +1112,7 @@ enum { DISCARD_TIME, GC_TIME, DISABLE_TIME, + UMOUNT_DISCARD_TIMEOUT, MAX_TIME, }; @@ -1237,8 +1241,6 @@ struct f2fs_sb_info { unsigned int nquota_files; /* # of quota sysfile */ - u32 s_next_generation; /* for NFS support */ - /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ @@ -1798,13 +1800,12 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || - count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA || - count_type == F2FS_RD_DATA || count_type == F2FS_RD_NODE || - count_type == F2FS_RD_META) - return; - - set_sbi_flag(sbi, SBI_IS_DIRTY); + if (count_type == F2FS_DIRTY_DENTS || + count_type == F2FS_DIRTY_NODES || + count_type == F2FS_DIRTY_META || + count_type == F2FS_DIRTY_QDATA || + count_type == F2FS_DIRTY_IMETA) + set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) @@ -2156,10 +2157,17 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) || get_pages(sbi, F2FS_WB_CP_DATA) || get_pages(sbi, F2FS_DIO_READ) || - get_pages(sbi, F2FS_DIO_WRITE) || - atomic_read(&SM_I(sbi)->dcc_info->queued_discard) || - atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + get_pages(sbi, F2FS_DIO_WRITE)) return false; + + if (SM_I(sbi) && SM_I(sbi)->dcc_info && + atomic_read(&SM_I(sbi)->dcc_info->queued_discard)) + return false; + + if (SM_I(sbi) && SM_I(sbi)->fcc_info && + atomic_read(&SM_I(sbi)->fcc_info->queued_flush)) + return false; + return f2fs_time_over(sbi, type); } @@ -2300,11 +2308,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) #define F2FS_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define F2FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ #define F2FS_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ +#define F2FS_NOCOW_FL 0x00800000 /* Do not cow file */ #define F2FS_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define F2FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define F2FS_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define F2FS_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define F2FS_FL_USER_VISIBLE 0x30CBDFFF /* User visible flags */ #define F2FS_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ /* Flags we can manipulate with through F2FS_IOC_FSSETXATTR */ @@ -2761,9 +2770,9 @@ static inline int get_inline_xattr_addrs(struct inode *inode) #define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) #define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ - ((offsetof(typeof(*f2fs_inode), field) + \ + ((offsetof(typeof(*(f2fs_inode)), field) + \ sizeof((f2fs_inode)->field)) \ - <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) { @@ -2792,8 +2801,8 @@ static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, #define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) -#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO(fio->type) == META && \ - (!is_read_io(fio->op) || fio->is_meta)) +#define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META && \ + (!is_read_io((fio)->op) || (fio)->is_meta)) bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); @@ -2825,13 +2834,33 @@ static inline bool is_valid_data_blkaddr(struct f2fs_sb_info *sbi, return true; } +static inline void f2fs_set_page_private(struct page *page, + unsigned long data) +{ + if (PagePrivate(page)) + return; + + get_page(page); + SetPagePrivate(page); + set_page_private(page, data); +} + +static inline void f2fs_clear_page_private(struct page *page) +{ + if (!PagePrivate(page)) + return; + + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); +} + /* * file.c */ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); void f2fs_truncate_data_blocks(struct dnode_of_data *dn); -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, - bool buf_write); +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); int f2fs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); @@ -3005,7 +3034,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); -bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi); void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi); @@ -3610,8 +3639,6 @@ extern void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, #define f2fs_build_fault_attr(sbi, rate, type) do { } while (0) #endif -#endif - static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) { #ifdef CONFIG_QUOTA @@ -3624,3 +3651,5 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) #endif return false; } + +#endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ba5954f41e14..5742ab8b57dc 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -589,8 +589,7 @@ truncate_out: return 0; } -int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, - bool buf_write) +int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; @@ -598,7 +597,6 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, int count = 0, err = 0; struct page *ipage; bool truncate_page = false; - int flag = buf_write ? F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO; trace_f2fs_truncate_blocks_enter(inode, from); @@ -608,7 +606,7 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock, goto free_partial; if (lock) - __do_map_lock(sbi, flag, true); + f2fs_lock_op(sbi); ipage = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { @@ -646,7 +644,7 @@ free_next: err = f2fs_truncate_inode_blocks(inode, free_from); out: if (lock) - __do_map_lock(sbi, flag, false); + f2fs_unlock_op(sbi); free_partial: /* lastly zero out the first data page */ if (!err) @@ -681,7 +679,7 @@ int f2fs_truncate(struct inode *inode) return err; } - err = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); + err = f2fs_truncate_blocks(inode, i_size_read(inode), true); if (err) return err; @@ -768,7 +766,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; - bool size_changed = false; if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; @@ -843,8 +840,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) down_write(&F2FS_I(inode)->i_sem); F2FS_I(inode)->last_disk_size = i_size_read(inode); up_write(&F2FS_I(inode)->i_sem); - - size_changed = true; } __setattr_copy(inode, attr); @@ -858,7 +853,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } /* file size may changed here */ - f2fs_mark_inode_dirty_sync(inode, size_changed); + f2fs_mark_inode_dirty_sync(inode, true); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -1262,7 +1257,7 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) new_size = i_size_read(inode) - len; truncate_pagecache(inode, new_size); - ret = f2fs_truncate_blocks(inode, new_size, true, false); + ret = f2fs_truncate_blocks(inode, new_size, true); up_write(&F2FS_I(inode)->i_mmap_sem); if (!ret) f2fs_i_size_write(inode, new_size); @@ -1447,7 +1442,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); down_write(&F2FS_I(inode)->i_mmap_sem); - ret = f2fs_truncate_blocks(inode, i_size_read(inode), true, false); + ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); up_write(&F2FS_I(inode)->i_mmap_sem); if (ret) return ret; @@ -1651,6 +1646,8 @@ static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) flags |= F2FS_ENCRYPT_FL; if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) flags |= F2FS_INLINE_DATA_FL; + if (is_inode_flag_set(inode, FI_PIN_FILE)) + flags |= F2FS_NOCOW_FL; flags &= F2FS_FL_USER_VISIBLE; @@ -1750,10 +1747,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (!get_dirty_pages(inode)) - goto skip_flush; - - f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + /* + * Should wait end_io to count F2FS_WB_CP_DATA correctly by + * f2fs_is_atomic_file. + */ + if (get_dirty_pages(inode)) + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); @@ -1761,7 +1760,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } -skip_flush: + set_inode_flag(inode, FI_ATOMIC_FILE); clear_inode_flag(inode, FI_ATOMIC_REVOKE_REQUEST); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -1968,11 +1967,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) break; case F2FS_GOING_DOWN_NEED_FSCK: set_sbi_flag(sbi, SBI_NEED_FSCK); + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + set_sbi_flag(sbi, SBI_IS_DIRTY); /* do checkpoint only */ ret = f2fs_sync_fs(sb, 1); - if (ret) - goto out; - break; + goto out; default: ret = -EINVAL; goto out; @@ -1988,6 +1987,9 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) out: if (in != F2FS_GOING_DOWN_FULLSYNC) mnt_drop_write_file(filp); + + trace_f2fs_shutdown(sbi, in, ret); + return ret; } @@ -2871,8 +2873,8 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg) __u32 pin; int ret = 0; - if (!inode_owner_or_capable(inode)) - return -EACCES; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; if (get_user(pin, (__u32 __user *)arg)) return -EFAULT; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index d636cbcf68f2..bb6a152310ef 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -298,7 +298,7 @@ process_inline: clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { - if (f2fs_truncate_blocks(inode, 0, false, false)) + if (f2fs_truncate_blocks(inode, 0, false)) return false; goto process_inline; } @@ -470,7 +470,7 @@ static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) return 0; punch_dentry_pages: truncate_inode_pages(&dir->i_data, 0); - f2fs_truncate_blocks(dir, 0, false, false); + f2fs_truncate_blocks(dir, 0, false); f2fs_remove_dirty_inode(dir); return err; } @@ -659,6 +659,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (IS_ERR(ipage)) return PTR_ERR(ipage); + /* + * f2fs_readdir was protected by inode.i_rwsem, it is safe to access + * ipage without page's lock held. + */ + unlock_page(ipage); + inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -667,7 +673,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (!err) ctx->pos = d.max; - f2fs_put_page(ipage, 1); + f2fs_put_page(ipage, 0); return err < 0 ? err : 0; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d910a820ae67..e7f2e8759315 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -14,6 +14,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include <trace/events/f2fs.h> @@ -248,6 +249,20 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (f2fs_has_extra_attr(inode) && + f2fs_sb_has_flexible_inline_xattr(sbi) && + f2fs_has_inline_xattr(inode) && + (!fi->i_inline_xattr_size || + fi->i_inline_xattr_size > MAX_INLINE_XATTR_SIZE)) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: inode (ino=%lx) has corrupted " + "i_inline_xattr_size: %d, max: %zu", + __func__, inode->i_ino, fi->i_inline_xattr_size, + MAX_INLINE_XATTR_SIZE); + return false; + } + if (F2FS_I(inode)->extent_tree) { struct extent_info *ei = &F2FS_I(inode)->extent_tree->largest; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e967d27c1a89..f5e34e467003 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/sched.h> #include <linux/ctype.h> +#include <linux/random.h> #include <linux/dcache.h> #include <linux/namei.h> #include <linux/quotaops.h> @@ -50,7 +51,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); F2FS_I(inode)->i_crtime = inode->i_mtime; - inode->i_generation = sbi->s_next_generation++; + inode->i_generation = prandom_u32(); if (S_ISDIR(inode->i_mode)) F2FS_I(inode)->i_current_depth = 1; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4f450e573312..3f99ab288695 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1920,7 +1920,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, f2fs_balance_fs_bg(sbi); /* collect a number of dirty node pages and write together */ - if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) + if (wbc->sync_mode != WB_SYNC_ALL && + get_pages(sbi, F2FS_DIRTY_NODES) < + nr_pages_to_skip(sbi, NODE)) goto skip_write; if (wbc->sync_mode == WB_SYNC_ALL) @@ -1959,7 +1961,7 @@ static int f2fs_set_node_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); - SetPagePrivate(page); + f2fs_set_page_private(page, 0); f2fs_trace_pid(page); return 1; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 9b79056d705d..aa7fe79b62b2 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -191,8 +191,7 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) f2fs_trace_pid(page); - set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); - SetPagePrivate(page); + f2fs_set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -215,7 +214,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) } static int __revoke_inmem_pages(struct inode *inode, - struct list_head *head, bool drop, bool recover) + struct list_head *head, bool drop, bool recover, + bool trylock) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct inmem_pages *cur, *tmp; @@ -227,7 +227,16 @@ static int __revoke_inmem_pages(struct inode *inode, if (drop) trace_f2fs_commit_inmem_page(page, INMEM_DROP); - lock_page(page); + if (trylock) { + /* + * to avoid deadlock in between page lock and + * inmem_lock. + */ + if (!trylock_page(page)) + continue; + } else { + lock_page(page); + } f2fs_wait_on_page_writeback(page, DATA, true, true); @@ -270,8 +279,7 @@ next: ClearPageUptodate(page); clear_cold_data(page); } - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); f2fs_put_page(page, 1); list_del(&cur->list); @@ -318,13 +326,19 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - mutex_lock(&fi->inmem_lock); - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); - mutex_unlock(&fi->inmem_lock); + while (!list_empty(&fi->inmem_pages)) { + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, true); + + if (list_empty(&fi->inmem_pages)) { + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } + mutex_unlock(&fi->inmem_lock); + } clear_inode_flag(inode, FI_ATOMIC_FILE); fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; @@ -354,8 +368,7 @@ void f2fs_drop_inmem_page(struct inode *inode, struct page *page) kmem_cache_free(inmem_entry_slab, cur); ClearPageUptodate(page); - set_page_private(page, 0); - ClearPagePrivate(page); + f2fs_clear_page_private(page); f2fs_put_page(page, 0); trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); @@ -429,12 +442,15 @@ retry: * recovery or rewrite & commit last transaction. For other * error number, revoking was done by filesystem itself. */ - err = __revoke_inmem_pages(inode, &revoke_list, false, true); + err = __revoke_inmem_pages(inode, &revoke_list, + false, true, false); /* drop all uncommitted pages */ - __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + __revoke_inmem_pages(inode, &fi->inmem_pages, + true, false, false); } else { - __revoke_inmem_pages(inode, &revoke_list, false, false); + __revoke_inmem_pages(inode, &revoke_list, + false, false, false); } return err; @@ -542,9 +558,13 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { - struct bio *bio = f2fs_bio_alloc(sbi, 0, true); + struct bio *bio; int ret; + bio = f2fs_bio_alloc(sbi, 0, false); + if (!bio) + return -ENOMEM; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; bio_set_dev(bio, bdev); ret = submit_bio_wait(bio); @@ -868,6 +888,9 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi) if (holes[DATA] > ovp || holes[NODE] > ovp) return -EAGAIN; + if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) && + dirty_segments(sbi) > overprovision_segments(sbi)) + return -EAGAIN; return 0; } @@ -1037,6 +1060,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST; dpolicy->io_aware_gran = MAX_PLIST_NUM; + dpolicy->timeout = 0; if (discard_type == DPOLICY_BG) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -1059,6 +1083,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->max_requests = UINT_MAX; dpolicy->io_aware = false; + /* we need to issue all to keep CP_TRIMMED_FLAG */ + dpolicy->granularity = 1; } } @@ -1424,7 +1450,14 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi, int i, issued = 0; bool io_interrupted = false; + if (dpolicy->timeout != 0) + f2fs_update_time(sbi, dpolicy->timeout); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + if (dpolicy->timeout != 0 && + f2fs_time_over(sbi, dpolicy->timeout)) + break; + if (i + 1 < dpolicy->granularity) break; @@ -1611,7 +1644,7 @@ void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) } /* This comes from f2fs_put_super */ -bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_policy dpolicy; @@ -1619,6 +1652,7 @@ bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); + dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT; __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); @@ -3164,10 +3198,10 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) stat_inc_inplace_blocks(fio->sbi); err = f2fs_submit_page_bio(fio); - if (!err) + if (!err) { update_device_state(fio); - - f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + } return err; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index a77f76f528b6..5c7ed0442d6e 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -865,7 +865,7 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) } } mutex_unlock(&dcc->cmd_lock); - if (!wakeup) + if (!wakeup || !is_idle(sbi, DISCARD_TIME)) return; wake_up: dcc->discard_wake = 1; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d1ccc52afc93..f2aaa2cc6b3e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -269,7 +269,7 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype, if (!qname) { f2fs_msg(sb, KERN_ERR, "Not enough memory for storing quotafile name"); - return -EINVAL; + return -ENOMEM; } if (F2FS_OPTION(sbi).s_qf_names[qtype]) { if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0) @@ -586,7 +586,7 @@ static int parse_options(struct super_block *sb, char *options) case Opt_io_size_bits: if (args->from && match_int(args, &arg)) return -EINVAL; - if (arg > __ilog2_u32(BIO_MAX_PAGES)) { + if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_PAGES)) { f2fs_msg(sb, KERN_WARNING, "Not support %d, larger than %d", 1 << arg, BIO_MAX_PAGES); @@ -821,6 +821,8 @@ static int parse_options(struct super_block *sb, char *options) } if (test_opt(sbi, INLINE_XATTR_SIZE)) { + int min_size, max_size; + if (!f2fs_sb_has_extra_attr(sbi) || !f2fs_sb_has_flexible_inline_xattr(sbi)) { f2fs_msg(sb, KERN_ERR, @@ -834,14 +836,15 @@ static int parse_options(struct super_block *sb, char *options) "set with inline_xattr option"); return -EINVAL; } - if (!F2FS_OPTION(sbi).inline_xattr_size || - F2FS_OPTION(sbi).inline_xattr_size >= - DEF_ADDRS_PER_INODE - - F2FS_TOTAL_EXTRA_ATTR_SIZE - - DEF_INLINE_RESERVED_SIZE - - DEF_MIN_INLINE_SIZE) { + + min_size = sizeof(struct f2fs_xattr_header) / sizeof(__le32); + max_size = MAX_INLINE_XATTR_SIZE; + + if (F2FS_OPTION(sbi).inline_xattr_size < min_size || + F2FS_OPTION(sbi).inline_xattr_size > max_size) { f2fs_msg(sb, KERN_ERR, - "inline xattr size is out of range"); + "inline xattr size is out of range: %d ~ %d", + min_size, max_size); return -EINVAL; } } @@ -915,6 +918,10 @@ static int f2fs_drop_inode(struct inode *inode) sb_start_intwrite(inode->i_sb); f2fs_i_size_write(inode, 0); + f2fs_submit_merged_write_cond(F2FS_I_SB(inode), + inode, NULL, 0, DATA); + truncate_inode_pages_final(inode->i_mapping); + if (F2FS_HAS_BLOCKS(inode)) f2fs_truncate(inode); @@ -1048,7 +1055,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - dropped = f2fs_wait_discard_bios(sbi); + dropped = f2fs_issue_discard_timeout(sbi); if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) && !sbi->discard_blks && !dropped) { @@ -1075,7 +1082,10 @@ static void f2fs_put_super(struct super_block *sb) f2fs_bug_on(sbi, sbi->fsync_node_num); iput(sbi->node_inode); + sbi->node_inode = NULL; + iput(sbi->meta_inode); + sbi->meta_inode = NULL; /* * iput() can update stat information, if f2fs_write_checkpoint() @@ -1455,9 +1465,16 @@ static int f2fs_enable_quotas(struct super_block *sb); static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) { + unsigned int s_flags = sbi->sb->s_flags; struct cp_control cpc; - int err; + int err = 0; + int ret; + if (s_flags & SB_RDONLY) { + f2fs_msg(sbi->sb, KERN_ERR, + "checkpoint=disable on readonly fs"); + return -EINVAL; + } sbi->sb->s_flags |= SB_ACTIVE; f2fs_update_time(sbi, DISABLE_TIME); @@ -1465,18 +1482,24 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) while (!f2fs_time_over(sbi, DISABLE_TIME)) { mutex_lock(&sbi->gc_mutex); err = f2fs_gc(sbi, true, false, NULL_SEGNO); - if (err == -ENODATA) + if (err == -ENODATA) { + err = 0; break; + } if (err && err != -EAGAIN) - return err; + break; } - err = sync_filesystem(sbi->sb); - if (err) - return err; + ret = sync_filesystem(sbi->sb); + if (ret || err) { + err = ret ? ret: err; + goto restore_flag; + } - if (f2fs_disable_cp_again(sbi)) - return -EAGAIN; + if (f2fs_disable_cp_again(sbi)) { + err = -EAGAIN; + goto restore_flag; + } mutex_lock(&sbi->gc_mutex); cpc.reason = CP_PAUSE; @@ -1485,7 +1508,9 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) sbi->unusable_block_count = 0; mutex_unlock(&sbi->gc_mutex); - return 0; +restore_flag: + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return err; } static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) @@ -2023,6 +2048,12 @@ void f2fs_quota_off_umount(struct super_block *sb) set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); } } + /* + * In case of checkpoint=disable, we must flush quota blocks. + * This can cause NULL exception for node_inode in end_io, since + * put_super already dropped it. + */ + sync_filesystem(sb); } static void f2fs_truncate_quota_inode_pages(struct super_block *sb) @@ -2703,6 +2734,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; + sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] = + DEF_UMOUNT_DISCARD_TIMEOUT; clear_sbi_flag(sbi, SBI_NEED_FSCK); for (i = 0; i < NR_COUNT_TYPE; i++) @@ -3022,10 +3055,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) struct f2fs_super_block *raw_super; struct inode *root; int err; - bool retry = true, need_fsck = false; + bool skip_recovery = false, need_fsck = false; char *options = NULL; int recovery, i, valid_super_block; struct curseg_info *seg_i; + int retry_cnt = 1; try_onemore: err = -EINVAL; @@ -3097,7 +3131,6 @@ try_onemore: sb->s_maxbytes = sbi->max_file_blocks << le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; - get_random_bytes(&sbi->s_next_generation, sizeof(u32)); #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; @@ -3200,6 +3233,10 @@ try_onemore: if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_QUOTA_NEED_FSCK_FLAG)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + if (__is_set_ckpt_flags(F2FS_CKPT(sbi), CP_DISABLED_QUICK_FLAG)) { + set_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_QUICK_INTERVAL; + } /* Initialize device list */ err = f2fs_scan_devices(sbi); @@ -3288,7 +3325,7 @@ try_onemore: sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { err = -ENOMEM; - goto free_root_inode; + goto free_node_inode; } err = f2fs_register_sysfs(sbi); @@ -3310,7 +3347,7 @@ try_onemore: goto free_meta; if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) - goto skip_recovery; + goto reset_checkpoint; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -3327,11 +3364,13 @@ try_onemore: if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); - if (!retry) - goto skip_recovery; + if (skip_recovery) + goto reset_checkpoint; err = f2fs_recover_fsync_data(sbi, false); if (err < 0) { + if (err != -ENOMEM) + skip_recovery = true; need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); @@ -3347,14 +3386,14 @@ try_onemore: goto free_meta; } } -skip_recovery: +reset_checkpoint: /* f2fs_recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) - goto free_meta; + goto sync_free_meta; } else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) { f2fs_enable_checkpoint(sbi); } @@ -3367,7 +3406,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) - goto free_meta; + goto sync_free_meta; } kvfree(options); @@ -3387,8 +3426,14 @@ skip_recovery: cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); + clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); return 0; +sync_free_meta: + /* safe to flush all the data */ + sync_filesystem(sbi->sb); + retry_cnt = 0; + free_meta: #ifdef CONFIG_QUOTA f2fs_truncate_quota_inode_pages(sb); @@ -3402,6 +3447,8 @@ free_meta: * falls into an infinite loop in f2fs_sync_meta_pages(). */ truncate_inode_pages_final(META_MAPPING(sbi)); + /* evict some inodes being cached by GC */ + evict_inodes(sb); f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); @@ -3410,6 +3457,7 @@ free_node_inode: f2fs_release_ino_entry(sbi, true); truncate_inode_pages_final(NODE_MAPPING(sbi)); iput(sbi->node_inode); + sbi->node_inode = NULL; free_stats: f2fs_destroy_stats(sbi); free_nm: @@ -3422,6 +3470,7 @@ free_devices: free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); + sbi->meta_inode = NULL; free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_percpu: @@ -3443,8 +3492,8 @@ free_sbi: kvfree(sbi); /* give only one another chance */ - if (retry) { - retry = false; + if (retry_cnt > 0 && skip_recovery) { + retry_cnt--; shrink_dcache_sb(sb); goto try_onemore; } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 70da6801c86f..729f46a3c9ee 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -222,6 +222,8 @@ out: #ifdef CONFIG_F2FS_FAULT_INJECTION if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) return -EINVAL; + if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX) + return -EINVAL; #endif if (a->struct_type == RESERVED_BLOCKS) { spin_lock(&sbi->stat_lock); @@ -278,10 +280,16 @@ out: return count; } - *ui = t; - if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) - f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "iostat_enable")) { + sbi->iostat_enable = !!t; + if (!sbi->iostat_enable) + f2fs_reset_iostat(sbi); + return count; + } + + *ui = (unsigned int)t; + return count; } @@ -418,6 +426,8 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, interval_time[DISCARD_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, + umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); @@ -475,6 +485,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(idle_interval), ATTR_LIST(discard_idle_interval), ATTR_LIST(gc_idle_interval), + ATTR_LIST(umount_discard_timeout), ATTR_LIST(iostat_enable), ATTR_LIST(readdir_ra), ATTR_LIST(gc_pin_file_thresh), diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index ce2a5eb210b6..d0ab533a9ce8 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -14,7 +14,7 @@ #include "trace.h" static RADIX_TREE(pids, GFP_ATOMIC); -static struct mutex pids_lock; +static spinlock_t pids_lock; static struct last_io_info last_io; static inline void __print_last_io(void) @@ -58,23 +58,29 @@ void f2fs_trace_pid(struct page *page) set_page_private(page, (unsigned long)pid); +retry: if (radix_tree_preload(GFP_NOFS)) return; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); p = radix_tree_lookup(&pids, pid); if (p == current) goto out; if (p) radix_tree_delete(&pids, pid); - f2fs_radix_tree_insert(&pids, pid, current); + if (radix_tree_insert(&pids, pid, current)) { + spin_unlock(&pids_lock); + radix_tree_preload_end(); + cond_resched(); + goto retry; + } trace_printk("%3x:%3x %4x %-16s\n", MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), pid, current->comm); out: - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); radix_tree_preload_end(); } @@ -119,7 +125,7 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) void f2fs_build_trace_ios(void) { - mutex_init(&pids_lock); + spin_lock_init(&pids_lock); } #define PIDVEC_SIZE 128 @@ -147,7 +153,7 @@ void f2fs_destroy_trace_ios(void) pid_t next_pid = 0; unsigned int found; - mutex_lock(&pids_lock); + spin_lock(&pids_lock); while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) { unsigned idx; @@ -155,5 +161,5 @@ void f2fs_destroy_trace_ios(void) for (idx = 0; idx < found; idx++) radix_tree_delete(&pids, pid[idx]); } - mutex_unlock(&pids_lock); + spin_unlock(&pids_lock); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 18d5ffbc5e8c..848a785abe25 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -224,11 +224,11 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, { struct f2fs_xattr_entry *entry; unsigned int inline_size = inline_xattr_size(inode); + void *max_addr = base_addr + inline_size; list_for_each_xattr(entry, base_addr) { - if ((void *)entry + sizeof(__u32) > base_addr + inline_size || - (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > - base_addr + inline_size) { + if ((void *)entry + sizeof(__u32) > max_addr || + (void *)XATTR_NEXT_ENTRY(entry) > max_addr) { *last_addr = entry; return NULL; } @@ -239,6 +239,13 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, if (!memcmp(entry->e_name, name, len)) break; } + + /* inline xattr header or entry across max inline xattr size */ + if (IS_XATTR_LAST_ENTRY(entry) && + (void *)entry + sizeof(__u32) > max_addr) { + *last_addr = entry; + return NULL; + } return entry; } @@ -340,7 +347,7 @@ check: *base_addr = txattr_addr; return 0; out: - kzfree(txattr_addr); + kvfree(txattr_addr); return err; } @@ -383,7 +390,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, *base_addr = txattr_addr; return 0; fail: - kzfree(txattr_addr); + kvfree(txattr_addr); return err; } @@ -510,7 +517,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, } error = size; out: - kzfree(base_addr); + kvfree(base_addr); return error; } @@ -538,7 +545,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if (!handler || (handler->list && !handler->list(dentry))) continue; - prefix = handler->prefix ?: handler->name; + prefix = xattr_prefix(handler); prefix_len = strlen(prefix); size = prefix_len + entry->e_name_len + 1; if (buffer) { @@ -556,7 +563,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) } error = buffer_size - rest; cleanup: - kzfree(base_addr); + kvfree(base_addr); return error; } @@ -687,7 +694,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: - kzfree(base_addr); + kvfree(base_addr); return error; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 67db134da0f5..9172ee082ca8 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -78,6 +78,12 @@ struct f2fs_xattr_entry { sizeof(struct f2fs_xattr_header) - \ sizeof(struct f2fs_xattr_entry)) +#define MAX_INLINE_XATTR_SIZE \ + (DEF_ADDRS_PER_INODE - \ + F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \ + DEF_INLINE_RESERVED_SIZE - \ + MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) + /* * On-disk structure of f2fs_xattr * We use inline xattrs space + 1 block for xattr. diff --git a/fs/filesystems.c b/fs/filesystems.c index b03f57b1105b..9135646e41ac 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> +#include <linux/fs_parser.h> /* * Handling of filesystem drivers list. @@ -73,6 +74,9 @@ int register_filesystem(struct file_system_type * fs) int res = 0; struct file_system_type ** p; + if (fs->parameters && !fs_validate_description(fs->parameters)) + return -EINVAL; + BUG_ON(strchr(fs->name, '.')); if (fs->next) return -EBUSY; diff --git a/fs/fs_context.c b/fs/fs_context.c new file mode 100644 index 000000000000..87e3546b9a52 --- /dev/null +++ b/fs/fs_context.c @@ -0,0 +1,642 @@ +/* Provide a way to create a superblock configuration context within the kernel + * that allows a superblock to be set up prior to mounting. + * + * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/nsproxy.h> +#include <linux/slab.h> +#include <linux/magic.h> +#include <linux/security.h> +#include <linux/mnt_namespace.h> +#include <linux/pid_namespace.h> +#include <linux/user_namespace.h> +#include <net/net_namespace.h> +#include "mount.h" +#include "internal.h" + +enum legacy_fs_param { + LEGACY_FS_UNSET_PARAMS, + LEGACY_FS_MONOLITHIC_PARAMS, + LEGACY_FS_INDIVIDUAL_PARAMS, +}; + +struct legacy_fs_context { + char *legacy_data; /* Data page for legacy filesystems */ + size_t data_size; + enum legacy_fs_param param_type; +}; + +static int legacy_init_fs_context(struct fs_context *fc); + +static const struct constant_table common_set_sb_flag[] = { + { "dirsync", SB_DIRSYNC }, + { "lazytime", SB_LAZYTIME }, + { "mand", SB_MANDLOCK }, + { "posixacl", SB_POSIXACL }, + { "ro", SB_RDONLY }, + { "sync", SB_SYNCHRONOUS }, +}; + +static const struct constant_table common_clear_sb_flag[] = { + { "async", SB_SYNCHRONOUS }, + { "nolazytime", SB_LAZYTIME }, + { "nomand", SB_MANDLOCK }, + { "rw", SB_RDONLY }, + { "silent", SB_SILENT }, +}; + +static const char *const forbidden_sb_flag[] = { + "bind", + "dev", + "exec", + "move", + "noatime", + "nodev", + "nodiratime", + "noexec", + "norelatime", + "nostrictatime", + "nosuid", + "private", + "rec", + "relatime", + "remount", + "shared", + "slave", + "strictatime", + "suid", + "unbindable", +}; + +/* + * Check for a common mount option that manipulates s_flags. + */ +static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) +{ + unsigned int token; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(forbidden_sb_flag); i++) + if (strcmp(key, forbidden_sb_flag[i]) == 0) + return -EINVAL; + + token = lookup_constant(common_set_sb_flag, key, 0); + if (token) { + fc->sb_flags |= token; + fc->sb_flags_mask |= token; + return 0; + } + + token = lookup_constant(common_clear_sb_flag, key, 0); + if (token) { + fc->sb_flags &= ~token; + fc->sb_flags_mask |= token; + return 0; + } + + return -ENOPARAM; +} + +/** + * vfs_parse_fs_param - Add a single parameter to a superblock config + * @fc: The filesystem context to modify + * @param: The parameter + * + * A single mount option in string form is applied to the filesystem context + * being set up. Certain standard options (for example "ro") are translated + * into flag bits without going to the filesystem. The active security module + * is allowed to observe and poach options. Any other options are passed over + * to the filesystem to parse. + * + * This may be called multiple times for a context. + * + * Returns 0 on success and a negative error code on failure. In the event of + * failure, supplementary error information may have been set. + */ +int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) +{ + int ret; + + if (!param->key) + return invalf(fc, "Unnamed parameter\n"); + + ret = vfs_parse_sb_flag(fc, param->key); + if (ret != -ENOPARAM) + return ret; + + ret = security_fs_context_parse_param(fc, param); + if (ret != -ENOPARAM) + /* Param belongs to the LSM or is disallowed by the LSM; so + * don't pass to the FS. + */ + return ret; + + if (fc->ops->parse_param) { + ret = fc->ops->parse_param(fc, param); + if (ret != -ENOPARAM) + return ret; + } + + /* If the filesystem doesn't take any arguments, give it the + * default handling of source. + */ + if (strcmp(param->key, "source") == 0) { + if (param->type != fs_value_is_string) + return invalf(fc, "VFS: Non-string source"); + if (fc->source) + return invalf(fc, "VFS: Multiple sources"); + fc->source = param->string; + param->string = NULL; + return 0; + } + + return invalf(fc, "%s: Unknown parameter '%s'", + fc->fs_type->name, param->key); +} +EXPORT_SYMBOL(vfs_parse_fs_param); + +/** + * vfs_parse_fs_string - Convenience function to just parse a string. + */ +int vfs_parse_fs_string(struct fs_context *fc, const char *key, + const char *value, size_t v_size) +{ + int ret; + + struct fs_parameter param = { + .key = key, + .type = fs_value_is_string, + .size = v_size, + }; + + if (v_size > 0) { + param.string = kmemdup_nul(value, v_size, GFP_KERNEL); + if (!param.string) + return -ENOMEM; + } + + ret = vfs_parse_fs_param(fc, ¶m); + kfree(param.string); + return ret; +} +EXPORT_SYMBOL(vfs_parse_fs_string); + +/** + * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data + * @ctx: The superblock configuration to fill in. + * @data: The data to parse + * + * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be + * called from the ->monolithic_mount_data() fs_context operation. + * + * Returns 0 on success or the error returned by the ->parse_option() fs_context + * operation on failure. + */ +int generic_parse_monolithic(struct fs_context *fc, void *data) +{ + char *options = data, *key; + int ret = 0; + + if (!options) + return 0; + + ret = security_sb_eat_lsm_opts(options, &fc->security); + if (ret) + return ret; + + while ((key = strsep(&options, ",")) != NULL) { + if (*key) { + size_t v_len = 0; + char *value = strchr(key, '='); + + if (value) { + if (value == key) + continue; + *value++ = 0; + v_len = strlen(value); + } + ret = vfs_parse_fs_string(fc, key, value, v_len); + if (ret < 0) + break; + } + } + + return ret; +} +EXPORT_SYMBOL(generic_parse_monolithic); + +/** + * alloc_fs_context - Create a filesystem context. + * @fs_type: The filesystem type. + * @reference: The dentry from which this one derives (or NULL) + * @sb_flags: Filesystem/superblock flags (SB_*) + * @sb_flags_mask: Applicable members of @sb_flags + * @purpose: The purpose that this configuration shall be used for. + * + * Open a filesystem and create a mount context. The mount context is + * initialised with the supplied flags and, if a submount/automount from + * another superblock (referred to by @reference) is supplied, may have + * parameters such as namespaces copied across from that superblock. + */ +static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, + struct dentry *reference, + unsigned int sb_flags, + unsigned int sb_flags_mask, + enum fs_context_purpose purpose) +{ + int (*init_fs_context)(struct fs_context *); + struct fs_context *fc; + int ret = -ENOMEM; + + fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); + if (!fc) + return ERR_PTR(-ENOMEM); + + fc->purpose = purpose; + fc->sb_flags = sb_flags; + fc->sb_flags_mask = sb_flags_mask; + fc->fs_type = get_filesystem(fs_type); + fc->cred = get_current_cred(); + fc->net_ns = get_net(current->nsproxy->net_ns); + + switch (purpose) { + case FS_CONTEXT_FOR_MOUNT: + fc->user_ns = get_user_ns(fc->cred->user_ns); + break; + case FS_CONTEXT_FOR_SUBMOUNT: + fc->user_ns = get_user_ns(reference->d_sb->s_user_ns); + break; + case FS_CONTEXT_FOR_RECONFIGURE: + /* We don't pin any namespaces as the superblock's + * subscriptions cannot be changed at this point. + */ + atomic_inc(&reference->d_sb->s_active); + fc->root = dget(reference); + break; + } + + /* TODO: Make all filesystems support this unconditionally */ + init_fs_context = fc->fs_type->init_fs_context; + if (!init_fs_context) + init_fs_context = legacy_init_fs_context; + + ret = init_fs_context(fc); + if (ret < 0) + goto err_fc; + fc->need_free = true; + return fc; + +err_fc: + put_fs_context(fc); + return ERR_PTR(ret); +} + +struct fs_context *fs_context_for_mount(struct file_system_type *fs_type, + unsigned int sb_flags) +{ + return alloc_fs_context(fs_type, NULL, sb_flags, 0, + FS_CONTEXT_FOR_MOUNT); +} +EXPORT_SYMBOL(fs_context_for_mount); + +struct fs_context *fs_context_for_reconfigure(struct dentry *dentry, + unsigned int sb_flags, + unsigned int sb_flags_mask) +{ + return alloc_fs_context(dentry->d_sb->s_type, dentry, sb_flags, + sb_flags_mask, FS_CONTEXT_FOR_RECONFIGURE); +} +EXPORT_SYMBOL(fs_context_for_reconfigure); + +struct fs_context *fs_context_for_submount(struct file_system_type *type, + struct dentry *reference) +{ + return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); +} +EXPORT_SYMBOL(fs_context_for_submount); + +void fc_drop_locked(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + dput(fc->root); + fc->root = NULL; + deactivate_locked_super(sb); +} + +static void legacy_fs_context_free(struct fs_context *fc); + +/** + * vfs_dup_fc_config: Duplicate a filesystem context. + * @src_fc: The context to copy. + */ +struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) +{ + struct fs_context *fc; + int ret; + + if (!src_fc->ops->dup) + return ERR_PTR(-EOPNOTSUPP); + + fc = kmemdup(src_fc, sizeof(struct fs_context), GFP_KERNEL); + if (!fc) + return ERR_PTR(-ENOMEM); + + fc->fs_private = NULL; + fc->s_fs_info = NULL; + fc->source = NULL; + fc->security = NULL; + get_filesystem(fc->fs_type); + get_net(fc->net_ns); + get_user_ns(fc->user_ns); + get_cred(fc->cred); + + /* Can't call put until we've called ->dup */ + ret = fc->ops->dup(fc, src_fc); + if (ret < 0) + goto err_fc; + + ret = security_fs_context_dup(fc, src_fc); + if (ret < 0) + goto err_fc; + return fc; + +err_fc: + put_fs_context(fc); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(vfs_dup_fs_context); + +#ifdef CONFIG_PRINTK +/** + * logfc - Log a message to a filesystem context + * @fc: The filesystem context to log to. + * @fmt: The format of the buffer. + */ +void logfc(struct fs_context *fc, const char *fmt, ...) +{ + va_list va; + + va_start(va, fmt); + + switch (fmt[0]) { + case 'w': + vprintk_emit(0, LOGLEVEL_WARNING, NULL, 0, fmt, va); + break; + case 'e': + vprintk_emit(0, LOGLEVEL_ERR, NULL, 0, fmt, va); + break; + default: + vprintk_emit(0, LOGLEVEL_NOTICE, NULL, 0, fmt, va); + break; + } + + pr_cont("\n"); + va_end(va); +} +EXPORT_SYMBOL(logfc); +#endif + +/** + * put_fs_context - Dispose of a superblock configuration context. + * @fc: The context to dispose of. + */ +void put_fs_context(struct fs_context *fc) +{ + struct super_block *sb; + + if (fc->root) { + sb = fc->root->d_sb; + dput(fc->root); + fc->root = NULL; + deactivate_super(sb); + } + + if (fc->need_free && fc->ops && fc->ops->free) + fc->ops->free(fc); + + security_free_mnt_opts(&fc->security); + put_net(fc->net_ns); + put_user_ns(fc->user_ns); + put_cred(fc->cred); + kfree(fc->subtype); + put_filesystem(fc->fs_type); + kfree(fc->source); + kfree(fc); +} +EXPORT_SYMBOL(put_fs_context); + +/* + * Free the config for a filesystem that doesn't support fs_context. + */ +static void legacy_fs_context_free(struct fs_context *fc) +{ + struct legacy_fs_context *ctx = fc->fs_private; + + if (ctx) { + if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) + kfree(ctx->legacy_data); + kfree(ctx); + } +} + +/* + * Duplicate a legacy config. + */ +static int legacy_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) +{ + struct legacy_fs_context *ctx; + struct legacy_fs_context *src_ctx = src_fc->fs_private; + + ctx = kmemdup(src_ctx, sizeof(*src_ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (ctx->param_type == LEGACY_FS_INDIVIDUAL_PARAMS) { + ctx->legacy_data = kmemdup(src_ctx->legacy_data, + src_ctx->data_size, GFP_KERNEL); + if (!ctx->legacy_data) { + kfree(ctx); + return -ENOMEM; + } + } + + fc->fs_private = ctx; + return 0; +} + +/* + * Add a parameter to a legacy config. We build up a comma-separated list of + * options. + */ +static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct legacy_fs_context *ctx = fc->fs_private; + unsigned int size = ctx->data_size; + size_t len = 0; + + if (strcmp(param->key, "source") == 0) { + if (param->type != fs_value_is_string) + return invalf(fc, "VFS: Legacy: Non-string source"); + if (fc->source) + return invalf(fc, "VFS: Legacy: Multiple sources"); + fc->source = param->string; + param->string = NULL; + return 0; + } + + if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) && + strcmp(param->key, "subtype") == 0) { + if (param->type != fs_value_is_string) + return invalf(fc, "VFS: Legacy: Non-string subtype"); + if (fc->subtype) + return invalf(fc, "VFS: Legacy: Multiple subtype"); + fc->subtype = param->string; + param->string = NULL; + return 0; + } + + if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) + return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); + + switch (param->type) { + case fs_value_is_string: + len = 1 + param->size; + /* Fall through */ + case fs_value_is_flag: + len += strlen(param->key); + break; + default: + return invalf(fc, "VFS: Legacy: Parameter type for '%s' not supported", + param->key); + } + + if (len > PAGE_SIZE - 2 - size) + return invalf(fc, "VFS: Legacy: Cumulative options too large"); + if (strchr(param->key, ',') || + (param->type == fs_value_is_string && + memchr(param->string, ',', param->size))) + return invalf(fc, "VFS: Legacy: Option '%s' contained comma", + param->key); + if (!ctx->legacy_data) { + ctx->legacy_data = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!ctx->legacy_data) + return -ENOMEM; + } + + ctx->legacy_data[size++] = ','; + len = strlen(param->key); + memcpy(ctx->legacy_data + size, param->key, len); + size += len; + if (param->type == fs_value_is_string) { + ctx->legacy_data[size++] = '='; + memcpy(ctx->legacy_data + size, param->string, param->size); + size += param->size; + } + ctx->legacy_data[size] = '\0'; + ctx->data_size = size; + ctx->param_type = LEGACY_FS_INDIVIDUAL_PARAMS; + return 0; +} + +/* + * Add monolithic mount data. + */ +static int legacy_parse_monolithic(struct fs_context *fc, void *data) +{ + struct legacy_fs_context *ctx = fc->fs_private; + + if (ctx->param_type != LEGACY_FS_UNSET_PARAMS) { + pr_warn("VFS: Can't mix monolithic and individual options\n"); + return -EINVAL; + } + + ctx->legacy_data = data; + ctx->param_type = LEGACY_FS_MONOLITHIC_PARAMS; + if (!ctx->legacy_data) + return 0; + + if (fc->fs_type->fs_flags & FS_BINARY_MOUNTDATA) + return 0; + return security_sb_eat_lsm_opts(ctx->legacy_data, &fc->security); +} + +/* + * Get a mountable root with the legacy mount command. + */ +static int legacy_get_tree(struct fs_context *fc) +{ + struct legacy_fs_context *ctx = fc->fs_private; + struct super_block *sb; + struct dentry *root; + + root = fc->fs_type->mount(fc->fs_type, fc->sb_flags, + fc->source, ctx->legacy_data); + if (IS_ERR(root)) + return PTR_ERR(root); + + sb = root->d_sb; + BUG_ON(!sb); + + fc->root = root; + return 0; +} + +/* + * Handle remount. + */ +static int legacy_reconfigure(struct fs_context *fc) +{ + struct legacy_fs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; + + if (!sb->s_op->remount_fs) + return 0; + + return sb->s_op->remount_fs(sb, &fc->sb_flags, + ctx ? ctx->legacy_data : NULL); +} + +const struct fs_context_operations legacy_fs_context_ops = { + .free = legacy_fs_context_free, + .dup = legacy_fs_context_dup, + .parse_param = legacy_parse_param, + .parse_monolithic = legacy_parse_monolithic, + .get_tree = legacy_get_tree, + .reconfigure = legacy_reconfigure, +}; + +/* + * Initialise a legacy context for a filesystem that doesn't support + * fs_context. + */ +static int legacy_init_fs_context(struct fs_context *fc) +{ + fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL); + if (!fc->fs_private) + return -ENOMEM; + fc->ops = &legacy_fs_context_ops; + return 0; +} + +int parse_monolithic_mount_data(struct fs_context *fc, void *data) +{ + int (*monolithic_mount_data)(struct fs_context *, void *); + + monolithic_mount_data = fc->ops->parse_monolithic; + if (!monolithic_mount_data) + monolithic_mount_data = generic_parse_monolithic; + + return monolithic_mount_data(fc, data); +} diff --git a/fs/fs_parser.c b/fs/fs_parser.c new file mode 100644 index 000000000000..842e8f749db6 --- /dev/null +++ b/fs/fs_parser.c @@ -0,0 +1,447 @@ +/* Filesystem parameter parser. + * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/export.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/slab.h> +#include <linux/security.h> +#include <linux/namei.h> +#include "internal.h" + +static const struct constant_table bool_names[] = { + { "0", false }, + { "1", true }, + { "false", false }, + { "no", false }, + { "true", true }, + { "yes", true }, +}; + +/** + * lookup_constant - Look up a constant by name in an ordered table + * @tbl: The table of constants to search. + * @tbl_size: The size of the table. + * @name: The name to look up. + * @not_found: The value to return if the name is not found. + */ +int __lookup_constant(const struct constant_table *tbl, size_t tbl_size, + const char *name, int not_found) +{ + unsigned int i; + + for (i = 0; i < tbl_size; i++) + if (strcmp(name, tbl[i].name) == 0) + return tbl[i].value; + + return not_found; +} +EXPORT_SYMBOL(__lookup_constant); + +static const struct fs_parameter_spec *fs_lookup_key( + const struct fs_parameter_description *desc, + const char *name) +{ + const struct fs_parameter_spec *p; + + if (!desc->specs) + return NULL; + + for (p = desc->specs; p->name; p++) + if (strcmp(p->name, name) == 0) + return p; + + return NULL; +} + +/* + * fs_parse - Parse a filesystem configuration parameter + * @fc: The filesystem context to log errors through. + * @desc: The parameter description to use. + * @param: The parameter. + * @result: Where to place the result of the parse + * + * Parse a filesystem configuration parameter and attempt a conversion for a + * simple parameter for which this is requested. If successful, the determined + * parameter ID is placed into @result->key, the desired type is indicated in + * @result->t and any converted value is placed into an appropriate member of + * the union in @result. + * + * The function returns the parameter number if the parameter was matched, + * -ENOPARAM if it wasn't matched and @desc->ignore_unknown indicated that + * unknown parameters are okay and -EINVAL if there was a conversion issue or + * the parameter wasn't recognised and unknowns aren't okay. + */ +int fs_parse(struct fs_context *fc, + const struct fs_parameter_description *desc, + struct fs_parameter *param, + struct fs_parse_result *result) +{ + const struct fs_parameter_spec *p; + const struct fs_parameter_enum *e; + int ret = -ENOPARAM, b; + + result->has_value = !!param->string; + result->negated = false; + result->uint_64 = 0; + + p = fs_lookup_key(desc, param->key); + if (!p) { + /* If we didn't find something that looks like "noxxx", see if + * "xxx" takes the "no"-form negative - but only if there + * wasn't an value. + */ + if (result->has_value) + goto unknown_parameter; + if (param->key[0] != 'n' || param->key[1] != 'o' || !param->key[2]) + goto unknown_parameter; + + p = fs_lookup_key(desc, param->key + 2); + if (!p) + goto unknown_parameter; + if (!(p->flags & fs_param_neg_with_no)) + goto unknown_parameter; + result->boolean = false; + result->negated = true; + } + + if (p->flags & fs_param_deprecated) + warnf(fc, "%s: Deprecated parameter '%s'", + desc->name, param->key); + + if (result->negated) + goto okay; + + /* Certain parameter types only take a string and convert it. */ + switch (p->type) { + case __fs_param_wasnt_defined: + return -EINVAL; + case fs_param_is_u32: + case fs_param_is_u32_octal: + case fs_param_is_u32_hex: + case fs_param_is_s32: + case fs_param_is_u64: + case fs_param_is_enum: + case fs_param_is_string: + if (param->type != fs_value_is_string) + goto bad_value; + if (!result->has_value) { + if (p->flags & fs_param_v_optional) + goto okay; + goto bad_value; + } + /* Fall through */ + default: + break; + } + + /* Try to turn the type we were given into the type desired by the + * parameter and give an error if we can't. + */ + switch (p->type) { + case fs_param_is_flag: + if (param->type != fs_value_is_flag && + (param->type != fs_value_is_string || result->has_value)) + return invalf(fc, "%s: Unexpected value for '%s'", + desc->name, param->key); + result->boolean = true; + goto okay; + + case fs_param_is_bool: + switch (param->type) { + case fs_value_is_flag: + result->boolean = true; + goto okay; + case fs_value_is_string: + if (param->size == 0) { + result->boolean = true; + goto okay; + } + b = lookup_constant(bool_names, param->string, -1); + if (b == -1) + goto bad_value; + result->boolean = b; + goto okay; + default: + goto bad_value; + } + + case fs_param_is_u32: + ret = kstrtouint(param->string, 0, &result->uint_32); + goto maybe_okay; + case fs_param_is_u32_octal: + ret = kstrtouint(param->string, 8, &result->uint_32); + goto maybe_okay; + case fs_param_is_u32_hex: + ret = kstrtouint(param->string, 16, &result->uint_32); + goto maybe_okay; + case fs_param_is_s32: + ret = kstrtoint(param->string, 0, &result->int_32); + goto maybe_okay; + case fs_param_is_u64: + ret = kstrtoull(param->string, 0, &result->uint_64); + goto maybe_okay; + + case fs_param_is_enum: + for (e = desc->enums; e->name[0]; e++) { + if (e->opt == p->opt && + strcmp(e->name, param->string) == 0) { + result->uint_32 = e->value; + goto okay; + } + } + goto bad_value; + + case fs_param_is_string: + goto okay; + case fs_param_is_blob: + if (param->type != fs_value_is_blob) + goto bad_value; + goto okay; + + case fs_param_is_fd: { + if (param->type != fs_value_is_file) + goto bad_value; + goto okay; + } + + case fs_param_is_blockdev: + case fs_param_is_path: + goto okay; + default: + BUG(); + } + +maybe_okay: + if (ret < 0) + goto bad_value; +okay: + return p->opt; + +bad_value: + return invalf(fc, "%s: Bad value for '%s'", desc->name, param->key); +unknown_parameter: + return -ENOPARAM; +} +EXPORT_SYMBOL(fs_parse); + +/** + * fs_lookup_param - Look up a path referred to by a parameter + * @fc: The filesystem context to log errors through. + * @param: The parameter. + * @want_bdev: T if want a blockdev + * @_path: The result of the lookup + */ +int fs_lookup_param(struct fs_context *fc, + struct fs_parameter *param, + bool want_bdev, + struct path *_path) +{ + struct filename *f; + unsigned int flags = 0; + bool put_f; + int ret; + + switch (param->type) { + case fs_value_is_string: + f = getname_kernel(param->string); + if (IS_ERR(f)) + return PTR_ERR(f); + put_f = true; + break; + case fs_value_is_filename_empty: + flags = LOOKUP_EMPTY; + /* Fall through */ + case fs_value_is_filename: + f = param->name; + put_f = false; + break; + default: + return invalf(fc, "%s: not usable as path", param->key); + } + + ret = filename_lookup(param->dirfd, f, flags, _path, NULL); + if (ret < 0) { + errorf(fc, "%s: Lookup failure for '%s'", param->key, f->name); + goto out; + } + + if (want_bdev && + !S_ISBLK(d_backing_inode(_path->dentry)->i_mode)) { + path_put(_path); + _path->dentry = NULL; + _path->mnt = NULL; + errorf(fc, "%s: Non-blockdev passed as '%s'", + param->key, f->name); + ret = -ENOTBLK; + } + +out: + if (put_f) + putname(f); + return ret; +} +EXPORT_SYMBOL(fs_lookup_param); + +#ifdef CONFIG_VALIDATE_FS_PARSER +/** + * validate_constant_table - Validate a constant table + * @name: Name to use in reporting + * @tbl: The constant table to validate. + * @tbl_size: The size of the table. + * @low: The lowest permissible value. + * @high: The highest permissible value. + * @special: One special permissible value outside of the range. + */ +bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, + int low, int high, int special) +{ + size_t i; + bool good = true; + + if (tbl_size == 0) { + pr_warn("VALIDATE C-TBL: Empty\n"); + return true; + } + + for (i = 0; i < tbl_size; i++) { + if (!tbl[i].name) { + pr_err("VALIDATE C-TBL[%zu]: Null\n", i); + good = false; + } else if (i > 0 && tbl[i - 1].name) { + int c = strcmp(tbl[i-1].name, tbl[i].name); + + if (c == 0) { + pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n", + i, tbl[i].name); + good = false; + } + if (c > 0) { + pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n", + i, tbl[i-1].name, tbl[i].name); + good = false; + } + } + + if (tbl[i].value != special && + (tbl[i].value < low || tbl[i].value > high)) { + pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n", + i, tbl[i].name, tbl[i].value, low, high); + good = false; + } + } + + return good; +} + +/** + * fs_validate_description - Validate a parameter description + * @desc: The parameter description to validate. + */ +bool fs_validate_description(const struct fs_parameter_description *desc) +{ + const struct fs_parameter_spec *param, *p2; + const struct fs_parameter_enum *e; + const char *name = desc->name; + unsigned int nr_params = 0; + bool good = true, enums = false; + + pr_notice("*** VALIDATE %s ***\n", name); + + if (!name[0]) { + pr_err("VALIDATE Parser: No name\n"); + name = "Unknown"; + good = false; + } + + if (desc->specs) { + for (param = desc->specs; param->name; param++) { + enum fs_parameter_type t = param->type; + + /* Check that the type is in range */ + if (t == __fs_param_wasnt_defined || + t >= nr__fs_parameter_type) { + pr_err("VALIDATE %s: PARAM[%s] Bad type %u\n", + name, param->name, t); + good = false; + } else if (t == fs_param_is_enum) { + enums = true; + } + + /* Check for duplicate parameter names */ + for (p2 = desc->specs; p2 < param; p2++) { + if (strcmp(param->name, p2->name) == 0) { + pr_err("VALIDATE %s: PARAM[%s]: Duplicate\n", + name, param->name); + good = false; + } + } + } + + nr_params = param - desc->specs; + } + + if (desc->enums) { + if (!nr_params) { + pr_err("VALIDATE %s: Enum table but no parameters\n", + name); + good = false; + goto no_enums; + } + if (!enums) { + pr_err("VALIDATE %s: Enum table but no enum-type values\n", + name); + good = false; + goto no_enums; + } + + for (e = desc->enums; e->name[0]; e++) { + /* Check that all entries in the enum table have at + * least one parameter that uses them. + */ + for (param = desc->specs; param->name; param++) { + if (param->opt == e->opt && + param->type != fs_param_is_enum) { + pr_err("VALIDATE %s: e[%lu] enum val for %s\n", + name, e - desc->enums, param->name); + good = false; + } + } + } + + /* Check that all enum-type parameters have at least one enum + * value in the enum table. + */ + for (param = desc->specs; param->name; param++) { + if (param->type != fs_param_is_enum) + continue; + for (e = desc->enums; e->name[0]; e++) + if (e->opt == param->opt) + break; + if (!e->name[0]) { + pr_err("VALIDATE %s: PARAM[%s] enum with no values\n", + name, param->name); + good = false; + } + } + } else { + if (enums) { + pr_err("VALIDATE %s: enum-type values, but no enum table\n", + name); + good = false; + goto no_enums; + } + } + +no_enums: + return good; +} +#endif /* CONFIG_VALIDATE_FS_PARSER */ diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 989df5accaee..fe80bea4ad89 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -35,7 +35,9 @@ static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { - fuse_abort_conn(fc, true); + if (fc->abort_err) + fc->aborted = true; + fuse_abort_conn(fc); fuse_conn_put(fc); } return count; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 8f68181256c0..55a26f351467 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -141,10 +141,11 @@ static int cuse_open(struct inode *inode, struct file *file) static int cuse_release(struct inode *inode, struct file *file) { + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - fuse_sync_release(ff, file->f_flags); + fuse_sync_release(fi, ff, file->f_flags); fuse_conn_put(fc); return 0; @@ -407,7 +408,7 @@ err_unlock: err_region: unregister_chrdev_region(devt, 1); err: - fuse_abort_conn(fc, false); + fuse_abort_conn(fc); goto out; } @@ -586,7 +587,7 @@ static ssize_t cuse_class_abort_store(struct device *dev, { struct cuse_conn *cc = dev_get_drvdata(dev); - fuse_abort_conn(&cc->fc, false); + fuse_abort_conn(&cc->fc); return count; } static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 809c0f2f9942..8a63e52785e9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -251,17 +251,18 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc, struct file *file) { struct fuse_req *req = NULL; + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); struct fuse_file *ff = file->private_data; do { wait_event(fc->reserved_req_waitq, ff->reserved_req); - spin_lock(&fc->lock); + spin_lock(&fi->lock); if (ff->reserved_req) { req = ff->reserved_req; ff->reserved_req = NULL; req->stolen_file = get_file(file); } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } while (!req); return req; @@ -273,16 +274,17 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc, static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) { struct file *file = req->stolen_file; + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); struct fuse_file *ff = file->private_data; WARN_ON(req->max_pages); - spin_lock(&fc->lock); + spin_lock(&fi->lock); memset(req, 0, sizeof(*req)); fuse_request_init(req, NULL, NULL, 0); BUG_ON(ff->reserved_req); ff->reserved_req = req; wake_up_all(&fc->reserved_req_waitq); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fput(file); } @@ -431,10 +433,16 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; - - spin_lock(&fiq->waitq.lock); - list_del_init(&req->intr_entry); - spin_unlock(&fiq->waitq.lock); + /* + * test_and_set_bit() implies smp_mb() between bit + * changing and below intr_entry check. Pairs with + * smp_mb() from queue_interrupt(). + */ + if (!list_empty(&req->intr_entry)) { + spin_lock(&fiq->waitq.lock); + list_del_init(&req->intr_entry); + spin_unlock(&fiq->waitq.lock); + } WARN_ON(test_bit(FR_PENDING, &req->flags)); WARN_ON(test_bit(FR_SENT, &req->flags)); if (test_bit(FR_BACKGROUND, &req->flags)) { @@ -462,27 +470,43 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) fc->active_background--; flush_bg_queue(fc); spin_unlock(&fc->bg_lock); + } else { + /* Wake up waiter sleeping in request_wait_answer() */ + wake_up(&req->waitq); } - wake_up(&req->waitq); + if (req->end) req->end(fc, req); put_request: fuse_put_request(fc, req); } -static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->waitq.lock); - if (test_bit(FR_FINISHED, &req->flags)) { + /* Check for we've sent request to interrupt this req */ + if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { spin_unlock(&fiq->waitq.lock); - return; + return -EINVAL; } + if (list_empty(&req->intr_entry)) { list_add_tail(&req->intr_entry, &fiq->interrupts); + /* + * Pairs with smp_mb() implied by test_and_set_bit() + * from request_end(). + */ + smp_mb(); + if (test_bit(FR_FINISHED, &req->flags)) { + list_del_init(&req->intr_entry); + spin_unlock(&fiq->waitq.lock); + return 0; + } wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } spin_unlock(&fiq->waitq.lock); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + return 0; } static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) @@ -1306,7 +1330,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, goto err_unlock; if (!fiq->connected) { - err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; + err = fc->aborted ? -ECONNABORTED : -ENODEV; goto err_unlock; } @@ -1353,7 +1377,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); if (!fpq->connected) { - err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; + err = fc->aborted ? -ECONNABORTED : -ENODEV; goto out_end; } if (err) { @@ -1900,16 +1924,17 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_req *req; struct fuse_out_header oh; + err = -EINVAL; if (nbytes < sizeof(struct fuse_out_header)) - return -EINVAL; + goto out; err = fuse_copy_one(cs, &oh, sizeof(oh)); if (err) - goto err_finish; + goto copy_finish; err = -EINVAL; if (oh.len != nbytes) - goto err_finish; + goto copy_finish; /* * Zero oh.unique indicates unsolicited notification message @@ -1917,41 +1942,40 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, */ if (!oh.unique) { err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); - return err ? err : nbytes; + goto out; } err = -EINVAL; if (oh.error <= -1000 || oh.error > 0) - goto err_finish; + goto copy_finish; spin_lock(&fpq->lock); - err = -ENOENT; - if (!fpq->connected) - goto err_unlock_pq; + req = NULL; + if (fpq->connected) + req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); - req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); - if (!req) - goto err_unlock_pq; + err = -ENOENT; + if (!req) { + spin_unlock(&fpq->lock); + goto copy_finish; + } /* Is it an interrupt reply ID? */ if (oh.unique & FUSE_INT_REQ_BIT) { __fuse_get_request(req); spin_unlock(&fpq->lock); - err = -EINVAL; - if (nbytes != sizeof(struct fuse_out_header)) { - fuse_put_request(fc, req); - goto err_finish; - } - - if (oh.error == -ENOSYS) + err = 0; + if (nbytes != sizeof(struct fuse_out_header)) + err = -EINVAL; + else if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) - queue_interrupt(&fc->iq, req); + err = queue_interrupt(&fc->iq, req); + fuse_put_request(fc, req); - fuse_copy_finish(cs); - return nbytes; + goto copy_finish; } clear_bit(FR_SENT, &req->flags); @@ -1977,14 +2001,12 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, spin_unlock(&fpq->lock); request_end(fc, req); - +out: return err ? err : nbytes; - err_unlock_pq: - spin_unlock(&fpq->lock); - err_finish: +copy_finish: fuse_copy_finish(cs); - return err; + goto out; } static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) @@ -2109,11 +2131,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) return mask; } -/* - * Abort all requests on the given list (pending or processing) - * - * This function releases and reacquires fc->lock - */ +/* Abort all requests on the given list (pending or processing) */ static void end_requests(struct fuse_conn *fc, struct list_head *head) { while (!list_empty(head)) { @@ -2159,7 +2177,7 @@ static void end_polls(struct fuse_conn *fc) * is OK, the request will in that case be removed from the list before we touch * it. */ -void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) +void fuse_abort_conn(struct fuse_conn *fc) { struct fuse_iqueue *fiq = &fc->iq; @@ -2175,7 +2193,6 @@ void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) fc->connected = 0; spin_unlock(&fc->bg_lock); - fc->aborted = is_abort; fuse_set_initialized(fc); list_for_each_entry(fud, &fc->devices, entry) { struct fuse_pqueue *fpq = &fud->pq; @@ -2253,7 +2270,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { WARN_ON(fc->iq.fasync != NULL); - fuse_abort_conn(fc, false); + fuse_abort_conn(fc); } fuse_dev_free(fud); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index e909678afa2d..dd0f64f7bc06 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -149,21 +149,6 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, args->out.args[0].value = outarg; } -u64 fuse_get_attr_version(struct fuse_conn *fc) -{ - u64 curr_version; - - /* - * The spin lock isn't actually needed on 64bit archs, but we - * don't yet care too much about such optimizations. - */ - spin_lock(&fc->lock); - curr_version = fc->attr_version; - spin_unlock(&fc->lock); - - return curr_version; -} - /* * Check whether the dentry is still valid * @@ -222,9 +207,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) fuse_queue_forget(fc, forget, outarg.nodeid, 1); goto invalid; } - spin_lock(&fc->lock); + spin_lock(&fi->lock); fi->nlookup++; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } kfree(forget); if (ret == -ENOMEM) @@ -400,6 +385,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_create_in inarg; struct fuse_open_out outopen; struct fuse_entry_out outentry; + struct fuse_inode *fi; struct fuse_file *ff; /* Userspace expects S_IFREG in create mode */ @@ -451,7 +437,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, &outentry.attr, entry_attr_timeout(&outentry), 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); - fuse_sync_release(ff, flags); + fuse_sync_release(NULL, ff, flags); fuse_queue_forget(fc, forget, outentry.nodeid, 1); err = -ENOMEM; goto out_err; @@ -462,7 +448,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, fuse_dir_changed(dir); err = finish_open(file, entry, generic_file_open); if (err) { - fuse_sync_release(ff, flags); + fi = get_fuse_inode(inode); + fuse_sync_release(fi, ff, flags); } else { file->private_data = ff; fuse_finish_open(inode, file); @@ -671,8 +658,8 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) struct inode *inode = d_inode(entry); struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); /* * If i_nlink == 0 then unlink doesn't make sense, yet this can * happen if userspace filesystem is careless. It would be @@ -681,7 +668,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) */ if (inode->i_nlink > 0) drop_nlink(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_invalidate_attr(inode); fuse_dir_changed(dir); fuse_invalidate_entry_cache(entry); @@ -825,10 +812,10 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, if (!err) { struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); inc_nlink(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_invalidate_attr(inode); fuse_update_ctime(inode); } else if (err == -EINTR) { @@ -1356,15 +1343,14 @@ static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, */ void fuse_set_nowrite(struct inode *inode) { - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); BUG_ON(!inode_is_locked(inode)); - spin_lock(&fc->lock); + spin_lock(&fi->lock); BUG_ON(fi->writectr < 0); fi->writectr += FUSE_NOWRITE; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE); } @@ -1385,11 +1371,11 @@ static void __fuse_release_nowrite(struct inode *inode) void fuse_release_nowrite(struct inode *inode) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); + spin_lock(&fi->lock); __fuse_release_nowrite(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, @@ -1524,7 +1510,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, goto error; } - spin_lock(&fc->lock); + spin_lock(&fi->lock); /* the kernel maintains i_mtime locally */ if (trust_local_cmtime) { if (attr->ia_valid & ATTR_MTIME) @@ -1542,10 +1528,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, i_size_write(inode, outarg.attr.size); if (is_truncate) { - /* NOTE: this may release/reacquire fc->lock */ + /* NOTE: this may release/reacquire fi->lock */ __fuse_release_nowrite(inode); } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); /* * Only call invalidate_inode_pages2() after removing diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a59c16bd90ac..06096b60f1df 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,8 +19,6 @@ #include <linux/falloc.h> #include <linux/uio.h> -static const struct file_operations fuse_direct_io_file_operations; - static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, int opcode, struct fuse_open_out *outargp) { @@ -64,9 +62,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); - spin_lock(&fc->lock); - ff->kh = ++fc->khctr; - spin_unlock(&fc->lock); + ff->kh = atomic64_inc_return(&fc->khctr); return ff; } @@ -94,7 +90,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) if (refcount_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - if (ff->fc->no_open && !isdir) { + if (isdir ? ff->fc->no_opendir : ff->fc->no_open) { /* * Drop the release request when client does not * implement 'open' @@ -128,8 +124,9 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, return -ENOMEM; ff->fh = 0; - ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */ - if (!fc->no_open || isdir) { + /* Default for no-open */ + ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); + if (isdir ? !fc->no_opendir : !fc->no_open) { struct fuse_open_out outarg; int err; @@ -138,11 +135,14 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, ff->fh = outarg.fh; ff->open_flags = outarg.open_flags; - } else if (err != -ENOSYS || isdir) { + } else if (err != -ENOSYS) { fuse_file_free(ff); return err; } else { - fc->no_open = 1; + if (isdir) + fc->no_opendir = 1; + else + fc->no_open = 1; } } @@ -159,17 +159,16 @@ EXPORT_SYMBOL_GPL(fuse_do_open); static void fuse_link_write_file(struct file *file) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff = file->private_data; /* * file may be written through mmap, so chain it onto the * inodes's write_file list */ - spin_lock(&fc->lock); + spin_lock(&fi->lock); if (list_empty(&ff->write_entry)) list_add(&ff->write_entry, &fi->write_files); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } void fuse_finish_open(struct inode *inode, struct file *file) @@ -177,8 +176,6 @@ void fuse_finish_open(struct inode *inode, struct file *file) struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); - if (ff->open_flags & FOPEN_DIRECT_IO) - file->f_op = &fuse_direct_io_file_operations; if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); if (ff->open_flags & FOPEN_NONSEEKABLE) @@ -186,10 +183,10 @@ void fuse_finish_open(struct inode *inode, struct file *file) if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, 0); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_invalidate_attr(inode); if (fc->writeback_cache) file_update_time(file); @@ -224,14 +221,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) return err; } -static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) +static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, + int flags, int opcode) { struct fuse_conn *fc = ff->fc; struct fuse_req *req = ff->reserved_req; struct fuse_release_in *inarg = &req->misc.release.in; + /* Inode is NULL on error path of fuse_create_open() */ + if (likely(fi)) { + spin_lock(&fi->lock); + list_del(&ff->write_entry); + spin_unlock(&fi->lock); + } spin_lock(&fc->lock); - list_del(&ff->write_entry); if (!RB_EMPTY_NODE(&ff->polled_node)) rb_erase(&ff->polled_node, &fc->polled_files); spin_unlock(&fc->lock); @@ -249,11 +252,12 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) void fuse_release_common(struct file *file, bool isdir) { + struct fuse_inode *fi = get_fuse_inode(file_inode(file)); struct fuse_file *ff = file->private_data; struct fuse_req *req = ff->reserved_req; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; - fuse_prepare_release(ff, file->f_flags, opcode); + fuse_prepare_release(fi, ff, file->f_flags, opcode); if (ff->flock) { struct fuse_release_in *inarg = &req->misc.release.in; @@ -295,10 +299,10 @@ static int fuse_release(struct inode *inode, struct file *file) return 0; } -void fuse_sync_release(struct fuse_file *ff, int flags) +void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags) { WARN_ON(refcount_read(&ff->count) > 1); - fuse_prepare_release(ff, flags, FUSE_RELEASE); + fuse_prepare_release(fi, ff, flags, FUSE_RELEASE); /* * iput(NULL) is a no-op and since the refcount is 1 and everything's * synchronous, we are fine with not doing igrab() here" @@ -329,33 +333,39 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) return (u64) v0 + ((u64) v1 << 32); } -/* - * Check if any page in a range is under writeback - * - * This is currently done by walking the list of writepage requests - * for the inode, which can be pretty inefficient. - */ -static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, - pgoff_t idx_to) +static struct fuse_req *fuse_find_writeback(struct fuse_inode *fi, + pgoff_t idx_from, pgoff_t idx_to) { - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_req *req; - bool found = false; - spin_lock(&fc->lock); list_for_each_entry(req, &fi->writepages, writepages_entry) { pgoff_t curr_index; - BUG_ON(req->inode != inode); + WARN_ON(get_fuse_inode(req->inode) != fi); curr_index = req->misc.write.in.offset >> PAGE_SHIFT; if (idx_from < curr_index + req->num_pages && curr_index <= idx_to) { - found = true; - break; + return req; } } - spin_unlock(&fc->lock); + return NULL; +} + +/* + * Check if any page in a range is under writeback + * + * This is currently done by walking the list of writepage requests + * for the inode, which can be pretty inefficient. + */ +static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, + pgoff_t idx_to) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + bool found; + + spin_lock(&fi->lock); + found = fuse_find_writeback(fi, idx_from, idx_to); + spin_unlock(&fi->lock); return found; } @@ -598,9 +608,9 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; - spin_unlock(&fc->lock); + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + spin_unlock(&fi->lock); } io->iocb->ki_complete(io->iocb, res, 0); @@ -675,13 +685,13 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); + spin_lock(&fi->lock); if (attr_ver == fi->attr_version && size < inode->i_size && !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { - fi->attr_version = ++fc->attr_version; + fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, size); } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); } static void fuse_short_read(struct fuse_req *req, struct inode *inode, @@ -919,7 +929,7 @@ out: return err; } -static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = iocb->ki_filp->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); @@ -996,13 +1006,13 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos) struct fuse_inode *fi = get_fuse_inode(inode); bool ret = false; - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); if (pos > inode->i_size) { i_size_write(inode, pos); ret = true; } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); return ret; } @@ -1125,9 +1135,6 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, int err = 0; ssize_t res = 0; - if (is_bad_inode(inode)) - return -EIO; - if (inode->i_size < pos + iov_iter_count(ii)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); @@ -1173,7 +1180,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, return res > 0 ? res : err; } -static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -1416,9 +1423,6 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, ssize_t res; struct inode *inode = file_inode(io->iocb->ki_filp); - if (is_bad_inode(inode)) - return -EIO; - res = fuse_direct_io(io, iter, ppos, 0); fuse_invalidate_atime(inode); @@ -1426,10 +1430,21 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, return res; } +static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter); + static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); - return __fuse_direct_read(&io, to, &iocb->ki_pos); + ssize_t res; + + if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + res = fuse_direct_IO(iocb, to); + } else { + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + + res = __fuse_direct_read(&io, to, &iocb->ki_pos); + } + + return res; } static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -1438,14 +1453,17 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; - if (is_bad_inode(inode)) - return -EIO; - /* Don't allow parallel writes to the same file */ inode_lock(inode); res = generic_write_checks(iocb, from); - if (res > 0) - res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + if (res > 0) { + if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { + res = fuse_direct_IO(iocb, from); + } else { + res = fuse_direct_io(&io, from, &iocb->ki_pos, + FUSE_DIO_WRITE); + } + } fuse_invalidate_attr(inode); if (res > 0) fuse_write_update_size(inode, iocb->ki_pos); @@ -1454,6 +1472,34 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) return res; } +static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + + if (is_bad_inode(file_inode(file))) + return -EIO; + + if (!(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_cache_read_iter(iocb, to); + else + return fuse_direct_read_iter(iocb, to); +} + +static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + + if (is_bad_inode(file_inode(file))) + return -EIO; + + if (!(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_cache_write_iter(iocb, from); + else + return fuse_direct_write_iter(iocb, from); +} + static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { int i; @@ -1481,20 +1527,18 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) wake_up(&fi->page_waitq); } -/* Called under fc->lock, may release and reacquire it */ +/* Called under fi->lock, may release and reacquire it */ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req, loff_t size) -__releases(fc->lock) -__acquires(fc->lock) +__releases(fi->lock) +__acquires(fi->lock) { + struct fuse_req *aux, *next; struct fuse_inode *fi = get_fuse_inode(req->inode); struct fuse_write_in *inarg = &req->misc.write.in; __u64 data_size = req->num_pages * PAGE_SIZE; bool queued; - if (!fc->connected) - goto out_free; - if (inarg->offset + data_size <= size) { inarg->size = data_size; } else if (inarg->offset < size) { @@ -1505,28 +1549,40 @@ __acquires(fc->lock) } req->in.args[1].size = inarg->size; - fi->writectr++; queued = fuse_request_queue_background(fc, req); - WARN_ON(!queued); + /* Fails on broken connection only */ + if (unlikely(!queued)) + goto out_free; + + fi->writectr++; return; out_free: fuse_writepage_finish(fc, req); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); + + /* After fuse_writepage_finish() aux request list is private */ + for (aux = req->misc.write.next; aux; aux = next) { + next = aux->misc.write.next; + aux->misc.write.next = NULL; + fuse_writepage_free(fc, aux); + fuse_put_request(fc, aux); + } + fuse_writepage_free(fc, req); fuse_put_request(fc, req); - spin_lock(&fc->lock); + spin_lock(&fi->lock); } /* * If fi->writectr is positive (no truncate or fsync going on) send * all queued writepage requests. * - * Called with fc->lock + * Called with fi->lock */ void fuse_flush_writepages(struct inode *inode) -__releases(fc->lock) -__acquires(fc->lock) +__releases(fi->lock) +__acquires(fi->lock) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -1546,7 +1602,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) struct fuse_inode *fi = get_fuse_inode(inode); mapping_set_error(inode->i_mapping, req->out.h.error); - spin_lock(&fc->lock); + spin_lock(&fi->lock); while (req->misc.write.next) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_write_in *inarg = &req->misc.write.in; @@ -1583,7 +1639,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) } fi->writectr--; fuse_writepage_finish(fc, req); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_writepage_free(fc, req); } @@ -1592,13 +1648,13 @@ static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, { struct fuse_file *ff = NULL; - spin_lock(&fc->lock); + spin_lock(&fi->lock); if (!list_empty(&fi->write_files)) { ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); fuse_file_get(ff); } - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); return ff; } @@ -1669,11 +1725,11 @@ static int fuse_writepage_locked(struct page *page) inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); - spin_lock(&fc->lock); + spin_lock(&fi->lock); list_add(&req->writepages_entry, &fi->writepages); list_add_tail(&req->list, &fi->queued_writes); fuse_flush_writepages(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); end_page_writeback(page); @@ -1722,21 +1778,27 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) { struct fuse_req *req = data->req; struct inode *inode = data->inode; - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); int num_pages = req->num_pages; int i; req->ff = fuse_file_get(data->ff); - spin_lock(&fc->lock); + spin_lock(&fi->lock); list_add_tail(&req->list, &fi->queued_writes); fuse_flush_writepages(inode); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); for (i = 0; i < num_pages; i++) end_page_writeback(data->orig_pages[i]); } +/* + * First recheck under fi->lock if the offending offset is still under + * writeback. If yes, then iterate auxiliary write requests, to see if there's + * one already added for a page at this offset. If there's none, then insert + * this new request onto the auxiliary list, otherwise reuse the existing one by + * copying the new page contents over to the old temporary page. + */ static bool fuse_writepage_in_flight(struct fuse_req *new_req, struct page *page) { @@ -1744,57 +1806,50 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, struct fuse_inode *fi = get_fuse_inode(new_req->inode); struct fuse_req *tmp; struct fuse_req *old_req; - bool found = false; - pgoff_t curr_index; - BUG_ON(new_req->num_pages != 0); + WARN_ON(new_req->num_pages != 0); - spin_lock(&fc->lock); + spin_lock(&fi->lock); list_del(&new_req->writepages_entry); - list_for_each_entry(old_req, &fi->writepages, writepages_entry) { - BUG_ON(old_req->inode != new_req->inode); - curr_index = old_req->misc.write.in.offset >> PAGE_SHIFT; - if (curr_index <= page->index && - page->index < curr_index + old_req->num_pages) { - found = true; - break; - } - } - if (!found) { + old_req = fuse_find_writeback(fi, page->index, page->index); + if (!old_req) { list_add(&new_req->writepages_entry, &fi->writepages); - goto out_unlock; + spin_unlock(&fi->lock); + return false; } new_req->num_pages = 1; - for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) { - BUG_ON(tmp->inode != new_req->inode); + for (tmp = old_req->misc.write.next; tmp; tmp = tmp->misc.write.next) { + pgoff_t curr_index; + + WARN_ON(tmp->inode != new_req->inode); curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT; - if (tmp->num_pages == 1 && - curr_index == page->index) { - old_req = tmp; + if (curr_index == page->index) { + WARN_ON(tmp->num_pages != 1); + WARN_ON(!test_bit(FR_PENDING, &tmp->flags)); + swap(tmp->pages[0], new_req->pages[0]); + break; } } - if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) { - struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host); + if (!tmp) { + new_req->misc.write.next = old_req->misc.write.next; + old_req->misc.write.next = new_req; + } + + spin_unlock(&fi->lock); - copy_highpage(old_req->pages[0], page); - spin_unlock(&fc->lock); + if (tmp) { + struct backing_dev_info *bdi = inode_to_bdi(new_req->inode); dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); - goto out; - } else { - new_req->misc.write.next = old_req->misc.write.next; - old_req->misc.write.next = new_req; } -out_unlock: - spin_unlock(&fc->lock); -out: - return found; + + return true; } static int fuse_writepages_fill(struct page *page, @@ -1803,6 +1858,7 @@ static int fuse_writepages_fill(struct page *page, struct fuse_fill_wb_data *data = _data; struct fuse_req *req = data->req; struct inode *inode = data->inode; + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); struct page *tmp_page; bool is_writeback; @@ -1873,9 +1929,9 @@ static int fuse_writepages_fill(struct page *page, req->end = fuse_writepage_end; req->inode = inode; - spin_lock(&fc->lock); + spin_lock(&fi->lock); list_add(&req->writepages_entry, &fi->writepages); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); data->req = req; } @@ -1898,12 +1954,12 @@ static int fuse_writepages_fill(struct page *page, data->orig_pages[req->num_pages] = page; /* - * Protected by fc->lock against concurrent access by + * Protected by fi->lock against concurrent access by * fuse_page_is_writeback(). */ - spin_lock(&fc->lock); + spin_lock(&fi->lock); req->num_pages++; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); out_unlock: unlock_page(page); @@ -2087,6 +2143,18 @@ static const struct vm_operations_struct fuse_file_vm_ops = { static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { + struct fuse_file *ff = file->private_data; + + if (ff->open_flags & FOPEN_DIRECT_IO) { + /* Can't provide the coherency needed for MAP_SHARED */ + if (vma->vm_flags & VM_MAYSHARE) + return -ENODEV; + + invalidate_inode_pages2(file->f_mapping); + + return generic_file_mmap(file, vma); + } + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) fuse_link_write_file(file); @@ -2095,17 +2163,6 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) -{ - /* Can't provide the coherency needed for MAP_SHARED */ - if (vma->vm_flags & VM_MAYSHARE) - return -ENODEV; - - invalidate_inode_pages2(file->f_mapping); - - return generic_file_mmap(file, vma); -} - static int convert_fuse_file_lock(struct fuse_conn *fc, const struct fuse_file_lock *ffl, struct file_lock *fl) @@ -3114,6 +3171,7 @@ static const struct file_operations fuse_file_operations = { .lock = fuse_file_lock, .flock = fuse_file_flock, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, @@ -3121,24 +3179,6 @@ static const struct file_operations fuse_file_operations = { .copy_file_range = fuse_copy_file_range, }; -static const struct file_operations fuse_direct_io_file_operations = { - .llseek = fuse_file_llseek, - .read_iter = fuse_direct_read_iter, - .write_iter = fuse_direct_write_iter, - .mmap = fuse_direct_mmap, - .open = fuse_open, - .flush = fuse_flush, - .release = fuse_release, - .fsync = fuse_fsync, - .lock = fuse_file_lock, - .flock = fuse_file_flock, - .unlocked_ioctl = fuse_file_ioctl, - .compat_ioctl = fuse_file_compat_ioctl, - .poll = fuse_file_poll, - .fallocate = fuse_file_fallocate, - /* no splice_read */ -}; - static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, .writepage = fuse_writepage, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 2f2c92e6f8cb..0920c0c032a0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -96,7 +96,7 @@ struct fuse_inode { union { /* Write related fields (regular file only) */ struct { - /* Files usable in writepage. Protected by fc->lock */ + /* Files usable in writepage. Protected by fi->lock */ struct list_head write_files; /* Writepages pending on truncate or fsync */ @@ -144,6 +144,9 @@ struct fuse_inode { /** Lock for serializing lookup and readdir for back compatibility*/ struct mutex mutex; + + /** Lock to protect write related fields */ + spinlock_t lock; }; /** FUSE inode state bits */ @@ -163,7 +166,10 @@ struct fuse_file { /** Fuse connection for this file */ struct fuse_conn *fc; - /** Request reserved for flush and release */ + /* + * Request reserved for flush and release. + * Modified under relative fuse_inode::lock. + */ struct fuse_req *reserved_req; /** Kernel file handle guaranteed to be unique */ @@ -538,7 +544,7 @@ struct fuse_conn { struct fuse_iqueue iq; /** The next unique kernel file handle */ - u64 khctr; + atomic64_t khctr; /** rbtree of fuse_files waiting for poll events indexed by ph */ struct rb_root polled_files; @@ -624,6 +630,9 @@ struct fuse_conn { /** Is open/release not implemented by fs? */ unsigned no_open:1; + /** Is opendir/releasedir not implemented by fs? */ + unsigned no_opendir:1; + /** Is fsync not implemented by fs? */ unsigned no_fsync:1; @@ -730,7 +739,7 @@ struct fuse_conn { struct fuse_req *destroy_req; /** Version counter for attribute changes */ - u64 attr_version; + atomic64_t attr_version; /** Called on final put */ void (*release)(struct fuse_conn *); @@ -770,6 +779,11 @@ static inline int invalid_nodeid(u64 nodeid) return !nodeid || nodeid == FUSE_ROOT_ID; } +static inline u64 fuse_get_attr_version(struct fuse_conn *fc) +{ + return atomic64_read(&fc->attr_version); +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; @@ -817,7 +831,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file); -void fuse_sync_release(struct fuse_file *ff, int flags); +void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags); /** * Send RELEASE or RELEASEDIR request @@ -936,7 +950,7 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ -void fuse_abort_conn(struct fuse_conn *fc, bool is_abort); +void fuse_abort_conn(struct fuse_conn *fc); void fuse_wait_aborted(struct fuse_conn *fc); /** @@ -1000,8 +1014,6 @@ void fuse_flush_writepages(struct inode *inode); void fuse_set_nowrite(struct inode *inode); void fuse_release_nowrite(struct inode *inode); -u64 fuse_get_attr_version(struct fuse_conn *fc); - /** * File-system tells the kernel to invalidate cache for the given node id. */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c2d4099429be..ec5d9953dfb6 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -97,6 +97,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->orig_ino = 0; fi->state = 0; mutex_init(&fi->mutex); + spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); if (!fi->forget) { kmem_cache_free(fuse_inode_cachep, inode); @@ -163,7 +164,9 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - fi->attr_version = ++fc->attr_version; + lockdep_assert_held(&fi->lock); + + fi->attr_version = atomic64_inc_return(&fc->attr_version); fi->i_time = attr_valid; WRITE_ONCE(fi->inval_mask, 0); @@ -209,10 +212,10 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, loff_t oldsize; struct timespec64 old_mtime; - spin_lock(&fc->lock); + spin_lock(&fi->lock); if ((attr_version != 0 && fi->attr_version > attr_version) || test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); return; } @@ -227,7 +230,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, */ if (!is_wb || !S_ISREG(inode->i_mode)) i_size_write(inode, attr->size); - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); if (!is_wb && S_ISREG(inode->i_mode)) { bool inval = false; @@ -322,9 +325,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, } fi = get_fuse_inode(inode); - spin_lock(&fc->lock); + spin_lock(&fi->lock); fi->nlookup++; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); fuse_change_attributes(inode, attr, attr_valid, attr_version); return inode; @@ -376,7 +379,7 @@ void fuse_unlock_inode(struct inode *inode, bool locked) static void fuse_umount_begin(struct super_block *sb) { - fuse_abort_conn(get_fuse_conn_super(sb), false); + fuse_abort_conn(get_fuse_conn_super(sb)); } static void fuse_send_destroy(struct fuse_conn *fc) @@ -619,12 +622,12 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) atomic_set(&fc->num_waiting, 0); fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; - fc->khctr = 0; + atomic64_set(&fc->khctr, 0); fc->polled_files = RB_ROOT; fc->blocked = 0; fc->initialized = 0; fc->connected = 1; - fc->attr_version = 1; + atomic64_set(&fc->attr_version, 1); get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); @@ -969,7 +972,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | - FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS; + FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | + FUSE_NO_OPENDIR_SUPPORT; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1010,7 +1014,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) if (err) return err; - sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; /* fuse does it's own writeback accounting */ sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; @@ -1242,7 +1246,7 @@ static void fuse_sb_destroy(struct super_block *sb) if (fc) { fuse_send_destroy(fc); - fuse_abort_conn(fc, false); + fuse_abort_conn(fc); fuse_wait_aborted(fc); down_write(&fc->killsb); diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index ab18b78f4755..574d03f8a573 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -213,9 +213,9 @@ retry: } fi = get_fuse_inode(inode); - spin_lock(&fc->lock); + spin_lock(&fi->lock); fi->nlookup++; - spin_unlock(&fc->lock); + spin_unlock(&fi->lock); forget_all_cached_acls(inode); fuse_change_attributes(inode, &o->attr, diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h index 823a328791c0..302f45101a96 100644 --- a/fs/hpfs/hpfs.h +++ b/fs/hpfs/hpfs.h @@ -120,11 +120,11 @@ struct hpfs_spare_block u8 bad_sector: 1; /* bad sector, corrupted disk (???) */ u8 bad_bitmap: 1; /* bad bitmap */ u8 fast: 1; /* partition was fast formatted */ - u8 old_wrote: 1; /* old version wrote to partion */ - u8 old_wrote_1: 1; /* old version wrote to partion (?) */ + u8 old_wrote: 1; /* old version wrote to partition */ + u8 old_wrote_1: 1; /* old version wrote to partition (?) */ #else - u8 old_wrote_1: 1; /* old version wrote to partion (?) */ - u8 old_wrote: 1; /* old version wrote to partion */ + u8 old_wrote_1: 1; /* old version wrote to partition (?) */ + u8 old_wrote: 1; /* old version wrote to partition */ u8 fast: 1; /* partition was fast formatted */ u8 bad_bitmap: 1; /* bad bitmap */ u8 bad_sector: 1; /* bad sector, corrupted disk (???) */ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b0eef008de67..ec32fece5e1e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -27,7 +27,7 @@ #include <linux/backing-dev.h> #include <linux/hugetlb.h> #include <linux/pagevec.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/dnotify.h> @@ -45,11 +45,17 @@ const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; -struct hugetlbfs_config { +enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT }; + +struct hugetlbfs_fs_context { struct hstate *hstate; + unsigned long long max_size_opt; + unsigned long long min_size_opt; long max_hpages; long nr_inodes; long min_hpages; + enum hugetlbfs_size_type max_val_type; + enum hugetlbfs_size_type min_val_type; kuid_t uid; kgid_t gid; umode_t mode; @@ -57,22 +63,30 @@ struct hugetlbfs_config { int sysctl_hugetlb_shm_group; -enum { - Opt_size, Opt_nr_inodes, - Opt_mode, Opt_uid, Opt_gid, - Opt_pagesize, Opt_min_size, - Opt_err, +enum hugetlb_param { + Opt_gid, + Opt_min_size, + Opt_mode, + Opt_nr_inodes, + Opt_pagesize, + Opt_size, + Opt_uid, }; -static const match_table_t tokens = { - {Opt_size, "size=%s"}, - {Opt_nr_inodes, "nr_inodes=%s"}, - {Opt_mode, "mode=%o"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_pagesize, "pagesize=%s"}, - {Opt_min_size, "min_size=%s"}, - {Opt_err, NULL}, +static const struct fs_parameter_spec hugetlb_param_specs[] = { + fsparam_u32 ("gid", Opt_gid), + fsparam_string("min_size", Opt_min_size), + fsparam_u32 ("mode", Opt_mode), + fsparam_string("nr_inodes", Opt_nr_inodes), + fsparam_string("pagesize", Opt_pagesize), + fsparam_string("size", Opt_size), + fsparam_u32 ("uid", Opt_uid), + {} +}; + +static const struct fs_parameter_description hugetlb_fs_parameters = { + .name = "hugetlbfs", + .specs = hugetlb_param_specs, }; #ifdef CONFIG_NUMA @@ -708,16 +722,16 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) } static struct inode *hugetlbfs_get_root(struct super_block *sb, - struct hugetlbfs_config *config) + struct hugetlbfs_fs_context *ctx) { struct inode *inode; inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_mode = S_IFDIR | config->mode; - inode->i_uid = config->uid; - inode->i_gid = config->gid; + inode->i_mode = S_IFDIR | ctx->mode; + inode->i_uid = ctx->uid; + inode->i_gid = ctx->gid; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; @@ -1093,8 +1107,6 @@ static const struct super_operations hugetlbfs_ops = { .show_options = hugetlbfs_show_options, }; -enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT }; - /* * Convert size option passed from command line to number of huge pages * in the pool specified by hstate. Size option could be in bytes @@ -1117,170 +1129,151 @@ hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, return size_opt; } -static int -hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) +/* + * Parse one mount parameter. + */ +static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p, *rest; - substring_t args[MAX_OPT_ARGS]; - int option; - unsigned long long max_size_opt = 0, min_size_opt = 0; - enum hugetlbfs_size_type max_val_type = NO_SIZE, min_val_type = NO_SIZE; - - if (!options) + struct hugetlbfs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + char *rest; + unsigned long ps; + int opt; + + opt = fs_parse(fc, &hugetlb_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + ctx->uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(ctx->uid)) + goto bad_val; return 0; - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; + case Opt_gid: + ctx->gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(ctx->gid)) + goto bad_val; + return 0; - token = match_token(p, tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - goto bad_val; - pconfig->uid = make_kuid(current_user_ns(), option); - if (!uid_valid(pconfig->uid)) - goto bad_val; - break; + case Opt_mode: + ctx->mode = result.uint_32 & 01777U; + return 0; - case Opt_gid: - if (match_int(&args[0], &option)) - goto bad_val; - pconfig->gid = make_kgid(current_user_ns(), option); - if (!gid_valid(pconfig->gid)) - goto bad_val; - break; + case Opt_size: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(param->string[0])) + goto bad_val; + ctx->max_size_opt = memparse(param->string, &rest); + ctx->max_val_type = SIZE_STD; + if (*rest == '%') + ctx->max_val_type = SIZE_PERCENT; + return 0; - case Opt_mode: - if (match_octal(&args[0], &option)) - goto bad_val; - pconfig->mode = option & 01777U; - break; + case Opt_nr_inodes: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(param->string[0])) + goto bad_val; + ctx->nr_inodes = memparse(param->string, &rest); + return 0; - case Opt_size: { - /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(*args[0].from)) - goto bad_val; - max_size_opt = memparse(args[0].from, &rest); - max_val_type = SIZE_STD; - if (*rest == '%') - max_val_type = SIZE_PERCENT; - break; + case Opt_pagesize: + ps = memparse(param->string, &rest); + ctx->hstate = size_to_hstate(ps); + if (!ctx->hstate) { + pr_err("Unsupported page size %lu MB\n", ps >> 20); + return -EINVAL; } + return 0; - case Opt_nr_inodes: - /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(*args[0].from)) - goto bad_val; - pconfig->nr_inodes = memparse(args[0].from, &rest); - break; + case Opt_min_size: + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(param->string[0])) + goto bad_val; + ctx->min_size_opt = memparse(param->string, &rest); + ctx->min_val_type = SIZE_STD; + if (*rest == '%') + ctx->min_val_type = SIZE_PERCENT; + return 0; - case Opt_pagesize: { - unsigned long ps; - ps = memparse(args[0].from, &rest); - pconfig->hstate = size_to_hstate(ps); - if (!pconfig->hstate) { - pr_err("Unsupported page size %lu MB\n", - ps >> 20); - return -EINVAL; - } - break; - } + default: + return -EINVAL; + } - case Opt_min_size: { - /* memparse() will accept a K/M/G without a digit */ - if (!isdigit(*args[0].from)) - goto bad_val; - min_size_opt = memparse(args[0].from, &rest); - min_val_type = SIZE_STD; - if (*rest == '%') - min_val_type = SIZE_PERCENT; - break; - } +bad_val: + return invalf(fc, "hugetlbfs: Bad value '%s' for mount option '%s'\n", + param->string, param->key); +} - default: - pr_err("Bad mount option: \"%s\"\n", p); - return -EINVAL; - break; - } - } +/* + * Validate the parsed options. + */ +static int hugetlbfs_validate(struct fs_context *fc) +{ + struct hugetlbfs_fs_context *ctx = fc->fs_private; /* * Use huge page pool size (in hstate) to convert the size * options to number of huge pages. If NO_SIZE, -1 is returned. */ - pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, - max_size_opt, max_val_type); - pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, - min_size_opt, min_val_type); + ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate, + ctx->max_size_opt, + ctx->max_val_type); + ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate, + ctx->min_size_opt, + ctx->min_val_type); /* * If max_size was specified, then min_size must be smaller */ - if (max_val_type > NO_SIZE && - pconfig->min_hpages > pconfig->max_hpages) { - pr_err("minimum size can not be greater than maximum size\n"); + if (ctx->max_val_type > NO_SIZE && + ctx->min_hpages > ctx->max_hpages) { + pr_err("Minimum size can not be greater than maximum size\n"); return -EINVAL; } return 0; - -bad_val: - pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p); - return -EINVAL; } static int -hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) +hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) { - int ret; - struct hugetlbfs_config config; + struct hugetlbfs_fs_context *ctx = fc->fs_private; struct hugetlbfs_sb_info *sbinfo; - config.max_hpages = -1; /* No limit on size by default */ - config.nr_inodes = -1; /* No limit on number of inodes by default */ - config.uid = current_fsuid(); - config.gid = current_fsgid(); - config.mode = 0755; - config.hstate = &default_hstate; - config.min_hpages = -1; /* No default minimum size */ - ret = hugetlbfs_parse_options(data, &config); - if (ret) - return ret; - sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); if (!sbinfo) return -ENOMEM; sb->s_fs_info = sbinfo; - sbinfo->hstate = config.hstate; spin_lock_init(&sbinfo->stat_lock); - sbinfo->max_inodes = config.nr_inodes; - sbinfo->free_inodes = config.nr_inodes; - sbinfo->spool = NULL; - sbinfo->uid = config.uid; - sbinfo->gid = config.gid; - sbinfo->mode = config.mode; + sbinfo->hstate = ctx->hstate; + sbinfo->max_inodes = ctx->nr_inodes; + sbinfo->free_inodes = ctx->nr_inodes; + sbinfo->spool = NULL; + sbinfo->uid = ctx->uid; + sbinfo->gid = ctx->gid; + sbinfo->mode = ctx->mode; /* * Allocate and initialize subpool if maximum or minimum size is * specified. Any needed reservations (for minimim size) are taken * taken when the subpool is created. */ - if (config.max_hpages != -1 || config.min_hpages != -1) { - sbinfo->spool = hugepage_new_subpool(config.hstate, - config.max_hpages, - config.min_hpages); + if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { + sbinfo->spool = hugepage_new_subpool(ctx->hstate, + ctx->max_hpages, + ctx->min_hpages); if (!sbinfo->spool) goto out_free; } sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_blocksize = huge_page_size(config.hstate); - sb->s_blocksize_bits = huge_page_shift(config.hstate); + sb->s_blocksize = huge_page_size(ctx->hstate); + sb->s_blocksize_bits = huge_page_shift(ctx->hstate); sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; - sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config)); + sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx)); if (!sb->s_root) goto out_free; return 0; @@ -1290,16 +1283,52 @@ out_free: return -ENOMEM; } -static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hugetlbfs_get_tree(struct fs_context *fc) +{ + int err = hugetlbfs_validate(fc); + if (err) + return err; + return vfs_get_super(fc, vfs_get_independent_super, hugetlbfs_fill_super); +} + +static void hugetlbfs_fs_context_free(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations hugetlbfs_fs_context_ops = { + .free = hugetlbfs_fs_context_free, + .parse_param = hugetlbfs_parse_param, + .get_tree = hugetlbfs_get_tree, +}; + +static int hugetlbfs_init_fs_context(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super); + struct hugetlbfs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->max_hpages = -1; /* No limit on size by default */ + ctx->nr_inodes = -1; /* No limit on number of inodes by default */ + ctx->uid = current_fsuid(); + ctx->gid = current_fsgid(); + ctx->mode = 0755; + ctx->hstate = &default_hstate; + ctx->min_hpages = -1; /* No default minimum size */ + ctx->max_val_type = NO_SIZE; + ctx->min_val_type = NO_SIZE; + fc->fs_private = ctx; + fc->ops = &hugetlbfs_fs_context_ops; + return 0; } static struct file_system_type hugetlbfs_fs_type = { - .name = "hugetlbfs", - .mount = hugetlbfs_mount, - .kill_sb = kill_litter_super, + .name = "hugetlbfs", + .init_fs_context = hugetlbfs_init_fs_context, + .parameters = &hugetlb_fs_parameters, + .kill_sb = kill_litter_super, }; static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; @@ -1384,8 +1413,29 @@ out: return file; } +static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) +{ + struct fs_context *fc; + struct vfsmount *mnt; + + fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT); + if (IS_ERR(fc)) { + mnt = ERR_CAST(fc); + } else { + struct hugetlbfs_fs_context *ctx = fc->fs_private; + ctx->hstate = h; + mnt = fc_mount(fc); + put_fs_context(fc); + } + if (IS_ERR(mnt)) + pr_err("Cannot mount internal hugetlbfs for page size %uK", + 1U << (h->order + PAGE_SHIFT - 10)); + return mnt; +} + static int __init init_hugetlbfs_fs(void) { + struct vfsmount *mnt; struct hstate *h; int error; int i; @@ -1408,24 +1458,16 @@ static int __init init_hugetlbfs_fs(void) i = 0; for_each_hstate(h) { - char buf[50]; - unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); - - snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); - hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, - buf); - - if (IS_ERR(hugetlbfs_vfsmount[i])) { - pr_err("Cannot mount internal hugetlbfs for " - "page size %uK", ps_kb); - error = PTR_ERR(hugetlbfs_vfsmount[i]); - hugetlbfs_vfsmount[i] = NULL; + mnt = mount_one_hugetlbfs(h); + if (IS_ERR(mnt) && i == 0) { + error = PTR_ERR(mnt); + goto out; } + hugetlbfs_vfsmount[i] = mnt; i++; } - /* Non default hstates are optional */ - if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) - return 0; + + return 0; out: kmem_cache_destroy(hugetlbfs_inode_cachep); diff --git a/fs/internal.h b/fs/internal.h index d410186bc369..6a8b71643af4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -17,6 +17,7 @@ struct linux_binprm; struct path; struct mount; struct shrink_control; +struct fs_context; /* * block_dev.c @@ -52,8 +53,16 @@ int __generic_write_end(struct inode *inode, loff_t pos, unsigned copied, extern void __init chrdev_init(void); /* + * fs_context.c + */ +extern int parse_monolithic_mount_data(struct fs_context *, void *); +extern void fc_drop_locked(struct fs_context *); + +/* * namei.c */ +extern int filename_lookup(int dfd, struct filename *name, unsigned flags, + struct path *path, struct path *root); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); @@ -99,10 +108,8 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *); /* * super.c */ -extern int do_remount_sb(struct super_block *, int, void *, int); +extern int reconfigure_super(struct fs_context *); extern bool trylock_super(struct super_block *sb); -extern struct dentry *mount_fs(struct file_system_type *, - int, const char *, void *); extern struct super_block *user_get_super(dev_t); /* diff --git a/fs/io_uring.c b/fs/io_uring.c index c592a0933b0d..6aaa30580a2b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -917,7 +917,7 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) /* Use 8x RA size as a decent limiter for both reads/writes */ max_pages = filp->f_ra.ra_pages; if (!max_pages) - max_pages = VM_MAX_READAHEAD >> (PAGE_SHIFT - 10); + max_pages = VM_READAHEAD_PAGES; max_pages *= 8; /* If max pages are exceeded, reset the state */ diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 26f8d7e46462..02e0b79753e7 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -113,7 +113,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) nblocks = jbd2_space_needed(journal); while (jbd2_log_space_left(journal) < nblocks) { write_unlock(&journal->j_state_lock); - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); /* * Test again, another process may have checkpointed while we @@ -276,9 +276,22 @@ restart: "JBD2: %s: Waiting for Godot: block %llu\n", journal->j_devname, (unsigned long long) bh->b_blocknr); + if (batch_count) + __flush_batch(journal, &batch_count); jbd2_log_start_commit(journal, tid); + /* + * jbd2_journal_commit_transaction() may want + * to take the checkpoint_mutex if JBD2_FLUSHED + * is set, jbd2_update_log_tail() called by + * jbd2_journal_commit_transaction() may also take + * checkpoint_mutex. So we need to temporarily + * drop it. + */ + mutex_unlock(&journal->j_checkpoint_mutex); jbd2_log_wait_commit(journal, tid); - goto retry; + mutex_lock_io(&journal->j_checkpoint_mutex); + spin_lock(&journal->j_list_lock); + goto restart; } if (!buffer_dirty(bh)) { if (unlikely(buffer_write_io_error(bh)) && !result) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 2eb55c3361a8..efd0ce9489ae 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -694,9 +694,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) the last tag we set up. */ tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG); - - jbd2_descriptor_block_csum_set(journal, descriptor); start_journal_io: + if (descriptor) + jbd2_descriptor_block_csum_set(journal, + descriptor); + for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; /* diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8ef6b6daaa7a..382c030cc78b 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -142,22 +142,6 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) return cpu_to_be32(csum); } -static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) -{ - if (!jbd2_journal_has_csum_v2or3(j)) - return 1; - - return sb->s_checksum == jbd2_superblock_csum(j, sb); -} - -static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) -{ - if (!jbd2_journal_has_csum_v2or3(j)) - return; - - sb->s_checksum = jbd2_superblock_csum(j, sb); -} - /* * Helper function used to manage commit timeouts */ @@ -1356,6 +1340,10 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } +/* + * This function expects that the caller will have locked the journal + * buffer head, and will return with it unlocked + */ static int jbd2_write_superblock(journal_t *journal, int write_flags) { struct buffer_head *bh = journal->j_sb_buffer; @@ -1365,7 +1353,6 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) trace_jbd2_write_superblock(journal, write_flags); if (!(journal->j_flags & JBD2_BARRIER)) write_flags &= ~(REQ_FUA | REQ_PREFLUSH); - lock_buffer(bh); if (buffer_write_io_error(bh)) { /* * Oh, dear. A previous attempt to write the journal @@ -1381,7 +1368,8 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags) clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); } - jbd2_superblock_csum_set(journal, sb); + if (jbd2_journal_has_csum_v2or3(journal)) + sb->s_checksum = jbd2_superblock_csum(journal, sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; ret = submit_bh(REQ_OP_WRITE, write_flags, bh); @@ -1424,6 +1412,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", tail_block, tail_tid); + lock_buffer(journal->j_sb_buffer); sb->s_sequence = cpu_to_be32(tail_tid); sb->s_start = cpu_to_be32(tail_block); @@ -1454,18 +1443,17 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) journal_superblock_t *sb = journal->j_superblock; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - read_lock(&journal->j_state_lock); - /* Is it already empty? */ - if (sb->s_start == 0) { - read_unlock(&journal->j_state_lock); + lock_buffer(journal->j_sb_buffer); + if (sb->s_start == 0) { /* Is it already empty? */ + unlock_buffer(journal->j_sb_buffer); return; } + jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n", journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(0); - read_unlock(&journal->j_state_lock); jbd2_write_superblock(journal, write_op); @@ -1488,9 +1476,8 @@ void jbd2_journal_update_sb_errno(journal_t *journal) journal_superblock_t *sb = journal->j_superblock; int errcode; - read_lock(&journal->j_state_lock); + lock_buffer(journal->j_sb_buffer); errcode = journal->j_errno; - read_unlock(&journal->j_state_lock); if (errcode == -ESHUTDOWN) errcode = 0; jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); @@ -1595,17 +1582,18 @@ static int journal_get_superblock(journal_t *journal) } } - /* Check superblock checksum */ - if (!jbd2_superblock_csum_verify(journal, sb)) { - printk(KERN_ERR "JBD2: journal checksum error\n"); - err = -EFSBADCRC; - goto out; - } + if (jbd2_journal_has_csum_v2or3(journal)) { + /* Check superblock checksum */ + if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { + printk(KERN_ERR "JBD2: journal checksum error\n"); + err = -EFSBADCRC; + goto out; + } - /* Precompute checksum seed for all metadata */ - if (jbd2_journal_has_csum_v2or3(journal)) + /* Precompute checksum seed for all metadata */ journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, sizeof(sb->s_uuid)); + } set_buffer_verified(bh); @@ -1894,28 +1882,27 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb = journal->j_superblock; + /* Load the checksum driver if necessary */ + if ((journal->j_chksum_driver == NULL) && + INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { + journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(journal->j_chksum_driver)) { + printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); + journal->j_chksum_driver = NULL; + return 0; + } + /* Precompute checksum seed for all metadata */ + journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + sizeof(sb->s_uuid)); + } + + lock_buffer(journal->j_sb_buffer); + /* If enabling v3 checksums, update superblock */ if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) { sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); - - /* Load the checksum driver */ - if (journal->j_chksum_driver == NULL) { - journal->j_chksum_driver = crypto_alloc_shash("crc32c", - 0, 0); - if (IS_ERR(journal->j_chksum_driver)) { - printk(KERN_ERR "JBD2: Cannot load crc32c " - "driver.\n"); - journal->j_chksum_driver = NULL; - return 0; - } - - /* Precompute checksum seed for all metadata */ - journal->j_csum_seed = jbd2_chksum(journal, ~0, - sb->s_uuid, - sizeof(sb->s_uuid)); - } } /* If enabling v1 checksums, downgrade superblock */ @@ -1927,6 +1914,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat, sb->s_feature_compat |= cpu_to_be32(compat); sb->s_feature_ro_compat |= cpu_to_be32(ro); sb->s_feature_incompat |= cpu_to_be32(incompat); + unlock_buffer(journal->j_sb_buffer); return 1; #undef COMPAT_FEATURE_ON @@ -2067,7 +2055,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) err = jbd2_journal_skip_recovery(journal); if (write) { /* Lock to make assertions happy... */ - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index cc35537232f2..f940d31c2adc 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -63,7 +63,7 @@ void jbd2_journal_free_transaction(transaction_t *transaction) /* * jbd2_get_transaction: obtain a new transaction_t object. * - * Simply allocate and initialise a new transaction. Create it in + * Simply initialise a new transaction. Initialize it in * RUNNING state and add it to the current journal (which should not * have an existing running transaction: we only make a new transaction * once we have started to commit the old one). @@ -75,8 +75,8 @@ void jbd2_journal_free_transaction(transaction_t *transaction) * */ -static transaction_t * -jbd2_get_transaction(journal_t *journal, transaction_t *transaction) +static void jbd2_get_transaction(journal_t *journal, + transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; @@ -100,8 +100,6 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_max_wait = 0; transaction->t_start = jiffies; transaction->t_requested = 0; - - return transaction; } /* @@ -1252,11 +1250,12 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; char *committed_data = NULL; - JBUFFER_TRACE(jh, "entry"); if (jbd2_write_access_granted(handle, bh, true)) return 0; jh = jbd2_journal_add_journal_head(bh); + JBUFFER_TRACE(jh, "entry"); + /* * Do this first --- it can drop the journal lock, so we want to * make sure that obtaining the committed_data is done @@ -1367,15 +1366,17 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) if (is_handle_aborted(handle)) return -EROFS; - if (!buffer_jbd(bh)) { - ret = -EUCLEAN; - goto out; - } + if (!buffer_jbd(bh)) + return -EUCLEAN; + /* * We don't grab jh reference here since the buffer must be part * of the running transaction. */ jh = bh2jh(bh); + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + /* * This and the following assertions are unreliable since we may see jh * in inconsistent state unless we grab bh_state lock. But this is @@ -1409,9 +1410,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) } journal = transaction->t_journal; - jbd_debug(5, "journal_head %p\n", jh); - JBUFFER_TRACE(jh, "entry"); - jbd_lock_bh_state(bh); if (jh->b_modified == 0) { @@ -1597,9 +1595,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) __jbd2_journal_unfile_buffer(jh); if (!buffer_jbd(bh)) { spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - __bforget(bh); - goto drop; + goto not_jbd; } } spin_unlock(&journal->j_list_lock); @@ -1609,14 +1605,21 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) /* However, if the buffer is still owned by a prior * (committing) transaction, we can't drop it yet... */ JBUFFER_TRACE(jh, "belongs to older transaction"); - /* ... but we CAN drop it from the new transaction if we - * have also modified it since the original commit. */ + /* ... but we CAN drop it from the new transaction through + * marking the buffer as freed and set j_next_transaction to + * the new transaction, so that not only the commit code + * knows it should clear dirty bits when it is done with the + * buffer, but also the buffer can be checkpointed only + * after the new transaction commits. */ - if (jh->b_next_transaction) { - J_ASSERT(jh->b_next_transaction == transaction); + set_buffer_freed(bh); + + if (!jh->b_next_transaction) { spin_lock(&journal->j_list_lock); - jh->b_next_transaction = NULL; + jh->b_next_transaction = transaction; spin_unlock(&journal->j_list_lock); + } else { + J_ASSERT(jh->b_next_transaction == transaction); /* * only drop a reference if this transaction modified @@ -1625,9 +1628,40 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (was_modified) drop_reserve = 1; } + } else { + /* + * Finally, if the buffer is not belongs to any + * transaction, we can just drop it now if it has no + * checkpoint. + */ + spin_lock(&journal->j_list_lock); + if (!jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "belongs to none transaction"); + spin_unlock(&journal->j_list_lock); + goto not_jbd; + } + + /* + * Otherwise, if the buffer has been written to disk, + * it is safe to remove the checkpoint and drop it. + */ + if (!buffer_dirty(bh)) { + __jbd2_journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + goto not_jbd; + } + + /* + * The buffer is still not written to disk, we should + * attach this buffer to current transaction so that the + * buffer can be checkpointed only after the current + * transaction commits. + */ + clear_buffer_dirty(bh); + __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); + spin_unlock(&journal->j_list_lock); } -not_jbd: jbd_unlock_bh_state(bh); __brelse(bh); drop: @@ -1636,6 +1670,11 @@ drop: handle->h_buffer_credits++; } return err; + +not_jbd: + jbd_unlock_bh_state(bh); + __bforget(bh); + goto drop; } /** diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index dba810cd83b1..0b7d197a904c 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -17,6 +17,7 @@ #include <linux/xattr.h> #include <linux/kernfs.h> +#include <linux/fs_context.h> struct kernfs_iattrs { struct iattr ia_iattr; diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index f3ac352699cf..9a4646eecb71 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -22,16 +22,6 @@ struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; -static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data) -{ - struct kernfs_root *root = kernfs_info(sb)->root; - struct kernfs_syscall_ops *scops = root->syscall_ops; - - if (scops && scops->remount_fs) - return scops->remount_fs(root, flags, data); - return 0; -} - static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry)); @@ -60,7 +50,6 @@ const struct super_operations kernfs_sops = { .drop_inode = generic_delete_inode, .evict_inode = kernfs_evict_inode, - .remount_fs = kernfs_sop_remount_fs, .show_options = kernfs_sop_show_options, .show_path = kernfs_sop_show_path, }; @@ -222,7 +211,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, } while (true); } -static int kernfs_fill_super(struct super_block *sb, unsigned long magic) +static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *kfc) { struct kernfs_super_info *info = kernfs_info(sb); struct inode *inode; @@ -233,7 +222,7 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; - sb->s_magic = magic; + sb->s_magic = kfc->magic; sb->s_op = &kernfs_sops; sb->s_xattr = kernfs_xattr_handlers; if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) @@ -263,21 +252,20 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) return 0; } -static int kernfs_test_super(struct super_block *sb, void *data) +static int kernfs_test_super(struct super_block *sb, struct fs_context *fc) { struct kernfs_super_info *sb_info = kernfs_info(sb); - struct kernfs_super_info *info = data; + struct kernfs_super_info *info = fc->s_fs_info; return sb_info->root == info->root && sb_info->ns == info->ns; } -static int kernfs_set_super(struct super_block *sb, void *data) +static int kernfs_set_super(struct super_block *sb, struct fs_context *fc) { - int error; - error = set_anon_super(sb, data); - if (!error) - sb->s_fs_info = data; - return error; + struct kernfs_fs_context *kfc = fc->fs_private; + + kfc->ns_tag = NULL; + return set_anon_super_fc(sb, fc); } /** @@ -294,63 +282,60 @@ const void *kernfs_super_ns(struct super_block *sb) } /** - * kernfs_mount_ns - kernfs mount helper - * @fs_type: file_system_type of the fs being mounted - * @flags: mount flags specified for the mount - * @root: kernfs_root of the hierarchy being mounted - * @magic: file system specific magic number - * @new_sb_created: tell the caller if we allocated a new superblock - * @ns: optional namespace tag of the mount + * kernfs_get_tree - kernfs filesystem access/retrieval helper + * @fc: The filesystem context. * - * This is to be called from each kernfs user's file_system_type->mount() - * implementation, which should pass through the specified @fs_type and - * @flags, and specify the hierarchy and namespace tag to mount via @root - * and @ns, respectively. - * - * The return value can be passed to the vfs layer verbatim. + * This is to be called from each kernfs user's fs_context->ops->get_tree() + * implementation, which should set the specified ->@fs_type and ->@flags, and + * specify the hierarchy and namespace tag to mount via ->@root and ->@ns, + * respectively. */ -struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, - struct kernfs_root *root, unsigned long magic, - bool *new_sb_created, const void *ns) +int kernfs_get_tree(struct fs_context *fc) { + struct kernfs_fs_context *kfc = fc->fs_private; struct super_block *sb; struct kernfs_super_info *info; int error; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) - return ERR_PTR(-ENOMEM); + return -ENOMEM; - info->root = root; - info->ns = ns; + info->root = kfc->root; + info->ns = kfc->ns_tag; INIT_LIST_HEAD(&info->node); - sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags, - &init_user_ns, info); - if (IS_ERR(sb) || sb->s_fs_info != info) - kfree(info); + fc->s_fs_info = info; + sb = sget_fc(fc, kernfs_test_super, kernfs_set_super); if (IS_ERR(sb)) - return ERR_CAST(sb); - - if (new_sb_created) - *new_sb_created = !sb->s_root; + return PTR_ERR(sb); if (!sb->s_root) { struct kernfs_super_info *info = kernfs_info(sb); - error = kernfs_fill_super(sb, magic); + kfc->new_sb_created = true; + + error = kernfs_fill_super(sb, kfc); if (error) { deactivate_locked_super(sb); - return ERR_PTR(error); + return error; } sb->s_flags |= SB_ACTIVE; mutex_lock(&kernfs_mutex); - list_add(&info->node, &root->supers); + list_add(&info->node, &info->root->supers); mutex_unlock(&kernfs_mutex); } - return dget(sb->s_root); + fc->root = dget(sb->s_root); + return 0; +} + +void kernfs_free_fs_context(struct fs_context *fc) +{ + /* Note that we don't deal with kfc->ns_tag here. */ + kfree(fc->s_fs_info); + fc->s_fs_info = NULL; } /** @@ -377,36 +362,6 @@ void kernfs_kill_sb(struct super_block *sb) kfree(info); } -/** - * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root - * @kernfs_root: the kernfs_root in question - * @ns: the namespace tag - * - * Pin the superblock so the superblock won't be destroyed in subsequent - * operations. This can be used to block ->kill_sb() which may be useful - * for kernfs users which dynamically manage superblocks. - * - * Returns NULL if there's no superblock associated to this kernfs_root, or - * -EINVAL if the superblock is being freed. - */ -struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) -{ - struct kernfs_super_info *info; - struct super_block *sb = NULL; - - mutex_lock(&kernfs_mutex); - list_for_each_entry(info, &root->supers, node) { - if (info->ns == ns) { - sb = info->sb; - if (!atomic_inc_not_zero(&info->sb->s_active)) - sb = ERR_PTR(-EINVAL); - break; - } - } - mutex_unlock(&kernfs_mutex); - return sb; -} - void __init kernfs_init(void) { diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 214a2fa1f1e3..7df6324ccb8a 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -75,17 +75,6 @@ static void nlm4_compute_offsets(const struct nlm_lock *lock, } /* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("lockd: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - - -/* * Encode/decode NLMv4 basic data types * * Basic NLMv4 data types are defined in Appendix II, section 6.1.4 @@ -176,7 +165,6 @@ out_size: dprintk("NFS: returned cookie was too long: %u\n", length); return -EIO; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -236,7 +224,6 @@ out_bad_xdr: __func__, be32_to_cpup(p)); return -EIO; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -309,7 +296,6 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result) out: return error; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 747b9c8c940a..4df62f635529 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -71,17 +71,6 @@ static void nlm_compute_offsets(const struct nlm_lock *lock, } /* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("lockd: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - - -/* * Encode/decode NLMv3 basic data types * * Basic NLMv3 data types are not defined in an IETF standards @@ -173,7 +162,6 @@ out_size: dprintk("NFS: returned cookie was too long: %u\n", length); return -EIO; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -231,7 +219,6 @@ out_enum: __func__, be32_to_cpup(p)); return -EIO; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -303,7 +290,6 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result) out: return error; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } diff --git a/fs/mount.h b/fs/mount.h index f39bc9da4d73..6250de544760 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -146,3 +146,8 @@ static inline bool is_local_mountpoint(struct dentry *dentry) return __is_local_mountpoint(dentry); } + +static inline bool is_anon_ns(struct mnt_namespace *ns) +{ + return ns->seq == 0; +} diff --git a/fs/namei.c b/fs/namei.c index 0a8c5c27f90e..dede0147b3f6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2331,8 +2331,8 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path return err; } -static int filename_lookup(int dfd, struct filename *name, unsigned flags, - struct path *path, struct path *root) +int filename_lookup(int dfd, struct filename *name, unsigned flags, + struct path *path, struct path *root) { int retval; struct nameidata nd; @@ -3460,6 +3460,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } + ima_post_create_tmpfile(inode); return child; out_err: diff --git a/fs/namespace.c b/fs/namespace.c index 98a8c182af4f..c9cab307fa77 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -27,6 +27,7 @@ #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> +#include <linux/fs_context.h> #include "pnode.h" #include "internal.h" @@ -940,38 +941,81 @@ static struct mount *skip_mnt_tree(struct mount *p) return p; } -struct vfsmount * -vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) +/** + * vfs_create_mount - Create a mount for a configured superblock + * @fc: The configuration context with the superblock attached + * + * Create a mount to an already configured superblock. If necessary, the + * caller should invoke vfs_get_tree() before calling this. + * + * Note that this does not attach the mount to anything. + */ +struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; - struct dentry *root; - if (!type) - return ERR_PTR(-ENODEV); + if (!fc->root) + return ERR_PTR(-EINVAL); - mnt = alloc_vfsmnt(name); + mnt = alloc_vfsmnt(fc->source ?: "none"); if (!mnt) return ERR_PTR(-ENOMEM); - if (flags & SB_KERNMOUNT) + if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; - root = mount_fs(type, flags, name, data); - if (IS_ERR(root)) { - mnt_free_id(mnt); - free_vfsmnt(mnt); - return ERR_CAST(root); - } + atomic_inc(&fc->root->d_sb->s_active); + mnt->mnt.mnt_sb = fc->root->d_sb; + mnt->mnt.mnt_root = dget(fc->root); + mnt->mnt_mountpoint = mnt->mnt.mnt_root; + mnt->mnt_parent = mnt; - mnt->mnt.mnt_root = root; - mnt->mnt.mnt_sb = root->d_sb; - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; lock_mount_hash(); - list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts); + list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); return &mnt->mnt; } +EXPORT_SYMBOL(vfs_create_mount); + +struct vfsmount *fc_mount(struct fs_context *fc) +{ + int err = vfs_get_tree(fc); + if (!err) { + up_write(&fc->root->d_sb->s_umount); + return vfs_create_mount(fc); + } + return ERR_PTR(err); +} +EXPORT_SYMBOL(fc_mount); + +struct vfsmount *vfs_kern_mount(struct file_system_type *type, + int flags, const char *name, + void *data) +{ + struct fs_context *fc; + struct vfsmount *mnt; + int ret = 0; + + if (!type) + return ERR_PTR(-EINVAL); + + fc = fs_context_for_mount(type, flags); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + if (name) + ret = vfs_parse_fs_string(fc, "source", + name, strlen(name)); + if (!ret) + ret = parse_monolithic_mount_data(fc, data); + if (!ret) + mnt = fc_mount(fc); + else + mnt = ERR_PTR(ret); + + put_fs_context(fc); + return mnt; +} EXPORT_SYMBOL_GPL(vfs_kern_mount); struct vfsmount * @@ -1013,27 +1057,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags = old->mnt.mnt_flags; mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); - /* Don't allow unprivileged users to change mount flags */ - if (flag & CL_UNPRIVILEGED) { - mnt->mnt.mnt_flags |= MNT_LOCK_ATIME; - - if (mnt->mnt.mnt_flags & MNT_READONLY) - mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; - - if (mnt->mnt.mnt_flags & MNT_NODEV) - mnt->mnt.mnt_flags |= MNT_LOCK_NODEV; - - if (mnt->mnt.mnt_flags & MNT_NOSUID) - mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID; - - if (mnt->mnt.mnt_flags & MNT_NOEXEC) - mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC; - } - - /* Don't allow unprivileged users to reveal what is under a mount */ - if ((flag & CL_UNPRIVILEGED) && - (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire))) - mnt->mnt.mnt_flags |= MNT_LOCKED; atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; @@ -1464,6 +1487,29 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) static void shrink_submounts(struct mount *mnt); +static int do_umount_root(struct super_block *sb) +{ + int ret = 0; + + down_write(&sb->s_umount); + if (!sb_rdonly(sb)) { + struct fs_context *fc; + + fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, + SB_RDONLY); + if (IS_ERR(fc)) { + ret = PTR_ERR(fc); + } else { + ret = parse_monolithic_mount_data(fc, NULL); + if (!ret) + ret = reconfigure_super(fc); + put_fs_context(fc); + } + } + up_write(&sb->s_umount); + return ret; +} + static int do_umount(struct mount *mnt, int flags) { struct super_block *sb = mnt->mnt.mnt_sb; @@ -1529,11 +1575,7 @@ static int do_umount(struct mount *mnt, int flags) */ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; - down_write(&sb->s_umount); - if (!sb_rdonly(sb)) - retval = do_remount_sb(sb, SB_RDONLY, NULL, 0); - up_write(&sb->s_umount); - return retval; + return do_umount_root(sb); } namespace_lock(); @@ -1839,6 +1881,33 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, return 0; } +static void lock_mnt_tree(struct mount *mnt) +{ + struct mount *p; + + for (p = mnt; p; p = next_mnt(p, mnt)) { + int flags = p->mnt.mnt_flags; + /* Don't allow unprivileged users to change mount flags */ + flags |= MNT_LOCK_ATIME; + + if (flags & MNT_READONLY) + flags |= MNT_LOCK_READONLY; + + if (flags & MNT_NODEV) + flags |= MNT_LOCK_NODEV; + + if (flags & MNT_NOSUID) + flags |= MNT_LOCK_NOSUID; + + if (flags & MNT_NOEXEC) + flags |= MNT_LOCK_NOEXEC; + /* Don't allow unprivileged users to reveal what is under a mount */ + if (list_empty(&p->mnt_expire)) + flags |= MNT_LOCKED; + p->mnt.mnt_flags = flags; + } +} + static void cleanup_group_ids(struct mount *mnt, struct mount *end) { struct mount *p; @@ -1956,6 +2025,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, struct mountpoint *dest_mp, struct path *parent_path) { + struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; struct mountpoint *smp; @@ -2006,6 +2076,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, child->mnt_mountpoint); if (q) mnt_change_mountpoint(child, smp, q); + /* Notice when we are propagating across user namespaces */ + if (child->mnt_parent->mnt_ns->user_ns != user_ns) + lock_mnt_tree(child); commit_tree(child); } put_mountpoint(smp); @@ -2313,7 +2386,7 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, int err; struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); - void *sec_opts = NULL; + struct fs_context *fc; if (!check_mnt(mnt)) return -EINVAL; @@ -2324,24 +2397,22 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; - if (data && !(sb->s_type->fs_flags & FS_BINARY_MOUNTDATA)) { - err = security_sb_eat_lsm_opts(data, &sec_opts); - if (err) - return err; - } - err = security_sb_remount(sb, sec_opts); - security_free_mnt_opts(&sec_opts); - if (err) - return err; + fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK); + if (IS_ERR(fc)) + return PTR_ERR(fc); - down_write(&sb->s_umount); - err = -EPERM; - if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { - err = do_remount_sb(sb, sb_flags, data, 0); - if (!err) - set_mount_attributes(mnt, mnt_flags); + err = parse_monolithic_mount_data(fc, data); + if (!err) { + down_write(&sb->s_umount); + err = -EPERM; + if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { + err = reconfigure_super(fc); + if (!err) + set_mount_attributes(mnt, mnt_flags); + } + up_write(&sb->s_umount); } - up_write(&sb->s_umount); + put_fs_context(fc); return err; } @@ -2425,29 +2496,6 @@ out: return err; } -static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) -{ - int err; - const char *subtype = strchr(fstype, '.'); - if (subtype) { - subtype++; - err = -EINVAL; - if (!subtype[0]) - goto err; - } else - subtype = ""; - - mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); - err = -ENOMEM; - if (!mnt->mnt_sb->s_subtype) - goto err; - return mnt; - - err: - mntput(mnt); - return ERR_PTR(err); -} - /* * add a mount into a namespace's mount tree */ @@ -2492,7 +2540,39 @@ unlock: return err; } -static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags); +static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); + +/* + * Create a new mount using a superblock configuration and request it + * be added to the namespace tree. + */ +static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, + unsigned int mnt_flags) +{ + struct vfsmount *mnt; + struct super_block *sb = fc->root->d_sb; + int error; + + error = security_sb_kern_mount(sb); + if (!error && mount_too_revealing(sb, &mnt_flags)) + error = -EPERM; + + if (unlikely(error)) { + fc_drop_locked(fc); + return error; + } + + up_write(&sb->s_umount); + + mnt = vfs_create_mount(fc); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + error = do_add_mount(real_mount(mnt), mountpoint, mnt_flags); + if (error < 0) + mntput(mnt); + return error; +} /* * create a new mount for userspace and request it to be added into the @@ -2502,8 +2582,9 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; - struct vfsmount *mnt; - int err; + struct fs_context *fc; + const char *subtype = NULL; + int err = 0; if (!fstype) return -EINVAL; @@ -2512,23 +2593,37 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, if (!type) return -ENODEV; - mnt = vfs_kern_mount(type, sb_flags, name, data); - if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && - !mnt->mnt_sb->s_subtype) - mnt = fs_set_subtype(mnt, fstype); + if (type->fs_flags & FS_HAS_SUBTYPE) { + subtype = strchr(fstype, '.'); + if (subtype) { + subtype++; + if (!*subtype) { + put_filesystem(type); + return -EINVAL; + } + } else { + subtype = ""; + } + } + fc = fs_context_for_mount(type, sb_flags); put_filesystem(type); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - - if (mount_too_revealing(mnt, &mnt_flags)) { - mntput(mnt); - return -EPERM; - } + if (IS_ERR(fc)) + return PTR_ERR(fc); + + if (subtype) + err = vfs_parse_fs_string(fc, "subtype", + subtype, strlen(subtype)); + if (!err && name) + err = vfs_parse_fs_string(fc, "source", name, strlen(name)); + if (!err) + err = parse_monolithic_mount_data(fc, data); + if (!err) + err = vfs_get_tree(fc); + if (!err) + err = do_new_mount_fc(fc, path, mnt_flags); - err = do_add_mount(real_mount(mnt), path, mnt_flags); - if (err) - mntput(mnt); + put_fs_context(fc); return err; } @@ -2863,7 +2958,8 @@ static void dec_mnt_namespaces(struct ucounts *ucounts) static void free_mnt_ns(struct mnt_namespace *ns) { - ns_free_inum(&ns->ns); + if (!is_anon_ns(ns)) + ns_free_inum(&ns->ns); dec_mnt_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kfree(ns); @@ -2878,7 +2974,7 @@ static void free_mnt_ns(struct mnt_namespace *ns) */ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); -static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) +static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; struct ucounts *ucounts; @@ -2888,28 +2984,27 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns) if (!ucounts) return ERR_PTR(-ENOSPC); - new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } - ret = ns_alloc_inum(&new_ns->ns); - if (ret) { - kfree(new_ns); - dec_mnt_namespaces(ucounts); - return ERR_PTR(ret); + if (!anon) { + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + dec_mnt_namespaces(ucounts); + return ERR_PTR(ret); + } } new_ns->ns.ops = &mntns_operations; - new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); + if (!anon) + new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); - new_ns->root = NULL; INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); - new_ns->event = 0; new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; - new_ns->mounts = 0; - new_ns->pending_mounts = 0; return new_ns; } @@ -2933,7 +3028,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, old = ns->root; - new_ns = alloc_mnt_ns(user_ns); + new_ns = alloc_mnt_ns(user_ns, false); if (IS_ERR(new_ns)) return new_ns; @@ -2941,13 +3036,18 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED; + copy_flags |= CL_SHARED_TO_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); free_mnt_ns(new_ns); return ERR_CAST(new); } + if (user_ns != ns->user_ns) { + lock_mount_hash(); + lock_mnt_tree(new); + unlock_mount_hash(); + } new_ns->root = new; list_add_tail(&new_ns->list, &new->mnt_list); @@ -2988,37 +3088,25 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, return new_ns; } -/** - * create_mnt_ns - creates a private namespace and adds a root filesystem - * @mnt: pointer to the new root filesystem mountpoint - */ -static struct mnt_namespace *create_mnt_ns(struct vfsmount *m) -{ - struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns); - if (!IS_ERR(new_ns)) { - struct mount *mnt = real_mount(m); - mnt->mnt_ns = new_ns; - new_ns->root = mnt; - new_ns->mounts++; - list_add(&mnt->mnt_list, &new_ns->list); - } else { - mntput(m); - } - return new_ns; -} - -struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) +struct dentry *mount_subtree(struct vfsmount *m, const char *name) { + struct mount *mnt = real_mount(m); struct mnt_namespace *ns; struct super_block *s; struct path path; int err; - ns = create_mnt_ns(mnt); - if (IS_ERR(ns)) + ns = alloc_mnt_ns(&init_user_ns, true); + if (IS_ERR(ns)) { + mntput(m); return ERR_CAST(ns); + } + mnt->mnt_ns = ns; + ns->root = mnt; + ns->mounts++; + list_add(&mnt->mnt_list, &ns->list); - err = vfs_path_lookup(mnt->mnt_root, mnt, + err = vfs_path_lookup(m->mnt_root, m, name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); put_mnt_ns(ns); @@ -3228,6 +3316,7 @@ out0: static void __init init_mount_tree(void) { struct vfsmount *mnt; + struct mount *m; struct mnt_namespace *ns; struct path root; struct file_system_type *type; @@ -3240,10 +3329,14 @@ static void __init init_mount_tree(void) if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = create_mnt_ns(mnt); + ns = alloc_mnt_ns(&init_user_ns, false); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); - + m = real_mount(mnt); + m->mnt_ns = ns; + ns->root = m; + ns->mounts = 1; + list_add(&m->mnt_list, &ns->list); init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); @@ -3297,10 +3390,10 @@ void put_mnt_ns(struct mnt_namespace *ns) free_mnt_ns(ns); } -struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) +struct vfsmount *kern_mount(struct file_system_type *type) { struct vfsmount *mnt; - mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data); + mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (!IS_ERR(mnt)) { /* * it is a longterm mount, don't release mnt until @@ -3310,7 +3403,7 @@ struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) } return mnt; } -EXPORT_SYMBOL_GPL(kern_mount_data); +EXPORT_SYMBOL_GPL(kern_mount); void kern_unmount(struct vfsmount *mnt) { @@ -3352,7 +3445,8 @@ bool current_chrooted(void) return chrooted; } -static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, +static bool mnt_already_visible(struct mnt_namespace *ns, + const struct super_block *sb, int *new_mnt_flags) { int new_flags = *new_mnt_flags; @@ -3364,7 +3458,7 @@ static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new, struct mount *child; int mnt_flags; - if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type) + if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; /* This mount is not fully visible if it's root directory @@ -3415,7 +3509,7 @@ found: return visible; } -static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) +static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) { const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; struct mnt_namespace *ns = current->nsproxy->mnt_ns; @@ -3425,7 +3519,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) return false; /* Can this filesystem be too revealing? */ - s_iflags = mnt->mnt_sb->s_iflags; + s_iflags = sb->s_iflags; if (!(s_iflags & SB_I_USERNS_VISIBLE)) return false; @@ -3435,7 +3529,7 @@ static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags) return true; } - return !mnt_already_visible(ns, mnt, new_mnt_flags); + return !mnt_already_visible(ns, sb, new_mnt_flags); } bool mnt_may_suid(struct vfsmount *mnt) @@ -3484,6 +3578,9 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; + if (is_anon_ns(mnt_ns)) + return -EINVAL; + if (fs->users != 1) return -EINVAL; diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index a87a56273407..06233bfa6d73 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -72,16 +72,6 @@ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p) return xdr_ressize_check(rqstp, p); } -static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes) -{ - __be32 *p; - - p = xdr_inline_decode(xdr, nbytes); - if (unlikely(p == NULL)) - printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n"); - return p; -} - static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len, const char **str, size_t maxlen) { @@ -98,13 +88,13 @@ static __be32 decode_fh(struct xdr_stream *xdr, struct nfs_fh *fh) { __be32 *p; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); fh->size = ntohl(*p); if (fh->size > NFS4_FHSIZE) return htonl(NFS4ERR_BADHANDLE); - p = read_buf(xdr, fh->size); + p = xdr_inline_decode(xdr, fh->size); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); memcpy(&fh->data[0], p, fh->size); @@ -117,11 +107,11 @@ static __be32 decode_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) __be32 *p; unsigned int attrlen; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); attrlen = ntohl(*p); - p = read_buf(xdr, attrlen << 2); + p = xdr_inline_decode(xdr, attrlen << 2); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); if (likely(attrlen > 0)) @@ -135,7 +125,7 @@ static __be32 decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) { __be32 *p; - p = read_buf(xdr, NFS4_STATEID_SIZE); + p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); memcpy(stateid->data, p, NFS4_STATEID_SIZE); @@ -156,7 +146,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound status = decode_string(xdr, &hdr->taglen, &hdr->tag, CB_OP_TAGLEN_MAXSZ); if (unlikely(status != 0)) return status; - p = read_buf(xdr, 12); + p = xdr_inline_decode(xdr, 12); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); hdr->minorversion = ntohl(*p++); @@ -176,7 +166,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound static __be32 decode_op_hdr(struct xdr_stream *xdr, unsigned int *op) { __be32 *p; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE_HDR); *op = ntohl(*p); @@ -205,7 +195,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, status = decode_delegation_stateid(xdr, &args->stateid); if (unlikely(status != 0)) return status; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); args->truncate = ntohl(*p); @@ -227,7 +217,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, __be32 status = 0; uint32_t iomode; - p = read_buf(xdr, 4 * sizeof(uint32_t)); + p = xdr_inline_decode(xdr, 4 * sizeof(uint32_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); @@ -245,14 +235,14 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, if (unlikely(status != 0)) return status; - p = read_buf(xdr, 2 * sizeof(uint64_t)); + p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_range.offset); p = xdr_decode_hyper(p, &args->cbl_range.length); return decode_layout_stateid(xdr, &args->cbl_stateid); } else if (args->cbl_recall_type == RETURN_FSID) { - p = read_buf(xdr, 2 * sizeof(uint64_t)); + p = xdr_inline_decode(xdr, 2 * sizeof(uint64_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_fsid.major); @@ -275,7 +265,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, args->ndevs = 0; /* Num of device notifications */ - p = read_buf(xdr, sizeof(uint32_t)); + p = xdr_inline_decode(xdr, sizeof(uint32_t)); if (unlikely(p == NULL)) { status = htonl(NFS4ERR_BADXDR); goto out; @@ -298,7 +288,8 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, for (i = 0; i < n; i++) { struct cb_devicenotifyitem *dev = &args->devs[i]; - p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE); + p = xdr_inline_decode(xdr, (4 * sizeof(uint32_t)) + + NFS4_DEVICEID4_SIZE); if (unlikely(p == NULL)) { status = htonl(NFS4ERR_BADXDR); goto err; @@ -329,7 +320,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp, p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) { - p = read_buf(xdr, sizeof(uint32_t)); + p = xdr_inline_decode(xdr, sizeof(uint32_t)); if (unlikely(p == NULL)) { status = htonl(NFS4ERR_BADXDR); goto err; @@ -359,7 +350,7 @@ static __be32 decode_sessionid(struct xdr_stream *xdr, { __be32 *p; - p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN); + p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); @@ -379,13 +370,13 @@ static __be32 decode_rc_list(struct xdr_stream *xdr, goto out; status = htonl(NFS4ERR_RESOURCE); - p = read_buf(xdr, sizeof(uint32_t)); + p = xdr_inline_decode(xdr, sizeof(uint32_t)); if (unlikely(p == NULL)) goto out; rc_list->rcl_nrefcalls = ntohl(*p++); if (rc_list->rcl_nrefcalls) { - p = read_buf(xdr, + p = xdr_inline_decode(xdr, rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); if (unlikely(p == NULL)) goto out; @@ -418,7 +409,7 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, if (status) return status; - p = read_buf(xdr, 5 * sizeof(uint32_t)); + p = xdr_inline_decode(xdr, 5 * sizeof(uint32_t)); if (unlikely(p == NULL)) return htonl(NFS4ERR_RESOURCE); @@ -461,7 +452,7 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, uint32_t bitmap[2]; __be32 *p, status; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); args->craa_objs_to_keep = ntohl(*p++); @@ -480,7 +471,7 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp, struct cb_recallslotargs *args = argp; __be32 *p; - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); args->crsa_target_highest_slotid = ntohl(*p++); @@ -492,14 +483,14 @@ static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_arg __be32 *p; unsigned int len; - p = read_buf(xdr, 12); + p = xdr_inline_decode(xdr, 12); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbnl_owner.clientid); len = be32_to_cpu(*p); - p = read_buf(xdr, len); + p = xdr_inline_decode(xdr, len); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); @@ -537,7 +528,7 @@ static __be32 decode_write_response(struct xdr_stream *xdr, __be32 *p; /* skip the always zero field */ - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out; p++; @@ -577,7 +568,7 @@ static __be32 decode_offload_args(struct svc_rqst *rqstp, return status; /* decode status */ - p = read_buf(xdr, 4); + p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out; args->error = ntohl(*p++); @@ -943,10 +934,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) }; unsigned int nops = 0; - xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); + xdr_init_decode(&xdr_in, &rqstp->rq_arg, + rqstp->rq_arg.head[0].iov_base, NULL); p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); - xdr_init_encode(&xdr_out, &rqstp->rq_res, p); + xdr_init_encode(&xdr_out, &rqstp->rq_res, p, NULL); status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); if (status == htonl(NFS4ERR_RESOURCE)) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 885363ca8569..2f6b447cdd82 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -229,6 +229,8 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation spin_lock(&delegation->lock); if (delegation->inode != NULL) inode = igrab(delegation->inode); + if (!inode) + set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags); spin_unlock(&delegation->lock); return inode; } @@ -681,7 +683,7 @@ void nfs_expire_all_delegations(struct nfs_client *clp) /** * nfs_super_return_all_delegations - return delegations for one superblock - * @sb: sb to process + * @server: pointer to nfs_server to process * */ void nfs_server_return_all_delegations(struct nfs_server *server) @@ -944,10 +946,11 @@ restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry_rcu(delegation, &server->delegations, super_list) { - if (test_bit(NFS_DELEGATION_RETURNING, - &delegation->flags)) - continue; - if (test_bit(NFS_DELEGATION_NEED_RECLAIM, + if (test_bit(NFS_DELEGATION_INODE_FREEING, + &delegation->flags) || + test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags) || + test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) continue; if (!nfs_sb_active(server->super)) @@ -1053,10 +1056,11 @@ restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { list_for_each_entry_rcu(delegation, &server->delegations, super_list) { - if (test_bit(NFS_DELEGATION_RETURNING, - &delegation->flags)) - continue; - if (test_bit(NFS_DELEGATION_TEST_EXPIRED, + if (test_bit(NFS_DELEGATION_INODE_FREEING, + &delegation->flags) || + test_bit(NFS_DELEGATION_RETURNING, + &delegation->flags) || + test_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags) == 0) continue; if (!nfs_sb_active(server->super)) diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index dcbf3394ba0e..35b4b02c1ae0 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -34,6 +34,7 @@ enum { NFS_DELEGATION_RETURNING, NFS_DELEGATION_REVOKED, NFS_DELEGATION_TEST_EXPIRED, + NFS_DELEGATION_INODE_FREEING, }; int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 6bf4471850c8..a71d0b42d160 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -139,12 +139,19 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[0]; }; +struct readdirvec { + unsigned long nr; + unsigned long index; + struct page *pages[NFS_MAX_READDIR_RAPAGES]; +}; + typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool); typedef struct { struct file *file; struct page *page; struct dir_context *ctx; unsigned long page_index; + struct readdirvec pvec; u64 *dir_cookie; u64 last_cookie; loff_t current_index; @@ -524,6 +531,10 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en struct nfs_cache_array *array; unsigned int count = 0; int status; + int max_rapages = NFS_MAX_READDIR_RAPAGES; + + desc->pvec.index = desc->page_index; + desc->pvec.nr = 0; scratch = alloc_page(GFP_KERNEL); if (scratch == NULL) @@ -548,20 +559,40 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en if (desc->plus) nfs_prime_dcache(file_dentry(desc->file), entry); - status = nfs_readdir_add_to_array(entry, page); + status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]); + if (status == -ENOSPC) { + desc->pvec.nr++; + if (desc->pvec.nr == max_rapages) + break; + status = nfs_readdir_add_to_array(entry, desc->pvec.pages[desc->pvec.nr]); + } if (status != 0) break; } while (!entry->eof); + /* + * page and desc->pvec.pages[0] are valid, don't need to check + * whether or not to be NULL. + */ + copy_highpage(page, desc->pvec.pages[0]); + out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - array = kmap(page); + array = kmap_atomic(desc->pvec.pages[desc->pvec.nr]); array->eof_index = array->size; status = 0; - kunmap(page); + kunmap_atomic(array); } put_page(scratch); + + /* + * desc->pvec.nr > 0 means at least one page was completely filled, + * we should return -ENOSPC. Otherwise function + * nfs_readdir_xdr_to_array will enter infinite loop. + */ + if (desc->pvec.nr > 0) + return -ENOSPC; return status; } @@ -574,8 +605,8 @@ void nfs_readdir_free_pages(struct page **pages, unsigned int npages) } /* - * nfs_readdir_large_page will allocate pages that must be freed with a call - * to nfs_readdir_free_pagearray + * nfs_readdir_alloc_pages() will allocate pages that must be freed with a call + * to nfs_readdir_free_pages() */ static int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages) @@ -595,6 +626,24 @@ out_freepages: return -ENOMEM; } +/* + * nfs_readdir_rapages_init initialize rapages by nfs_cache_array structure. + */ +static +void nfs_readdir_rapages_init(nfs_readdir_descriptor_t *desc) +{ + struct nfs_cache_array *array; + int max_rapages = NFS_MAX_READDIR_RAPAGES; + int index; + + for (index = 0; index < max_rapages; index++) { + array = kmap_atomic(desc->pvec.pages[index]); + memset(array, 0, sizeof(struct nfs_cache_array)); + array->eof_index = -1; + kunmap_atomic(array); + } +} + static int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode) { @@ -605,6 +654,12 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, int status = -ENOMEM; unsigned int array_size = ARRAY_SIZE(pages); + /* + * This means we hit readdir rdpages miss, the preallocated rdpages + * are useless, the preallocate rdpages should be reinitialized. + */ + nfs_readdir_rapages_init(desc); + entry.prev_cookie = 0; entry.cookie = desc->last_cookie; entry.eof = 0; @@ -664,9 +719,24 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) struct inode *inode = file_inode(desc->file); int ret; - ret = nfs_readdir_xdr_to_array(desc, page, inode); - if (ret < 0) - goto error; + /* + * If desc->page_index in range desc->pvec.index and + * desc->pvec.index + desc->pvec.nr, we get readdir cache hit. + */ + if (desc->page_index >= desc->pvec.index && + desc->page_index < (desc->pvec.index + desc->pvec.nr)) { + /* + * page and desc->pvec.pages[x] are valid, don't need to check + * whether or not to be NULL. + */ + copy_highpage(page, desc->pvec.pages[desc->page_index - desc->pvec.index]); + ret = 0; + } else { + ret = nfs_readdir_xdr_to_array(desc, page, inode); + if (ret < 0) + goto error; + } + SetPageUptodate(page); if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) { @@ -831,6 +901,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) *desc = &my_desc; struct nfs_open_dir_context *dir_ctx = file->private_data; int res = 0; + int max_rapages = NFS_MAX_READDIR_RAPAGES; dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n", file, (long long)ctx->pos); @@ -850,6 +921,12 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = nfs_use_readdirplus(inode, ctx); + res = nfs_readdir_alloc_pages(desc->pvec.pages, max_rapages); + if (res < 0) + return -ENOMEM; + + nfs_readdir_rapages_init(desc); + if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) @@ -885,6 +962,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; } while (!desc->eof); out: + nfs_readdir_free_pages(desc->pvec.pages, max_rapages); if (res > 0) res = 0; dfprintk(FILE, "NFS: readdir(%pD2) returns %d\n", file, res); @@ -945,7 +1023,7 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, /** * nfs_force_lookup_revalidate - Mark the directory as having changed - * @dir - pointer to directory inode + * @dir: pointer to directory inode * * This forces the revalidation code in nfs_lookup_revalidate() to do a * full lookup on all child dentries of 'dir' whenever a change occurs @@ -1649,7 +1727,7 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, reval_dentry: if (flags & LOOKUP_RCU) return -ECHILD; - return nfs_lookup_revalidate_dentry(dir, dentry, inode);; + return nfs_lookup_revalidate_dentry(dir, dentry, inode); full_reval: return nfs_do_lookup_revalidate(dir, dentry, flags); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 33824a0a57bf..0fd811ac08b5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -428,7 +428,7 @@ out_put: hdr->release(hdr); } -static void nfs_read_sync_pgio_error(struct list_head *head) +static void nfs_read_sync_pgio_error(struct list_head *head, int error) { struct nfs_page *req; @@ -664,8 +664,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) list_for_each_entry_safe(req, tmp, &reqs, wb_list) { if (!nfs_pageio_add_request(&desc, req)) { - nfs_list_remove_request(req); - nfs_list_add_request(req, &failed); + nfs_list_move_request(req, &failed); spin_lock(&cinfo.inode->i_lock); dreq->flags = 0; if (desc.pg_error < 0) @@ -821,7 +820,7 @@ out_put: hdr->release(hdr); } -static void nfs_write_sync_pgio_error(struct list_head *head) +static void nfs_write_sync_pgio_error(struct list_head *head, int error) { struct nfs_page *req; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 29553fdba8af..4899b85f9b3c 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -89,8 +89,8 @@ EXPORT_SYMBOL_GPL(nfs_file_release); /** * nfs_revalidate_size - Revalidate the file size - * @inode - pointer to inode struct - * @file - pointer to struct file + * @inode: pointer to inode struct + * @filp: pointer to struct file * * Revalidates the file length. This is basically a wrapper around * nfs_revalidate_inode() that takes into account the fact that we may @@ -276,6 +276,12 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync); * then a modify/write/read cycle when writing to a page in the * page cache. * + * Some pNFS layout drivers can only read/write at a certain block + * granularity like all block devices and therefore we must perform + * read/modify/write whenever a page hasn't read yet and the data + * to be written there is not aligned to a block boundary and/or + * smaller than the block size. + * * The modify/write/read cycle may occur if a page is read before * being completely filled by the writer. In this situation, the * page must be completely written to stable storage on the server @@ -291,26 +297,32 @@ EXPORT_SYMBOL_GPL(nfs_file_fsync); * and that the new data won't completely replace the old data in * that range of the file. */ -static int nfs_want_read_modify_write(struct file *file, struct page *page, - loff_t pos, unsigned len) +static bool nfs_full_page_write(struct page *page, loff_t pos, unsigned int len) { unsigned int pglen = nfs_page_length(page); unsigned int offset = pos & (PAGE_SIZE - 1); unsigned int end = offset + len; - if (pnfs_ld_read_whole_page(file->f_mapping->host)) { - if (!PageUptodate(page)) - return 1; - return 0; - } + return !pglen || (end >= pglen && !offset); +} - if ((file->f_mode & FMODE_READ) && /* open for read? */ - !PageUptodate(page) && /* Uptodate? */ - !PagePrivate(page) && /* i/o request already? */ - pglen && /* valid bytes of file? */ - (end < pglen || offset)) /* replace all valid bytes? */ - return 1; - return 0; +static bool nfs_want_read_modify_write(struct file *file, struct page *page, + loff_t pos, unsigned int len) +{ + /* + * Up-to-date pages, those with ongoing or full-page write + * don't need read/modify/write + */ + if (PageUptodate(page) || PagePrivate(page) || + nfs_full_page_write(page, pos, len)) + return false; + + if (pnfs_ld_read_whole_page(file->f_mapping->host)) + return true; + /* Open for reading too? */ + if (file->f_mode & FMODE_READ) + return true; + return false; } /* diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 63abe705f4ca..f9264e1922a2 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -410,7 +410,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, for (i = 0; i < fls->mirror_array_cnt; i++) { struct nfs4_ff_layout_mirror *mirror; struct cred *kcred; - const struct cred *cred; + const struct cred __rcu *cred; kuid_t uid; kgid_t gid; u32 ds_count, fh_count, id; @@ -501,7 +501,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, goto out_err_free; kcred->fsuid = uid; kcred->fsgid = gid; - cred = kcred; + cred = RCU_INITIALIZER(kcred); if (lgr->range.iomode == IOMODE_READ) rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); @@ -788,30 +788,82 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, } } +static void +ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx) +{ + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + + if (devid) + nfs4_mark_deviceid_unavailable(devid); +} + +static void +ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx) +{ + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + + if (devid) + nfs4_mark_deviceid_available(devid); +} + static struct nfs4_pnfs_ds * -ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, - int start_idx, - int *best_idx) +ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, + int start_idx, int *best_idx, + bool check_device) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); + struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; bool fail_return = false; int idx; - /* mirrors are sorted by efficiency */ + /* mirrors are initially sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { if (idx+1 == fls->mirror_array_cnt) - fail_return = true; - ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return); - if (ds) { - *best_idx = idx; - return ds; - } + fail_return = !check_device; + + mirror = FF_LAYOUT_COMP(lseg, idx); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, fail_return); + if (!ds) + continue; + + if (check_device && + nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) + continue; + + *best_idx = idx; + return ds; } return NULL; } +static struct nfs4_pnfs_ds * +ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg, + int start_idx, int *best_idx) +{ + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false); +} + +static struct nfs4_pnfs_ds * +ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg, + int start_idx, int *best_idx) +{ + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true); +} + +static struct nfs4_pnfs_ds * +ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, + int start_idx, int *best_idx) +{ + struct nfs4_pnfs_ds *ds; + + ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx); + if (ds) + return ds; + return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); +} + static void ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, @@ -925,7 +977,8 @@ retry: goto out_mds; for (i = 0; i < pgio->pg_mirror_count; i++) { - ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true); if (!ds) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; @@ -936,7 +989,6 @@ retry: goto retry; } pgm = &pgio->pg_mirrors[i]; - mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; } @@ -1071,6 +1123,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, break; case -NFS4ERR_RETRY_UNCACHED_REP: break; + case -EAGAIN: + return -NFS4ERR_RESET_TO_PNFS; /* Invalidate Layout errors */ case -NFS4ERR_PNFS_NO_LAYOUT: case -ESTALE: /* mapped NFS4ERR_STALE */ @@ -1131,6 +1185,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, case -EBADHANDLE: case -ELOOP: case -ENOSPC: + case -EAGAIN: break; case -EJUKEBOX: nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); @@ -1158,8 +1213,10 @@ static int ff_layout_async_handle_error(struct rpc_task *task, { int vers = clp->cl_nfs_mod->rpc_vers->number; - if (task->tk_status >= 0) + if (task->tk_status >= 0) { + ff_layout_mark_ds_reachable(lseg, idx); return 0; + } /* Handle the case of an invalid layout segment */ if (!pnfs_is_valid_lseg(lseg)) @@ -1222,6 +1279,8 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, offset, length, status, opnum, GFP_NOIO); + if (status == NFS4ERR_NXIO) + ff_layout_mark_ds_unreachable(lseg, idx); pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status); } @@ -1249,7 +1308,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task, if (ff_layout_choose_best_ds_for_read(hdr->lseg, hdr->pgio_mirror_idx + 1, &hdr->pgio_mirror_idx)) - goto out_eagain; + goto out_layouterror; set_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: @@ -1260,6 +1319,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task, } return 0; +out_layouterror: + ff_layout_send_layouterror(hdr->lseg); out_eagain: rpc_restart_call_prepare(task); return -EAGAIN; @@ -1293,15 +1354,6 @@ ff_layout_set_layoutcommit(struct inode *inode, (unsigned long long) NFS_I(inode)->layout->plh_lwb); } -static bool -ff_layout_device_unavailable(struct pnfs_layout_segment *lseg, int idx) -{ - /* No mirroring for now */ - struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx); - - return ff_layout_test_devid_unavailable(node); -} - static void ff_layout_read_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { @@ -1332,10 +1384,6 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, rpc_exit(task, -EIO); return -EIO; } - if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { - rpc_exit(task, -EHOSTDOWN); - return -EAGAIN; - } ff_layout_read_record_layoutstats_start(task, hdr); return 0; @@ -1369,6 +1417,16 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) ff_layout_read_prepare_common(task, hdr); } +static void +ff_layout_io_prepare_transmit(struct rpc_task *task, + void *data) +{ + struct nfs_pgio_header *hdr = data; + + if (!pnfs_is_valid_lseg(hdr->lseg)) + rpc_exit(task, -EAGAIN); +} + static void ff_layout_read_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; @@ -1399,9 +1457,10 @@ static void ff_layout_read_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_read_record_layoutstats_done(&hdr->task, hdr); - if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) { + ff_layout_send_layouterror(hdr->lseg); pnfs_read_resend_pnfs(hdr); - else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) ff_layout_reset_read(hdr); pnfs_generic_rw_release(data); } @@ -1513,11 +1572,6 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EIO; } - if (ff_layout_device_unavailable(hdr->lseg, hdr->pgio_mirror_idx)) { - rpc_exit(task, -EHOSTDOWN); - return -EAGAIN; - } - ff_layout_write_record_layoutstats_start(task, hdr); return 0; } @@ -1573,9 +1627,10 @@ static void ff_layout_write_release(void *data) struct nfs_pgio_header *hdr = data; ff_layout_write_record_layoutstats_done(&hdr->task, hdr); - if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) + if (test_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags)) { + ff_layout_send_layouterror(hdr->lseg); ff_layout_reset_write(hdr, true); - else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) + } else if (test_bit(NFS_IOHDR_RESEND_MDS, &hdr->flags)) ff_layout_reset_write(hdr, false); pnfs_generic_rw_release(data); } @@ -1657,6 +1712,7 @@ static void ff_layout_commit_release(void *data) static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { .rpc_call_prepare = ff_layout_read_prepare_v3, + .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, .rpc_release = ff_layout_read_release, @@ -1664,6 +1720,7 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { .rpc_call_prepare = ff_layout_read_prepare_v4, + .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, .rpc_release = ff_layout_read_release, @@ -1671,6 +1728,7 @@ static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { .rpc_call_prepare = ff_layout_write_prepare_v3, + .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, .rpc_release = ff_layout_write_release, @@ -1678,6 +1736,7 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { .rpc_call_prepare = ff_layout_write_prepare_v4, + .rpc_call_prepare_transmit = ff_layout_io_prepare_transmit, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, .rpc_release = ff_layout_write_release, @@ -1703,6 +1762,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; loff_t offset = hdr->args.offset; u32 idx = hdr->pgio_mirror_idx; @@ -1713,20 +1773,21 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) __func__, hdr->inode->i_ino, hdr->args.pgbase, (size_t)hdr->args.count, offset); - ds = nfs4_ff_layout_prepare_ds(lseg, idx, false); + mirror = FF_LAYOUT_COMP(lseg, idx); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); if (!ds) goto out_failed; - ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, hdr->inode); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(lseg, idx); + vers = nfs4_ff_layout_ds_version(mirror); dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); @@ -1734,13 +1795,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->pgio_done_cb = ff_layout_read_done_cb; refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - fh = nfs4_ff_layout_select_ds_fh(lseg, idx); + fh = nfs4_ff_layout_select_ds_fh(mirror); if (fh) hdr->args.fh = fh; - if (vers == 4 && - !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid)) - goto out_failed; + nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); /* * Note that if we ever decide to split across DSes, @@ -1770,26 +1829,28 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) struct pnfs_layout_segment *lseg = hdr->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; loff_t offset = hdr->args.offset; int vers; struct nfs_fh *fh; int idx = hdr->pgio_mirror_idx; - ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); + mirror = FF_LAYOUT_COMP(lseg, idx); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); if (!ds) goto out_failed; - ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, hdr->inode); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(lseg, idx); + vers = nfs4_ff_layout_ds_version(mirror); dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, @@ -1800,13 +1861,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; - fh = nfs4_ff_layout_select_ds_fh(lseg, idx); + fh = nfs4_ff_layout_select_ds_fh(mirror); if (fh) hdr->args.fh = fh; - if (vers == 4 && - !nfs4_ff_layout_select_ds_stateid(lseg, idx, &hdr->args.stateid)) - goto out_failed; + nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); /* * Note that if we ever decide to split across DSes, @@ -1849,6 +1908,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) struct pnfs_layout_segment *lseg = data->lseg; struct nfs4_pnfs_ds *ds; struct rpc_clnt *ds_clnt; + struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; u32 idx; int vers, ret; @@ -1859,20 +1919,21 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) goto out_err; idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); - ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); + mirror = FF_LAYOUT_COMP(lseg, idx); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); if (!ds) goto out_err; - ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, + ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, data->inode); if (IS_ERR(ds_clnt)) goto out_err; - ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred); if (!ds_cred) goto out_err; - vers = nfs4_ff_layout_ds_version(lseg, idx); + vers = nfs4_ff_layout_ds_version(mirror); dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), @@ -2036,7 +2097,7 @@ ff_layout_encode_layoutreturn(struct xdr_stream *xdr, dprintk("%s: Begin\n", __func__); - xdr_init_encode(&tmp_xdr, &tmp_buf, NULL); + xdr_init_encode(&tmp_xdr, &tmp_buf, NULL, NULL); ff_layout_encode_ioerr(&tmp_xdr, args, ff_args); ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args); @@ -2102,6 +2163,52 @@ out_nomem: return -ENOMEM; } +#ifdef CONFIG_NFS_V4_2 +void +ff_layout_send_layouterror(struct pnfs_layout_segment *lseg) +{ + struct pnfs_layout_hdr *lo = lseg->pls_layout; + struct nfs42_layout_error *errors; + LIST_HEAD(head); + + if (!nfs_server_capable(lo->plh_inode, NFS_CAP_LAYOUTERROR)) + return; + ff_layout_fetch_ds_ioerr(lo, &lseg->pls_range, &head, -1); + if (list_empty(&head)) + return; + + errors = kmalloc_array(NFS42_LAYOUTERROR_MAX, + sizeof(*errors), GFP_NOFS); + if (errors != NULL) { + const struct nfs4_ff_layout_ds_err *pos; + size_t n = 0; + + list_for_each_entry(pos, &head, list) { + errors[n].offset = pos->offset; + errors[n].length = pos->length; + nfs4_stateid_copy(&errors[n].stateid, &pos->stateid); + errors[n].errors[0].dev_id = pos->deviceid; + errors[n].errors[0].status = pos->status; + errors[n].errors[0].opnum = pos->opnum; + n++; + if (!list_is_last(&pos->list, &head) && + n < NFS42_LAYOUTERROR_MAX) + continue; + if (nfs42_proc_layouterror(lseg, errors, n) < 0) + break; + n = 0; + } + kfree(errors); + } + ff_layout_free_ds_ioerr(&head); +} +#else +void +ff_layout_send_layouterror(struct pnfs_layout_segment *lseg) +{ +} +#endif + static int ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen) { diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index c2626bad466b..2f369966abf7 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -132,16 +132,6 @@ FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg) generic_hdr); } -static inline struct nfs4_deviceid_node * -FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) -{ - if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt || - FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL || - FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL) - return NULL; - return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node; -} - static inline struct nfs4_ff_layout_ds * FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node) { @@ -151,9 +141,25 @@ FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node) static inline struct nfs4_ff_layout_mirror * FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx) { - if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt) - return NULL; - return FF_LAYOUT_LSEG(lseg)->mirror_array[idx]; + struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); + + if (idx < fls->mirror_array_cnt) + return fls->mirror_array[idx]; + return NULL; +} + +static inline struct nfs4_deviceid_node * +FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) +{ + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx); + + if (mirror != NULL) { + struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds; + + if (!IS_ERR_OR_NULL(mirror_ds)) + return &mirror_ds->id_node; + } + return NULL; } static inline u32 @@ -174,28 +180,10 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_READ_IO; } -static inline bool -ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) -{ - /* - * Flexfiles should never mark a DS unavailable, but if it does - * print a (ratelimited) warning as this can affect performance. - */ - if (nfs4_test_deviceid_unavailable(node)) { - u32 *p = (u32 *)node->deviceid.data; - - pr_warn_ratelimited("NFS: flexfiles layout referencing an " - "unavailable device [%x%x%x%x]\n", - p[0], p[1], p[2], p[3]); - return true; - } - return false; -} - static inline int -nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx) +nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror) { - return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version; + return mirror->mirror_ds->ds_versions[0].version; } struct nfs4_ff_layout_ds * @@ -207,6 +195,7 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, struct nfs4_ff_layout_mirror *mirror, u64 offset, u64 length, int status, enum nfs_opnum4 opnum, gfp_t gfp_flags); +void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg); int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head); void ff_layout_free_ds_ioerr(struct list_head *head); unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, @@ -214,23 +203,23 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, struct list_head *head, unsigned int maxnum); struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx); -int -nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg, - u32 mirror_idx, - nfs4_stateid *stateid); +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror); +void +nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, + nfs4_stateid *stateid); struct nfs4_pnfs_ds * -nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, + struct nfs4_ff_layout_mirror *mirror, bool fail_return); struct rpc_clnt * -nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, - u32 ds_idx, +nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, struct nfs_client *ds_clp, struct inode *inode); -const struct cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, - u32 ds_idx, const struct cred *mdscred); +const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, + const struct pnfs_layout_range *range, + const struct cred *mdscred); bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 11766a74216d..a809989807d6 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -183,56 +183,6 @@ out_err: return NULL; } -static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg, - struct nfs4_deviceid_node *devid) -{ - nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); - if (!ff_layout_has_available_ds(lseg)) - pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, - lseg); -} - -static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, - struct nfs4_ff_layout_mirror *mirror, - bool create) -{ - if (mirror == NULL || IS_ERR(mirror->mirror_ds)) - goto outerr; - if (mirror->mirror_ds == NULL) { - if (create) { - struct nfs4_deviceid_node *node; - struct pnfs_layout_hdr *lh = lseg->pls_layout; - struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); - - node = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), - &mirror->devid, lh->plh_lc_cred, - GFP_KERNEL); - if (node) - mirror_ds = FF_LAYOUT_MIRROR_DS(node); - - /* check for race with another call to this function */ - if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && - mirror_ds != ERR_PTR(-ENODEV)) - nfs4_put_deviceid_node(node); - } else - goto outerr; - } - - if (IS_ERR(mirror->mirror_ds)) - goto outerr; - - if (mirror->mirror_ds->ds == NULL) { - struct nfs4_deviceid_node *devid; - devid = &mirror->mirror_ds->id_node; - ff_layout_mark_devid_invalid(lseg, devid); - return false; - } - return true; -outerr: - pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); - return false; -} - static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, u64 offset, u64 length) { @@ -326,7 +276,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, spin_lock(&flo->generic_hdr.plh_inode->i_lock); ff_layout_add_ds_error_locked(flo, dserr); spin_unlock(&flo->generic_hdr.plh_inode->i_lock); - return 0; } @@ -353,46 +302,54 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) } struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); - struct nfs_fh *fh = NULL; - - if (!ff_layout_mirror_valid(lseg, mirror, false)) { - pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", - __func__, mirror_idx); - goto out; - } - /* FIXME: For now assume there is only 1 version available for the DS */ - fh = &mirror->fh_versions[0]; -out: - return fh; + return &mirror->fh_versions[0]; } -int -nfs4_ff_layout_select_ds_stateid(struct pnfs_layout_segment *lseg, - u32 mirror_idx, - nfs4_stateid *stateid) +void +nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, + nfs4_stateid *stateid) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); + if (nfs4_ff_layout_ds_version(mirror) == 4) + nfs4_stateid_copy(stateid, &mirror->stateid); +} - if (!ff_layout_mirror_valid(lseg, mirror, false)) { - pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", - __func__, mirror_idx); - goto out; +static bool +ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo, + struct nfs4_ff_layout_mirror *mirror) +{ + if (mirror == NULL) + goto outerr; + if (mirror->mirror_ds == NULL) { + struct nfs4_deviceid_node *node; + struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); + + node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), + &mirror->devid, lo->plh_lc_cred, + GFP_KERNEL); + if (node) + mirror_ds = FF_LAYOUT_MIRROR_DS(node); + + /* check for race with another call to this function */ + if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && + mirror_ds != ERR_PTR(-ENODEV)) + nfs4_put_deviceid_node(node); } - nfs4_stateid_copy(stateid, &mirror->stateid); - return 1; -out: - return 0; + if (IS_ERR(mirror->mirror_ds)) + goto outerr; + + return true; +outerr: + return false; } /** * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call * @lseg: the layout segment we're operating on - * @ds_idx: index of the DS to use + * @mirror: layout mirror describing the DS to use * @fail_return: return layout on connect failure? * * Try to prepare a DS connection to accept an RPC call. This involves @@ -407,26 +364,18 @@ out: * Returns a pointer to a connected DS object on success or NULL on failure. */ struct nfs4_pnfs_ds * -nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, + struct nfs4_ff_layout_mirror *mirror, bool fail_return) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); struct nfs4_pnfs_ds *ds = NULL; - struct nfs4_deviceid_node *devid; struct inode *ino = lseg->pls_layout->plh_inode; struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; int status; - if (!ff_layout_mirror_valid(lseg, mirror, true)) { - pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", - __func__, ds_idx); - goto out; - } - - devid = &mirror->mirror_ds->id_node; - if (ff_layout_test_devid_unavailable(devid)) - goto out_fail; + if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror)) + goto noconnect; ds = mirror->mirror_ds->ds; /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ @@ -437,8 +386,8 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. */ - status = nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, - dataserver_retrans, + status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node, + dataserver_timeo, dataserver_retrans, mirror->mirror_ds->ds_versions[0].version, mirror->mirror_ds->ds_versions[0].minor_version); @@ -453,11 +402,12 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror->mirror_ds->ds_versions[0].wsize = max_payload; goto out; } -out_fail: +noconnect: ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); + ff_layout_send_layouterror(lseg); if (fail_return || !ff_layout_has_available_ds(lseg)) pnfs_error_mark_layout_for_return(ino, lseg); ds = NULL; @@ -466,14 +416,14 @@ out: } const struct cred * -ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, +ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, + const struct pnfs_layout_range *range, const struct cred *mdscred) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); const struct cred *cred; if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) { - cred = ff_layout_get_mirror_cred(mirror, lseg->pls_range.iomode); + cred = ff_layout_get_mirror_cred(mirror, range->iomode); if (!cred) cred = get_cred(mdscred); } else { @@ -483,15 +433,18 @@ ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, } /** -* Find or create a DS rpc client with th MDS server rpc client auth flavor -* in the nfs_client cl_ds_clients list. -*/ + * nfs4_ff_find_or_create_ds_client - Find or create a DS rpc client + * @mirror: pointer to the mirror + * @ds_clp: nfs_client for the DS + * @inode: pointer to inode + * + * Find or create a DS rpc client with th MDS server rpc client auth flavor + * in the nfs_client cl_ds_clients list. + */ struct rpc_clnt * -nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx, +nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, struct nfs_client *ds_clp, struct inode *inode) { - struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - switch (mirror->mirror_ds->ds_versions[0].version) { case 3: /* For NFSv3 DS, flavor is set when creating DS connections */ @@ -608,7 +561,7 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) if (IS_ERR(mirror->mirror_ds)) continue; devid = &mirror->mirror_ds->id_node; - if (!ff_layout_test_devid_unavailable(devid)) + if (!nfs4_test_deviceid_unavailable(devid)) return true; } } @@ -629,7 +582,7 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) if (!mirror->mirror_ds) continue; devid = &mirror->mirror_ds->id_node; - if (ff_layout_test_devid_unavailable(devid)) + if (nfs4_test_deviceid_unavailable(devid)) return false; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 094775ea0781..414a90d48493 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -143,6 +143,7 @@ EXPORT_SYMBOL_GPL(nfs_sync_inode); /** * nfs_sync_mapping - helper to flush all mmapped dirty data to disk + * @mapping: pointer to struct address_space */ int nfs_sync_mapping(struct address_space *mapping) { @@ -1184,8 +1185,8 @@ int nfs_attribute_cache_expired(struct inode *inode) /** * nfs_revalidate_inode - Revalidate the inode attributes - * @server - pointer to nfs_server struct - * @inode - pointer to inode struct + * @server: pointer to nfs_server struct + * @inode: pointer to inode struct * * Updates inode attribute information by retrieving the data from the server. */ @@ -1255,8 +1256,8 @@ out: /** * nfs_revalidate_mapping - Revalidate the pagecache - * @inode - pointer to host inode - * @mapping - pointer to mapping + * @inode: pointer to host inode + * @mapping: pointer to mapping */ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) @@ -1371,8 +1372,8 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) /** * nfs_check_inode_attributes - verify consistency of the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * Verifies the attribute cache. If we have just changed the attributes, * so that fattr carries weak cache consistency data, then it may @@ -1572,8 +1573,8 @@ EXPORT_SYMBOL_GPL(_nfs_display_fhandle); /** * nfs_inode_attrs_need_update - check if the inode attributes need updating - * @inode - pointer to inode - * @fattr - attributes + * @inode: pointer to inode + * @fattr: attributes * * Attempt to divine whether or not an RPC call reply carrying stale * attributes got scheduled after another call carrying updated ones. @@ -1614,8 +1615,8 @@ static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr /** * nfs_refresh_inode - try to update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * Check that an RPC call that returned attributes has not overlapped with * other recent updates of the inode metadata, then decide whether it is @@ -1649,8 +1650,8 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, /** * nfs_post_op_update_inode - try to update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. @@ -1679,8 +1680,8 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); /** * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. Fake up @@ -1731,8 +1732,8 @@ out_noforce: /** * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache - * @inode - pointer to inode - * @fattr - updated attributes + * @inode: pointer to inode + * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. Fake up diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index b1e577302518..c7cf23ae6597 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -69,7 +69,8 @@ struct nfs_clone_mount { * Maximum number of pages that readdir can use for creating * a vmapped array of pages. */ -#define NFS_MAX_READDIR_PAGES 8 +#define NFS_MAX_READDIR_PAGES 64 +#define NFS_MAX_READDIR_RAPAGES 8 struct nfs_client_initdata { unsigned long init_flags; @@ -755,6 +756,7 @@ static inline bool nfs_error_is_fatal(int err) { switch (err) { case -ERESTARTSYS: + case -EINTR: case -EACCES: case -EDQUOT: case -EFBIG: @@ -763,6 +765,7 @@ static inline bool nfs_error_is_fatal(int err) case -EROFS: case -ESTALE: case -E2BIG: + case -ENOMEM: return true; default: return false; diff --git a/fs/nfs/io.c b/fs/nfs/io.c index 9034b4926909..5088fda9b453 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -25,7 +25,7 @@ static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) /** * nfs_start_io_read - declare the file is being used for buffered reads - * @inode - file inode + * @inode: file inode * * Declare that a buffered read operation is about to start, and ensure * that we block all direct I/O. @@ -56,7 +56,7 @@ nfs_start_io_read(struct inode *inode) /** * nfs_end_io_read - declare that the buffered read operation is done - * @inode - file inode + * @inode: file inode * * Declare that a buffered read operation is done, and release the shared * lock on inode->i_rwsem. @@ -69,7 +69,7 @@ nfs_end_io_read(struct inode *inode) /** * nfs_start_io_write - declare the file is being used for buffered writes - * @inode - file inode + * @inode: file inode * * Declare that a buffered read operation is about to start, and ensure * that we block all direct I/O. @@ -83,7 +83,7 @@ nfs_start_io_write(struct inode *inode) /** * nfs_end_io_write - declare that the buffered write operation is done - * @inode - file inode + * @inode: file inode * * Declare that a buffered write operation is done, and release the * lock on inode->i_rwsem. @@ -105,7 +105,7 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) /** * nfs_end_io_direct - declare the file is being used for direct i/o - * @inode - file inode + * @inode: file inode * * Declare that a direct I/O operation is about to start, and ensure * that we block all buffered I/O. @@ -136,7 +136,7 @@ nfs_start_io_direct(struct inode *inode) /** * nfs_end_io_direct - declare that the direct i/o operation is done - * @inode - file inode + * @inode: file inode * * Declare that a direct I/O operation is done, and release the shared * lock on inode->i_rwsem. diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index e5686be67be8..15f099a24c29 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -221,10 +221,10 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, /** * nfs_do_submount - set up mountpoint when crossing a filesystem boundary - * @dentry - parent directory - * @fh - filehandle for new root dentry - * @fattr - attributes for new root inode - * @authflavor - security flavor to use when performing the mount + * @dentry: parent directory + * @fh: filehandle for new root dentry + * @fattr: attributes for new root inode + * @authflavor: security flavor to use when performing the mount * */ struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 350675e3ed47..a7ed29de0a40 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -22,6 +22,7 @@ #include <linux/nfs.h> #include <linux/nfs2.h> #include <linux/nfs_fs.h> +#include "nfstrace.h" #include "internal.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -55,42 +56,16 @@ #define NFS_attrstat_sz (1+NFS_fattr_sz) #define NFS_diropres_sz (1+NFS_fhandle_sz+NFS_fattr_sz) -#define NFS_readlinkres_sz (2) -#define NFS_readres_sz (1+NFS_fattr_sz+1) +#define NFS_readlinkres_sz (2+1) +#define NFS_readres_sz (1+NFS_fattr_sz+1+1) #define NFS_writeres_sz (NFS_attrstat_sz) #define NFS_stat_sz (1) -#define NFS_readdirres_sz (1) +#define NFS_readdirres_sz (1+1) #define NFS_statfsres_sz (1+NFS_info_sz) static int nfs_stat_to_errno(enum nfs_stat); /* - * While encoding arguments, set up the reply buffer in advance to - * receive reply data directly into the page cache. - */ -static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, - unsigned int base, unsigned int len, - unsigned int bufsize) -{ - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - - replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); -} - -/* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("NFS: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - - -/* * Encode/decode NFSv2 basic data types * * Basic NFSv2 data types are defined in section 2.3 of RFC 1094: @@ -110,8 +85,8 @@ static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); recvd = xdr_read_pages(xdr, count); if (unlikely(count > recvd)) @@ -125,9 +100,6 @@ out_cheating: "count %u > recvd %u\n", count, recvd); count = recvd; goto out; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -157,13 +129,16 @@ static int decode_stat(struct xdr_stream *xdr, enum nfs_stat *status) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; + if (unlikely(*p != cpu_to_be32(NFS_OK))) + goto out_status; + *status = 0; + return 0; +out_status: *status = be32_to_cpup(p); + trace_nfs_xdr_status((int)*status); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -205,14 +180,11 @@ static int decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh) __be32 *p; p = xdr_inline_decode(xdr, NFS2_FHSIZE); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fh->size = NFS2_FHSIZE; memcpy(fh->data, p, NFS2_FHSIZE); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -282,8 +254,8 @@ static int decode_fattr(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, NFS_fattr_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fattr->valid |= NFS_ATTR_FATTR_V2; @@ -325,9 +297,6 @@ out_uid: out_gid: dprintk("NFS: returned invalid gid\n"); return -EINVAL; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -416,23 +385,20 @@ static int decode_filename_inline(struct xdr_stream *xdr, u32 count; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); if (count > NFS3_MAXNAMLEN) goto out_nametoolong; p = xdr_inline_decode(xdr, count); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; *name = (const char *)p; *length = count; return 0; out_nametoolong: dprintk("NFS: returned filename too long: %u\n", count); return -ENAMETOOLONG; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -455,8 +421,8 @@ static int decode_path(struct xdr_stream *xdr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; length = be32_to_cpup(p); if (unlikely(length >= xdr->buf->page_len || length > NFS_MAXPATHLEN)) goto out_size; @@ -472,9 +438,6 @@ out_cheating: dprintk("NFS: server cheating in pathname result: " "length %u > received %u\n", length, recvd); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -615,8 +578,8 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req, const struct nfs_readlinkargs *args = data; encode_fhandle(xdr, args->fh); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->pglen, NFS_readlinkres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->pglen, NFS_readlinkres_sz); } /* @@ -651,8 +614,8 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req, const struct nfs_pgio_args *args = data; encode_readargs(xdr, args); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->count, NFS_readres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, NFS_readres_sz); req->rq_rcv_buf.flags |= XDRBUF_READ; } @@ -809,8 +772,8 @@ static void nfs2_xdr_enc_readdirargs(struct rpc_rqst *req, const struct nfs_readdirargs *args = data; encode_readdirargs(xdr, args); - prepare_reply_buffer(req, args->pages, 0, - args->count, NFS_readdirres_sz); + rpc_prepare_reply_pages(req, args->pages, 0, + args->count, NFS_readdirres_sz); } /* @@ -951,12 +914,12 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, int error; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p++ == xdr_zero) { p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p++ == xdr_zero) return -EAGAIN; entry->eof = 1; @@ -964,8 +927,8 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, } p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; entry->ino = be32_to_cpup(p); error = decode_filename_inline(xdr, &entry->name, &entry->len); @@ -978,17 +941,13 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, */ entry->prev_cookie = entry->cookie; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; entry->cookie = be32_to_cpup(p); entry->d_type = DT_UNKNOWN; return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EAGAIN; } /* @@ -1052,17 +1011,14 @@ static int decode_info(struct xdr_stream *xdr, struct nfs2_fsstat *result) __be32 *p; p = xdr_inline_decode(xdr, NFS_info_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->tsize = be32_to_cpup(p++); result->bsize = be32_to_cpup(p++); result->blocks = be32_to_cpup(p++); result->bfree = be32_to_cpup(p++); result->bavail = be32_to_cpup(p); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs2_xdr_dec_statfsres(struct rpc_rqst *req, struct xdr_stream *xdr, diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9fce18548f7e..c5c3fc6e6c60 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -222,8 +222,6 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, switch (status) { case 0: status = nfs_refresh_inode(inode, fattr); - set_cached_acl(inode, ACL_TYPE_ACCESS, acl); - set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl); break; case -EPFNOSUPPORT: case -EPROTONOSUPPORT: diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 78df4eb60f85..110358f4986d 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -21,6 +21,7 @@ #include <linux/nfs3.h> #include <linux/nfs_fs.h> #include <linux/nfsacl.h> +#include "nfstrace.h" #include "internal.h" #define NFSDBG_FACILITY NFSDBG_XDR @@ -68,13 +69,13 @@ #define NFS3_removeres_sz (NFS3_setattrres_sz) #define NFS3_lookupres_sz (1+NFS3_fh_sz+(2 * NFS3_post_op_attr_sz)) #define NFS3_accessres_sz (1+NFS3_post_op_attr_sz+1) -#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1) -#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3) +#define NFS3_readlinkres_sz (1+NFS3_post_op_attr_sz+1+1) +#define NFS3_readres_sz (1+NFS3_post_op_attr_sz+3+1) #define NFS3_writeres_sz (1+NFS3_wcc_data_sz+4) #define NFS3_createres_sz (1+NFS3_fh_sz+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) #define NFS3_renameres_sz (1+(2 * NFS3_wcc_data_sz)) #define NFS3_linkres_sz (1+NFS3_post_op_attr_sz+NFS3_wcc_data_sz) -#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2) +#define NFS3_readdirres_sz (1+NFS3_post_op_attr_sz+2+1) #define NFS3_fsstatres_sz (1+NFS3_post_op_attr_sz+13) #define NFS3_fsinfores_sz (1+NFS3_post_op_attr_sz+12) #define NFS3_pathconfres_sz (1+NFS3_post_op_attr_sz+6) @@ -84,7 +85,7 @@ #define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) #define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ - XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)+1) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) static int nfs3_stat_to_errno(enum nfs_stat); @@ -104,32 +105,6 @@ static const umode_t nfs_type2fmt[] = { }; /* - * While encoding arguments, set up the reply buffer in advance to - * receive reply data directly into the page cache. - */ -static void prepare_reply_buffer(struct rpc_rqst *req, struct page **pages, - unsigned int base, unsigned int len, - unsigned int bufsize) -{ - struct rpc_auth *auth = req->rq_cred->cr_auth; - unsigned int replen; - - replen = RPC_REPHDRSIZE + auth->au_rslack + bufsize; - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, pages, base, len); -} - -/* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("NFS: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - - -/* * Encode/decode NFSv3 basic data types * * Basic NFSv3 data types are defined in section 2.5 of RFC 1813: @@ -151,13 +126,10 @@ static int decode_uint32(struct xdr_stream *xdr, u32 *value) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; *value = be32_to_cpup(p); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_uint64(struct xdr_stream *xdr, u64 *value) @@ -165,13 +137,10 @@ static int decode_uint64(struct xdr_stream *xdr, u64 *value) __be32 *p; p = xdr_inline_decode(xdr, 8); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; xdr_decode_hyper(p, value); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -211,14 +180,14 @@ static int decode_inline_filename3(struct xdr_stream *xdr, u32 count; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); if (count > NFS3_MAXNAMLEN) goto out_nametoolong; p = xdr_inline_decode(xdr, count); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; *name = (const char *)p; *length = count; return 0; @@ -226,9 +195,6 @@ static int decode_inline_filename3(struct xdr_stream *xdr, out_nametoolong: dprintk("NFS: returned filename too long: %u\n", count); return -ENAMETOOLONG; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -249,8 +215,8 @@ static int decode_nfspath3(struct xdr_stream *xdr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p); if (unlikely(count >= xdr->buf->page_len || count > NFS3_MAXPATHLEN)) goto out_nametoolong; @@ -267,9 +233,6 @@ out_cheating: dprintk("NFS: server cheating in pathname result: " "count %u > recvd %u\n", count, recvd); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -303,13 +266,10 @@ static int decode_cookieverf3(struct xdr_stream *xdr, __be32 *verifier) __be32 *p; p = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; memcpy(verifier, p, NFS3_COOKIEVERFSIZE); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -330,13 +290,10 @@ static int decode_writeverf3(struct xdr_stream *xdr, struct nfs_write_verifier * __be32 *p; p = xdr_inline_decode(xdr, NFS3_WRITEVERFSIZE); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; memcpy(verifier->data, p, NFS3_WRITEVERFSIZE); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -364,13 +321,16 @@ static int decode_nfsstat3(struct xdr_stream *xdr, enum nfs_stat *status) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; + if (unlikely(*p != cpu_to_be32(NFS3_OK))) + goto out_status; + *status = 0; + return 0; +out_status: *status = be32_to_cpup(p); + trace_nfs_xdr_status((int)*status); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -453,23 +413,20 @@ static int decode_nfs_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; length = be32_to_cpup(p++); if (unlikely(length > NFS3_FHSIZE)) goto out_toobig; p = xdr_inline_decode(xdr, length); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fh->size = length; memcpy(fh->data, p, length); return 0; out_toobig: dprintk("NFS: file handle size (%u) too big\n", length); return -E2BIG; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static void zero_nfs_fh3(struct nfs_fh *fh) @@ -655,8 +612,8 @@ static int decode_fattr3(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, NFS3_fattr_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; p = xdr_decode_ftype3(p, &fmode); @@ -690,9 +647,6 @@ out_uid: out_gid: dprintk("NFS: returned invalid gid\n"); return -EINVAL; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -710,14 +664,11 @@ static int decode_post_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; if (*p != xdr_zero) return decode_fattr3(xdr, fattr); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -733,8 +684,8 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, NFS3_wcc_attr_sz << 2); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; fattr->valid |= NFS_ATTR_FATTR_PRESIZE | NFS_ATTR_FATTR_PRECHANGE @@ -747,9 +698,6 @@ static int decode_wcc_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) fattr->pre_change_attr = nfs_timespec_to_change_attr(&fattr->pre_ctime); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -773,14 +721,11 @@ static int decode_pre_op_attr(struct xdr_stream *xdr, struct nfs_fattr *fattr) __be32 *p; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; if (*p != xdr_zero) return decode_wcc_attr(xdr, fattr); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_wcc_data(struct xdr_stream *xdr, struct nfs_fattr *fattr) @@ -808,15 +753,12 @@ out: static int decode_post_op_fh3(struct xdr_stream *xdr, struct nfs_fh *fh) { __be32 *p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; if (*p != xdr_zero) return decode_nfs_fh3(xdr, fh); zero_nfs_fh3(fh); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -953,8 +895,8 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req, const struct nfs3_readlinkargs *args = data; encode_nfs_fh3(xdr, args->fh); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->pglen, NFS3_readlinkres_sz); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->pglen, NFS3_readlinkres_sz); } /* @@ -986,8 +928,8 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req, unsigned int replen = args->replen ? args->replen : NFS3_readres_sz; encode_read3args(xdr, args); - prepare_reply_buffer(req, args->pages, args->pgbase, - args->count, replen); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, replen); req->rq_rcv_buf.flags |= XDRBUF_READ; } @@ -1279,7 +1221,7 @@ static void nfs3_xdr_enc_readdir3args(struct rpc_rqst *req, const struct nfs3_readdirargs *args = data; encode_readdir3args(xdr, args); - prepare_reply_buffer(req, args->pages, 0, + rpc_prepare_reply_pages(req, args->pages, 0, args->count, NFS3_readdirres_sz); } @@ -1321,7 +1263,7 @@ static void nfs3_xdr_enc_readdirplus3args(struct rpc_rqst *req, const struct nfs3_readdirargs *args = data; encode_readdirplus3args(xdr, args); - prepare_reply_buffer(req, args->pages, 0, + rpc_prepare_reply_pages(req, args->pages, 0, args->count, NFS3_readdirres_sz); } @@ -1366,7 +1308,7 @@ static void nfs3_xdr_enc_getacl3args(struct rpc_rqst *req, encode_nfs_fh3(xdr, args->fh); encode_uint32(xdr, args->mask); if (args->mask & (NFS_ACL | NFS_DFACL)) { - prepare_reply_buffer(req, args->pages, 0, + rpc_prepare_reply_pages(req, args->pages, 0, NFSACL_MAXPAGES << PAGE_SHIFT, ACL3_getaclres_sz); req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; @@ -1643,8 +1585,8 @@ static int decode_read3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 + 4 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; count = be32_to_cpup(p++); eof = be32_to_cpup(p++); ocount = be32_to_cpup(p++); @@ -1667,9 +1609,6 @@ out_cheating: count = recvd; eof = 0; goto out; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -1690,7 +1629,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr, result->op_status = status; if (status != NFS3_OK) goto out_status; - result->replen = 3 + ((xdr_stream_pos(xdr) - pos) >> 2); + result->replen = 4 + ((xdr_stream_pos(xdr) - pos) >> 2); error = decode_read3resok(xdr, result); out: return error; @@ -1731,22 +1670,18 @@ static int decode_write3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->count = be32_to_cpup(p++); result->verf->committed = be32_to_cpup(p++); if (unlikely(result->verf->committed > NFS_FILE_SYNC)) goto out_badvalue; if (decode_writeverf3(xdr, &result->verf->verifier)) - goto out_eio; + return -EIO; return result->count; out_badvalue: dprintk("NFS: bad stable_how value: %u\n", result->verf->committed); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); -out_eio: - return -EIO; } static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -2010,12 +1945,12 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, u64 new_cookie; p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p == xdr_zero) { p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p == xdr_zero) return -EAGAIN; entry->eof = 1; @@ -2051,8 +1986,8 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, /* In fact, a post_op_fh3: */ p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EAGAIN; if (*p != xdr_zero) { error = decode_nfs_fh3(xdr, entry->fh); if (unlikely(error)) { @@ -2069,9 +2004,6 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EAGAIN; out_truncated: dprintk("NFS: directory entry contains invalid file handle\n"); *entry = old; @@ -2183,8 +2115,8 @@ static int decode_fsstat3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 8 * 6 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; p = xdr_decode_size3(p, &result->tbytes); p = xdr_decode_size3(p, &result->fbytes); p = xdr_decode_size3(p, &result->abytes); @@ -2193,9 +2125,6 @@ static int decode_fsstat3resok(struct xdr_stream *xdr, xdr_decode_size3(p, &result->afiles); /* ignore invarsec */ return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_fsstat3res(struct rpc_rqst *req, @@ -2255,8 +2184,8 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 * 7 + 8 + 8 + 4); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->rtmax = be32_to_cpup(p++); result->rtpref = be32_to_cpup(p++); result->rtmult = be32_to_cpup(p++); @@ -2270,9 +2199,6 @@ static int decode_fsinfo3resok(struct xdr_stream *xdr, /* ignore properties */ result->lease_time = 0; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_fsinfo3res(struct rpc_rqst *req, @@ -2328,15 +2254,12 @@ static int decode_pathconf3resok(struct xdr_stream *xdr, __be32 *p; p = xdr_inline_decode(xdr, 4 * 6); - if (unlikely(p == NULL)) - goto out_overflow; + if (unlikely(!p)) + return -EIO; result->max_link = be32_to_cpup(p++); result->max_namelen = be32_to_cpup(p); /* ignore remaining fields */ return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int nfs3_xdr_dec_pathconf3res(struct rpc_rqst *req, diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 19ec38f85ce0..901cca7542f9 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -20,5 +20,8 @@ loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, struct nfs42_layoutstat_data *); int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t); +int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg, + const struct nfs42_layout_error *errors, + size_t n); #endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index fed06fd9998d..ff6f85fb676b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -672,6 +672,170 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, return 0; } +static struct nfs42_layouterror_data * +nfs42_alloc_layouterror_data(struct pnfs_layout_segment *lseg, gfp_t gfp_flags) +{ + struct nfs42_layouterror_data *data; + struct inode *inode = lseg->pls_layout->plh_inode; + + data = kzalloc(sizeof(*data), gfp_flags); + if (data) { + data->args.inode = data->inode = nfs_igrab_and_active(inode); + if (data->inode) { + data->lseg = pnfs_get_lseg(lseg); + if (data->lseg) + return data; + nfs_iput_and_deactive(data->inode); + } + kfree(data); + } + return NULL; +} + +static void +nfs42_free_layouterror_data(struct nfs42_layouterror_data *data) +{ + pnfs_put_lseg(data->lseg); + nfs_iput_and_deactive(data->inode); + kfree(data); +} + +static void +nfs42_layouterror_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs42_layouterror_data *data = calldata; + struct inode *inode = data->inode; + struct nfs_server *server = NFS_SERVER(inode); + struct pnfs_layout_hdr *lo = data->lseg->pls_layout; + unsigned i; + + spin_lock(&inode->i_lock); + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + rpc_exit(task, 0); + return; + } + for (i = 0; i < data->args.num_errors; i++) + nfs4_stateid_copy(&data->args.errors[i].stateid, + &lo->plh_stateid); + spin_unlock(&inode->i_lock); + nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, + &data->res.seq_res, task); +} + +static void +nfs42_layouterror_done(struct rpc_task *task, void *calldata) +{ + struct nfs42_layouterror_data *data = calldata; + struct inode *inode = data->inode; + struct pnfs_layout_hdr *lo = data->lseg->pls_layout; + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { + case 0: + break; + case -NFS4ERR_BADHANDLE: + case -ESTALE: + pnfs_destroy_layout(NFS_I(inode)); + break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_BAD_STATEID: + spin_lock(&inode->i_lock); + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match(&data->args.errors[0].stateid, + &lo->plh_stateid)) { + LIST_HEAD(head); + + /* + * Mark the bad layout state as invalid, then retry + * with the current stateid. + */ + pnfs_mark_layout_stateid_invalid(lo, &head); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + nfs_commit_inode(inode, 0); + } else + spin_unlock(&inode->i_lock); + break; + case -NFS4ERR_OLD_STATEID: + spin_lock(&inode->i_lock); + if (pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(&data->args.errors[0].stateid, + &lo->plh_stateid)) { + /* Do we need to delay before resending? */ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, + &data->args.errors[0].stateid)) + rpc_delay(task, HZ); + rpc_restart_call_prepare(task); + } + spin_unlock(&inode->i_lock); + break; + case -ENOTSUPP: + case -EOPNOTSUPP: + NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTERROR; + } +} + +static void +nfs42_layouterror_release(void *calldata) +{ + struct nfs42_layouterror_data *data = calldata; + + nfs42_free_layouterror_data(data); +} + +static const struct rpc_call_ops nfs42_layouterror_ops = { + .rpc_call_prepare = nfs42_layouterror_prepare, + .rpc_call_done = nfs42_layouterror_done, + .rpc_release = nfs42_layouterror_release, +}; + +int nfs42_proc_layouterror(struct pnfs_layout_segment *lseg, + const struct nfs42_layout_error *errors, size_t n) +{ + struct inode *inode = lseg->pls_layout->plh_inode; + struct nfs42_layouterror_data *data; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTERROR], + }; + struct rpc_task_setup task_setup = { + .rpc_message = &msg, + .callback_ops = &nfs42_layouterror_ops, + .flags = RPC_TASK_ASYNC, + }; + unsigned int i; + + if (!nfs_server_capable(inode, NFS_CAP_LAYOUTERROR)) + return -EOPNOTSUPP; + if (n > NFS42_LAYOUTERROR_MAX) + return -EINVAL; + data = nfs42_alloc_layouterror_data(lseg, GFP_NOFS); + if (!data) + return -ENOMEM; + for (i = 0; i < n; i++) { + data->args.errors[i] = errors[i]; + data->args.num_errors++; + data->res.num_errors++; + } + msg.rpc_argp = &data->args; + msg.rpc_resp = &data->res; + task_setup.callback_data = data; + task_setup.rpc_client = NFS_SERVER(inode)->client; + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0, 0); + task = rpc_run_task(&task_setup); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} +EXPORT_SYMBOL_GPL(nfs42_proc_layouterror); + static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, struct file *dst_f, struct nfs_lock_context *src_lock, struct nfs_lock_context *dst_lock, loff_t src_offset, diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 69f72ed2bf87..aed865a84629 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -51,6 +51,15 @@ 1 /* opaque devaddr4 length */ + \ XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE)) #define decode_layoutstats_maxsz (op_decode_hdr_maxsz) +#define encode_device_error_maxsz (XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \ + 1 /* status */ + 1 /* opnum */) +#define encode_layouterror_maxsz (op_decode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + encode_stateid_maxsz + \ + 1 /* Array size */ + \ + encode_device_error_maxsz) +#define decode_layouterror_maxsz (op_decode_hdr_maxsz) #define encode_clone_maxsz (encode_stateid_maxsz + \ encode_stateid_maxsz + \ 2 /* src offset */ + \ @@ -59,43 +68,53 @@ #define decode_clone_maxsz (op_decode_hdr_maxsz) #define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_allocate_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_allocate_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_copy_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ encode_copy_maxsz + \ encode_commit_maxsz) #define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ decode_copy_maxsz + \ decode_commit_maxsz) #define NFS4_enc_offload_cancel_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_offload_cancel_maxsz) #define NFS4_dec_offload_cancel_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_offload_cancel_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_deallocate_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_deallocate_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ encode_putfh_maxsz + \ encode_seek_maxsz) #define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_seek_maxsz) #define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \ @@ -106,6 +125,16 @@ decode_sequence_maxsz + \ decode_putfh_maxsz + \ PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz) +#define NFS4_enc_layouterror_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + NFS42_LAYOUTERROR_MAX * \ + encode_layouterror_maxsz) +#define NFS4_dec_layouterror_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + NFS42_LAYOUTERROR_MAX * \ + decode_layouterror_maxsz) #define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -223,6 +252,34 @@ static void encode_clone(struct xdr_stream *xdr, xdr_encode_hyper(p, args->count); } +static void encode_device_error(struct xdr_stream *xdr, + const struct nfs42_device_error *error) +{ + __be32 *p; + + p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 2*4); + p = xdr_encode_opaque_fixed(p, error->dev_id.data, + NFS4_DEVICEID4_SIZE); + *p++ = cpu_to_be32(error->status); + *p = cpu_to_be32(error->opnum); +} + +static void encode_layouterror(struct xdr_stream *xdr, + const struct nfs42_layout_error *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LAYOUTERROR, decode_layouterror_maxsz, hdr); + p = reserve_space(xdr, 8 + 8); + p = xdr_encode_hyper(p, args->offset); + p = xdr_encode_hyper(p, args->length); + encode_nfs4_stateid(xdr, &args->stateid); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(1); + encode_device_error(xdr, &args->errors[0]); +} + /* * Encode ALLOCATE request */ @@ -381,6 +438,27 @@ static void nfs4_xdr_enc_clone(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode LAYOUTERROR request + */ +static void nfs4_xdr_enc_layouterror(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_layouterror_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + int i; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, NFS_FH(args->inode), &hdr); + for (i = 0; i < args->num_errors; i++) + encode_layouterror(xdr, &args->errors[i], &hdr); + encode_nops(&hdr); +} + static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_ALLOCATE); @@ -394,7 +472,7 @@ static int decode_write_response(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; count = be32_to_cpup(p); if (count > 1) return -EREMOTEIO; @@ -402,18 +480,14 @@ static int decode_write_response(struct xdr_stream *xdr, status = decode_opaque_fixed(xdr, &res->stateid, NFS4_STATEID_SIZE); if (unlikely(status)) - goto out_overflow; + return -EIO; } p = xdr_inline_decode(xdr, 8 + 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->count); res->verifier.committed = be32_to_cpup(p); return decode_verifier(xdr, &res->verifier.verifier); - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_copy_requirements(struct xdr_stream *xdr, @@ -422,14 +496,11 @@ static int decode_copy_requirements(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4 + 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->consecutive = be32_to_cpup(p++); res->synchronous = be32_to_cpup(p++); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_copy(struct xdr_stream *xdr, struct nfs42_copy_res *res) @@ -474,15 +545,11 @@ static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res) p = xdr_inline_decode(xdr, 4 + 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->sr_eof = be32_to_cpup(p++); p = xdr_decode_hyper(p, &res->sr_offset); return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_layoutstats(struct xdr_stream *xdr) @@ -495,6 +562,11 @@ static int decode_clone(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_CLONE); } +static int decode_layouterror(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_LAYOUTERROR); +} + /* * Decode ALLOCATE request */ @@ -704,4 +776,30 @@ out: return status; } +/* + * Decode LAYOUTERROR request + */ +static int nfs4_xdr_dec_layouterror(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_layouterror_res *res = data; + struct compound_hdr hdr; + int status, i; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + + for (i = 0; i < res->num_errors && status == 0; i++) + status = decode_layouterror(xdr); +out: + res->rpc_status = status; + return status; +} + #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */ diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 2548405da1f7..1339ede979af 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -42,7 +42,7 @@ static int nfs_get_cb_ident_idr(struct nfs_client *clp, int minorversion) } #ifdef CONFIG_NFS_V4_1 -/** +/* * Per auth flavor data server rpc clients */ struct nfs4_ds_server { @@ -51,7 +51,9 @@ struct nfs4_ds_server { }; /** - * Common lookup case for DS I/O + * nfs4_find_ds_client - Common lookup case for DS I/O + * @ds_clp: pointer to the DS's nfs_client + * @flavor: rpc auth flavour to match */ static struct nfs4_ds_server * nfs4_find_ds_client(struct nfs_client *ds_clp, rpc_authflavor_t flavor) @@ -118,9 +120,13 @@ nfs4_free_ds_server(struct nfs4_ds_server *dss) } /** -* Find or create a DS rpc client with th MDS server rpc client auth flavor -* in the nfs_client cl_ds_clients list. -*/ + * nfs4_find_or_create_ds_client - Find or create a DS rpc client + * @ds_clp: pointer to the DS's nfs_client + * @inode: pointer to the inode + * + * Find or create a DS rpc client with th MDS server rpc client auth flavor + * in the nfs_client cl_ds_clients list. + */ struct rpc_clnt * nfs4_find_or_create_ds_client(struct nfs_client *ds_clp, struct inode *inode) { @@ -145,7 +151,6 @@ static void nfs4_shutdown_ds_clients(struct nfs_client *clp) { struct nfs4_ds_server *dss; - LIST_HEAD(shutdown_list); while (!list_empty(&clp->cl_ds_clients)) { dss = list_entry(clp->cl_ds_clients.next, @@ -284,7 +289,7 @@ static int nfs4_init_callback(struct nfs_client *clp) /** * nfs40_init_client - nfs_client initialization tasks for NFSv4.0 - * @clp - nfs_client to initialize + * @clp: nfs_client to initialize * * Returns zero on success, or a negative errno if some error occurred. */ @@ -312,7 +317,7 @@ int nfs40_init_client(struct nfs_client *clp) /** * nfs41_init_client - nfs_client initialization tasks for NFSv4.1+ - * @clp - nfs_client to initialize + * @clp: nfs_client to initialize * * Returns zero on success, or a negative errno if some error occurred. */ @@ -360,9 +365,7 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) * nfs4_init_client - Initialise an NFS4 client record * * @clp: nfs_client to initialise - * @timeparms: timeout parameters for underlying RPC transport - * @ip_addr: callback IP address in presentation format - * @authflavor: authentication flavor for underlying RPC transport + * @cl_init: pointer to nfs_client_initdata * * Returns pointer to an NFS client, or an ERR_PTR value. */ @@ -649,13 +652,13 @@ nfs4_check_server_scope(struct nfs41_server_scope *s1, /** * nfs4_detect_session_trunking - Checks for session trunking. - * - * Called after a successful EXCHANGE_ID on a multi-addr connection. - * Upon success, add the transport. - * * @clp: original mount nfs_client * @res: result structure from an exchange_id using the original mount * nfs_client with a new multi_addr transport + * @xprt: pointer to the transport to add. + * + * Called after a successful EXCHANGE_ID on a multi-addr connection. + * Upon success, add the transport. * * Returns zero on success, otherwise -EINVAL * diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 24f06dcc2b08..2e460c33ae48 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -137,6 +137,7 @@ static size_t nfs_parse_server_name(char *string, size_t len, /** * nfs_find_best_sec - Find a security mechanism supported locally + * @clnt: pointer to rpc_clnt * @server: NFS server struct * @flavors: List of security tuples returned by SECINFO procedure * @@ -288,8 +289,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, /** * nfs_follow_referral - set up mountpoint when hitting a referral on moved error - * @dentry - parent directory - * @locations - array of NFSv4 server location information + * @dentry: parent directory + * @locations: array of NFSv4 server location information * */ static struct vfsmount *nfs_follow_referral(struct dentry *dentry, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 557a5d636183..4dbb0ee23432 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -730,33 +730,41 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) res->sr_slot = NULL; } +static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot, + u32 seqnr) +{ + if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0) + slot->seq_nr_highest_sent = seqnr; +} +static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, + u32 seqnr) +{ + slot->seq_nr_highest_sent = seqnr; + slot->seq_nr_last_acked = seqnr; +} + static int nfs41_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) { struct nfs4_session *session; struct nfs4_slot *slot = res->sr_slot; struct nfs_client *clp; - bool interrupted = false; int ret = 1; if (slot == NULL) goto out_noaction; /* don't increment the sequence number if the task wasn't sent */ - if (!RPC_WAS_SENT(task)) + if (!RPC_WAS_SENT(task) || slot->seq_done) goto out; session = slot->table->session; - if (slot->interrupted) { - if (res->sr_status != -NFS4ERR_DELAY) - slot->interrupted = 0; - interrupted = true; - } - trace_nfs4_sequence_done(session, res); /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: + /* Mark this sequence number as having been acked */ + nfs4_slot_sequence_acked(slot, slot->seq_nr); /* Update the slot's sequence and clientid lease timer */ slot->seq_done = 1; clp = session->clp; @@ -771,9 +779,9 @@ static int nfs41_sequence_process(struct rpc_task *task, * sr_status remains 1 if an RPC level error occurred. * The server may or may not have processed the sequence * operation.. - * Mark the slot as having hosted an interrupted RPC call. */ - slot->interrupted = 1; + nfs4_slot_sequence_record_sent(slot, slot->seq_nr); + slot->seq_done = 1; goto out; case -NFS4ERR_DELAY: /* The server detected a resend of the RPC call and @@ -784,6 +792,7 @@ static int nfs41_sequence_process(struct rpc_task *task, __func__, slot->slot_nr, slot->seq_nr); + nfs4_slot_sequence_acked(slot, slot->seq_nr); goto out_retry; case -NFS4ERR_RETRY_UNCACHED_REP: case -NFS4ERR_SEQ_FALSE_RETRY: @@ -791,6 +800,7 @@ static int nfs41_sequence_process(struct rpc_task *task, * The server thinks we tried to replay a request. * Retry the call after bumping the sequence ID. */ + nfs4_slot_sequence_acked(slot, slot->seq_nr); goto retry_new_seq; case -NFS4ERR_BADSLOT: /* @@ -801,21 +811,28 @@ static int nfs41_sequence_process(struct rpc_task *task, goto session_recover; goto retry_nowait; case -NFS4ERR_SEQ_MISORDERED: + nfs4_slot_sequence_record_sent(slot, slot->seq_nr); /* - * Was the last operation on this sequence interrupted? - * If so, retry after bumping the sequence number. - */ - if (interrupted) - goto retry_new_seq; - /* - * Could this slot have been previously retired? - * If so, then the server may be expecting seq_nr = 1! + * Were one or more calls using this slot interrupted? + * If the server never received the request, then our + * transmitted slot sequence number may be too high. */ - if (slot->seq_nr != 1) { - slot->seq_nr = 1; + if ((s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1) { + slot->seq_nr--; goto retry_nowait; } - goto session_recover; + /* + * RFC5661: + * A retry might be sent while the original request is + * still in progress on the replier. The replier SHOULD + * deal with the issue by returning NFS4ERR_DELAY as the + * reply to SEQUENCE or CB_SEQUENCE operation, but + * implementations MAY return NFS4ERR_SEQ_MISORDERED. + * + * Restart the search after a delay. + */ + slot->seq_nr = slot->seq_nr_highest_sent; + goto out_retry; default: /* Just update the slot sequence no. */ slot->seq_done = 1; @@ -906,17 +923,6 @@ static const struct rpc_call_ops nfs41_call_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; -static void -nfs4_sequence_process_interrupted(struct nfs_client *client, - struct nfs4_slot *slot, const struct cred *cred) -{ - struct rpc_task *task; - - task = _nfs41_proc_sequence(client, cred, slot, true); - if (!IS_ERR(task)) - rpc_put_task_async(task); -} - #else /* !CONFIG_NFS_V4_1 */ static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) @@ -937,16 +943,15 @@ int nfs4_sequence_done(struct rpc_task *task, } EXPORT_SYMBOL_GPL(nfs4_sequence_done); -static void -nfs4_sequence_process_interrupted(struct nfs_client *client, - struct nfs4_slot *slot, const struct cred *cred) +#endif /* !CONFIG_NFS_V4_1 */ + +static void nfs41_sequence_res_init(struct nfs4_sequence_res *res) { - WARN_ON_ONCE(1); - slot->interrupted = 0; + res->sr_timestamp = jiffies; + res->sr_status_flags = 0; + res->sr_status = 1; } -#endif /* !CONFIG_NFS_V4_1 */ - static void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -958,10 +963,6 @@ void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, args->sa_slot = slot; res->sr_slot = slot; - res->sr_timestamp = jiffies; - res->sr_status_flags = 0; - res->sr_status = 1; - } int nfs4_setup_sequence(struct nfs_client *client, @@ -982,31 +983,25 @@ int nfs4_setup_sequence(struct nfs_client *client, task->tk_timeout = 0; } - for (;;) { - spin_lock(&tbl->slot_tbl_lock); - /* The state manager will wait until the slot table is empty */ - if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) - goto out_sleep; - - slot = nfs4_alloc_slot(tbl); - if (IS_ERR(slot)) { - /* Try again in 1/4 second */ - if (slot == ERR_PTR(-ENOMEM)) - task->tk_timeout = HZ >> 2; - goto out_sleep; - } - spin_unlock(&tbl->slot_tbl_lock); + spin_lock(&tbl->slot_tbl_lock); + /* The state manager will wait until the slot table is empty */ + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) + goto out_sleep; - if (likely(!slot->interrupted)) - break; - nfs4_sequence_process_interrupted(client, - slot, task->tk_msg.rpc_cred); + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + /* Try again in 1/4 second */ + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; + goto out_sleep; } + spin_unlock(&tbl->slot_tbl_lock); nfs4_sequence_attach_slot(args, res, slot); trace_nfs4_setup_sequence(session, args); out_start: + nfs41_sequence_res_init(res); rpc_call_start(task); return 0; @@ -1555,6 +1550,10 @@ static void nfs_clear_open_stateid(struct nfs4_state *state, static void nfs_set_open_stateid_locked(struct nfs4_state *state, const nfs4_stateid *stateid, nfs4_stateid *freeme) + __must_hold(&state->owner->so_lock) + __must_hold(&state->seqlock) + __must_hold(RCU) + { DEFINE_WAIT(wait); int status = 0; @@ -5963,7 +5962,7 @@ out: /** * nfs4_proc_setclientid_confirm - Confirm client ID * @clp: state data structure - * @res: result of a previous SETCLIENTID + * @arg: result of a previous SETCLIENTID * @cred: credential to use for this call * * Returns zero, a negative errno, or a negative NFS4ERR status code. @@ -7527,7 +7526,7 @@ int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred) return status; } -/** +/* * If 'use_integrity' is true and the state managment nfs_client * cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient * and the machine credential as per RFC3530bis and RFC5661 Security @@ -8937,10 +8936,12 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) if (status != 0) goto out; - /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ - if (task->tk_status < 0 || lgp->res.layoutp->len == 0) { + if (task->tk_status < 0) { status = nfs4_layoutget_handle_exception(task, lgp, &exception); *timeout = exception.timeout; + } else if (lgp->res.layoutp->len == 0) { + status = -EAGAIN; + *timeout = nfs4_update_delay(&exception.timeout); } else lseg = pnfs_layout_process(lgp); out: @@ -9219,7 +9220,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) return status; } -/** +/* * Use the state managment nfs_client cl_rpcclient, which uses krb5i (if * possible) as per RFC3530bis and RFC5661 Security Considerations sections */ @@ -9484,7 +9485,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = { * @server: server / transport on which to perform the operation * @stateid: state ID to release * @cred: credential - * @is_recovery: set to true if this call needs to be privileged + * @privileged: set to true if this call needs to be privileged * * Note: this function is always asynchronous. */ @@ -9691,7 +9692,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_DEALLOCATE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS - | NFS_CAP_CLONE, + | NFS_CAP_CLONE + | NFS_CAP_LAYOUTERROR, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index a5489d70a724..bcb532def9e2 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -55,7 +55,7 @@ static void nfs4_shrink_slot_table(struct nfs4_slot_table *tbl, u32 newsize) /** * nfs4_slot_tbl_drain_complete - wake waiters when drain is complete - * @tbl - controlling slot table + * @tbl: controlling slot table * */ void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl) @@ -110,6 +110,8 @@ static struct nfs4_slot *nfs4_new_slot(struct nfs4_slot_table *tbl, slot->table = tbl; slot->slot_nr = slotid; slot->seq_nr = seq_init; + slot->seq_nr_highest_sent = seq_init; + slot->seq_nr_last_acked = seq_init - 1; } return slot; } @@ -276,7 +278,8 @@ static void nfs4_reset_slot_table(struct nfs4_slot_table *tbl, p = &tbl->slots; while (*p) { (*p)->seq_nr = ivalue; - (*p)->interrupted = 0; + (*p)->seq_nr_highest_sent = ivalue; + (*p)->seq_nr_last_acked = ivalue - 1; p = &(*p)->next; } tbl->highest_used_slotid = NFS4_NO_SLOT; diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 3c550f297561..b996ee23f1ba 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -10,7 +10,7 @@ /* maximum number of slots to use */ #define NFS4_DEF_SLOT_TABLE_SIZE (64U) -#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U) +#define NFS4_DEF_CB_SLOT_TABLE_SIZE (16U) #define NFS4_MAX_SLOT_TABLE (1024U) #define NFS4_NO_SLOT ((u32)-1) @@ -23,8 +23,9 @@ struct nfs4_slot { unsigned long generation; u32 slot_nr; u32 seq_nr; - unsigned int interrupted : 1, - privileged : 1, + u32 seq_nr_last_acked; + u32 seq_nr_highest_sent; + unsigned int privileged : 1, seq_done : 1; }; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 02488b50534a..3de36479ed7a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -563,6 +563,7 @@ static void nfs4_gc_state_owners(struct nfs_server *server) * nfs4_get_state_owner - Look up a state owner given a credential * @server: nfs_server to search * @cred: RPC credential to match + * @gfp_flags: allocation mode * * Returns a pointer to an instantiated nfs4_state_owner struct, or NULL. */ diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index b4557cf685fb..cd1a5c08da9a 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -524,6 +524,31 @@ TRACE_EVENT(nfs4_setup_sequence, ) ); +TRACE_EVENT(nfs4_xdr_status, + TP_PROTO( + u32 op, + int error + ), + + TP_ARGS(op, error), + + TP_STRUCT__entry( + __field(u32, op) + __field(int, error) + ), + + TP_fast_assign( + __entry->op = op; + __entry->error = -error; + ), + + TP_printk( + "operation %d: nfs status %d (%s)", + __entry->op, + __entry->error, show_nfsv4_errors(__entry->error) + ) +); + DECLARE_EVENT_CLASS(nfs4_open_event, TP_PROTO( const struct nfs_open_context *ctx, diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 2fc8f6fa25e4..cfcabc33e24d 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -54,6 +54,7 @@ #include <linux/nfs_fs.h> #include "nfs4_fs.h" +#include "nfs4trace.h" #include "internal.h" #include "nfs4idmap.h" #include "nfs4session.h" @@ -214,14 +215,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, nfs4_fattr_bitmap_maxsz) #define encode_read_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) -#define decode_read_maxsz (op_decode_hdr_maxsz + 2) +#define decode_read_maxsz (op_decode_hdr_maxsz + 2 + 1) #define encode_readdir_maxsz (op_encode_hdr_maxsz + \ 2 + encode_verifier_maxsz + 5 + \ nfs4_label_maxsz) #define decode_readdir_maxsz (op_decode_hdr_maxsz + \ - decode_verifier_maxsz) + decode_verifier_maxsz + 1) #define encode_readlink_maxsz (op_encode_hdr_maxsz) -#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1) +#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1 + 1) #define encode_write_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 4) #define decode_write_maxsz (op_decode_hdr_maxsz + \ @@ -283,14 +284,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, #define decode_delegreturn_maxsz (op_decode_hdr_maxsz) #define encode_getacl_maxsz (encode_getattr_maxsz) #define decode_getacl_maxsz (op_decode_hdr_maxsz + \ - nfs4_fattr_bitmap_maxsz + 1) + nfs4_fattr_bitmap_maxsz + 1 + 1) #define encode_setacl_maxsz (op_encode_hdr_maxsz + \ encode_stateid_maxsz + 3) #define decode_setacl_maxsz (decode_setattr_maxsz) #define encode_fs_locations_maxsz \ (encode_getattr_maxsz) #define decode_fs_locations_maxsz \ - (0) + (1) #define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz) #define decode_secinfo_maxsz (op_decode_hdr_maxsz + 1 + ((NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)) / 4)) @@ -391,12 +392,13 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, 1 /* opaque devaddr4 length */ + \ /* devaddr4 payload is read into page */ \ 1 /* notification bitmap length */ + \ - 1 /* notification bitmap, word 0 */) + 1 /* notification bitmap, word 0 */ + \ + 1 /* possible XDR padding */) #define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ encode_stateid_maxsz) #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ decode_stateid_maxsz + \ - XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) + XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE) + 1) #define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \ 2 /* offset */ + \ 2 /* length */ + \ @@ -1015,12 +1017,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - struct rpc_auth *auth = req->rq_cred->cr_auth; /* initialize running count of expected bytes in reply. * NOTE: the replied tag SHOULD be the same is the one sent, * but this is not required as a MUST for the server to do so. */ - hdr->replen = RPC_REPHDRSIZE + auth->au_rslack + 3 + hdr->taglen; + hdr->replen = 3 + hdr->taglen; WARN_ON_ONCE(hdr->taglen > NFS4_MAXTAGLEN); encode_string(xdr, hdr->taglen, hdr->tag); @@ -2340,9 +2341,9 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); if (args->lg_args) { encode_layoutget(xdr, args->lg_args, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, - args->lg_args->layout.pages, - 0, args->lg_args->layout.pglen); + rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0, + args->lg_args->layout.pglen, + hdr.replen); } encode_nops(&hdr); } @@ -2386,9 +2387,9 @@ static void nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); if (args->lg_args) { encode_layoutget(xdr, args->lg_args, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, - args->lg_args->layout.pages, - 0, args->lg_args->layout.pglen); + rpc_prepare_reply_pages(req, args->lg_args->layout.pages, 0, + args->lg_args->layout.pglen, + hdr.replen); } encode_nops(&hdr); } @@ -2498,8 +2499,8 @@ static void nfs4_xdr_enc_readlink(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_readlink(xdr, args, req, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, - args->pgbase, args->pglen); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->pglen, hdr.replen); encode_nops(&hdr); } @@ -2519,11 +2520,8 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_readdir(xdr, args, req, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, args->pages, - args->pgbase, args->count); - dprintk("%s: inlined page args = (%u, %p, %u, %u)\n", - __func__, hdr.replen << 2, args->pages, - args->pgbase, args->count); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, hdr.replen); encode_nops(&hdr); } @@ -2543,8 +2541,8 @@ static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_read(xdr, args, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, - args->pages, args->pgbase, args->count); + rpc_prepare_reply_pages(req, args->pages, args->pgbase, + args->count, hdr.replen); req->rq_rcv_buf.flags |= XDRBUF_READ; encode_nops(&hdr); } @@ -2590,9 +2588,8 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr, encode_getattr(xdr, nfs4_acl_bitmap, NULL, ARRAY_SIZE(nfs4_acl_bitmap), &hdr); - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, - args->acl_pages, 0, args->acl_len); - + rpc_prepare_reply_pages(req, args->acl_pages, 0, + args->acl_len, replen); encode_nops(&hdr); } @@ -2813,9 +2810,8 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, encode_fs_locations(xdr, args->bitmask, &hdr); } - /* Set up reply kvec to capture returned fs_locations array. */ - xdr_inline_pages(&req->rq_rcv_buf, replen << 2, - (struct page **)&args->page, 0, PAGE_SIZE); + rpc_prepare_reply_pages(req, (struct page **)&args->page, 0, + PAGE_SIZE, replen); encode_nops(&hdr); } @@ -3017,10 +3013,8 @@ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, /* set up reply kvec. Subtract notification bitmap max size (2) * so that notification bitmap is put in xdr_buf tail */ - xdr_inline_pages(&req->rq_rcv_buf, (hdr.replen - 2) << 2, - args->pdev->pages, args->pdev->pgbase, - args->pdev->pglen); - + rpc_prepare_reply_pages(req, args->pdev->pages, args->pdev->pgbase, + args->pdev->pglen, hdr.replen - 2); encode_nops(&hdr); } @@ -3041,9 +3035,8 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req, encode_putfh(xdr, NFS_FH(args->inode), &hdr); encode_layoutget(xdr, args, &hdr); - xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2, - args->layout.pages, 0, args->layout.pglen); - + rpc_prepare_reply_pages(req, args->layout.pages, 0, + args->layout.pglen, hdr.replen); encode_nops(&hdr); } @@ -3144,22 +3137,12 @@ static void nfs4_xdr_enc_free_stateid(struct rpc_rqst *req, } #endif /* CONFIG_NFS_V4_1 */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("nfs: %s: prematurely hit end of receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) { ssize_t ret = xdr_stream_decode_opaque_inline(xdr, (void **)string, NFS4_OPAQUE_LIMIT); - if (unlikely(ret < 0)) { - if (ret == -EBADMSG) - print_overflow_msg(__func__, xdr); + if (unlikely(ret < 0)) return -EIO; - } *len = ret; return 0; } @@ -3170,22 +3153,19 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; hdr->status = be32_to_cpup(p++); hdr->taglen = be32_to_cpup(p); p = xdr_inline_decode(xdr, hdr->taglen + 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; hdr->tag = (char *)p; p += XDR_QUADLEN(hdr->taglen); hdr->nops = be32_to_cpup(p); if (unlikely(hdr->nops < 1)) return nfs4_stat_to_errno(hdr->status); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, @@ -3201,11 +3181,14 @@ static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected, opnum = be32_to_cpup(p++); if (unlikely(opnum != expected)) goto out_bad_operation; + if (unlikely(*p != cpu_to_be32(NFS_OK))) + goto out_status; + *nfs_retval = 0; + return true; +out_status: nfserr = be32_to_cpup(p); - if (nfserr == NFS_OK) - *nfs_retval = 0; - else - *nfs_retval = nfs4_stat_to_errno(nfserr); + trace_nfs4_xdr_status(opnum, nfserr); + *nfs_retval = nfs4_stat_to_errno(nfserr); return true; out_bad_operation: dprintk("nfs: Server returned operation" @@ -3214,7 +3197,6 @@ out_bad_operation: *nfs_retval = -EREMOTEIO; return false; out_overflow: - print_overflow_msg(__func__, xdr); *nfs_retval = -EIO; return false; } @@ -3235,10 +3217,9 @@ static int decode_ace(struct xdr_stream *xdr, void *ace) char *str; p = xdr_inline_decode(xdr, 12); - if (likely(p)) - return decode_opaque_inline(xdr, &strlen, &str); - print_overflow_msg(__func__, xdr); - return -EIO; + if (unlikely(!p)) + return -EIO; + return decode_opaque_inline(xdr, &strlen, &str); } static ssize_t @@ -3249,10 +3230,9 @@ decode_bitmap4(struct xdr_stream *xdr, uint32_t *bitmap, size_t sz) ret = xdr_stream_decode_uint32_array(xdr, bitmap, sz); if (likely(ret >= 0)) return ret; - if (ret == -EMSGSIZE) - return sz; - print_overflow_msg(__func__, xdr); - return -EIO; + if (ret != -EMSGSIZE) + return -EIO; + return sz; } static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) @@ -3268,13 +3248,10 @@ static int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, unsigne p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *attrlen = be32_to_cpup(p); *savep = xdr_stream_pos(xdr); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) @@ -3303,7 +3280,7 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *type = be32_to_cpup(p); if (*type < NF4REG || *type > NF4NAMEDATTR) { dprintk("%s: bad type %d\n", __func__, *type); @@ -3314,9 +3291,6 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * } dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fh_expire_type(struct xdr_stream *xdr, @@ -3330,15 +3304,12 @@ static int decode_attr_fh_expire_type(struct xdr_stream *xdr, if (likely(bitmap[0] & FATTR4_WORD0_FH_EXPIRE_TYPE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *type = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_FH_EXPIRE_TYPE; } dprintk("%s: expire type=0x%x\n", __func__, *type); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) @@ -3352,7 +3323,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; ret = NFS_ATTR_FATTR_CHANGE; @@ -3360,9 +3331,6 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) @@ -3376,16 +3344,13 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, size); bitmap[0] &= ~FATTR4_WORD0_SIZE; ret = NFS_ATTR_FATTR_SIZE; } dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3398,15 +3363,12 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; } dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3419,15 +3381,12 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; } dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) @@ -3442,7 +3401,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { p = xdr_inline_decode(xdr, 16); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &fsid->major); xdr_decode_hyper(p, &fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; @@ -3452,9 +3411,6 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs (unsigned long long)fsid->major, (unsigned long long)fsid->minor); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3467,15 +3423,12 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; } dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t *res) @@ -3487,14 +3440,11 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap, int32_t * if (likely(bitmap[0] & FATTR4_WORD0_RDATTR_ERROR)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; *res = -be32_to_cpup(p); } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_exclcreat_supported(struct xdr_stream *xdr, @@ -3526,13 +3476,13 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru if (likely(bitmap[0] & FATTR4_WORD0_FILEHANDLE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); if (len > NFS4_FHSIZE) return -EIO; p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; if (fh != NULL) { memcpy(fh->data, p, len); fh->size = len; @@ -3540,9 +3490,6 @@ static int decode_attr_filehandle(struct xdr_stream *xdr, uint32_t *bitmap, stru bitmap[0] &= ~FATTR4_WORD0_FILEHANDLE; } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3555,15 +3502,12 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; } dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -3577,16 +3521,13 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -3600,16 +3541,13 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; ret = NFS_ATTR_FATTR_MOUNTED_ON_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3623,15 +3561,12 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; } dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3645,15 +3580,12 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; } dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3667,15 +3599,12 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; } dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) @@ -3686,7 +3615,7 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; n = be32_to_cpup(p); if (n == 0) goto root_path; @@ -3718,9 +3647,6 @@ out_eio: dprintk(" status %d", status); status = -EIO; goto out; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) @@ -3745,7 +3671,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st goto out; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + goto out_eio; n = be32_to_cpup(p); if (n <= 0) goto out_eio; @@ -3758,7 +3684,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st loc = &res->locations[res->nlocations]; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + goto out_eio; m = be32_to_cpup(p); dprintk("%s: servers:\n", __func__); @@ -3796,8 +3722,6 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st out: dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; -out_overflow: - print_overflow_msg(__func__, xdr); out_eio: status = -EIO; goto out; @@ -3814,15 +3738,12 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; } dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) @@ -3836,15 +3757,12 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *maxlink = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXLINK; } dprintk("%s: maxlink=%u\n", __func__, *maxlink); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) @@ -3858,15 +3776,12 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *maxname = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXNAME; } dprintk("%s: maxname=%u\n", __func__, *maxname); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3881,7 +3796,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ uint64_t maxread; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, &maxread); if (maxread > 0x7FFFFFFF) maxread = 0x7FFFFFFF; @@ -3890,9 +3805,6 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ } dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3907,7 +3819,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 uint64_t maxwrite; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, &maxwrite); if (maxwrite > 0x7FFFFFFF) maxwrite = 0x7FFFFFFF; @@ -3916,9 +3828,6 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 } dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) @@ -3933,7 +3842,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; tmp = be32_to_cpup(p); *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; @@ -3941,9 +3850,6 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) @@ -3957,16 +3863,13 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; *nlink = be32_to_cpup(p); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; ret = NFS_ATTR_FATTR_NLINK; } dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static ssize_t decode_nfs4_string(struct xdr_stream *xdr, @@ -4011,10 +3914,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, return NFS_ATTR_FATTR_OWNER; } out: - if (len != -EBADMSG) - return 0; - print_overflow_msg(__func__, xdr); - return -EIO; + if (len == -EBADMSG) + return -EIO; + return 0; } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, @@ -4046,10 +3948,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, return NFS_ATTR_FATTR_GROUP; } out: - if (len != -EBADMSG) - return 0; - print_overflow_msg(__func__, xdr); - return -EIO; + if (len == -EBADMSG) + return -EIO; + return 0; } static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) @@ -4066,7 +3967,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; major = be32_to_cpup(p++); minor = be32_to_cpup(p); tmp = MKDEV(major, minor); @@ -4077,9 +3978,6 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde } dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -4093,15 +3991,12 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; } dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -4115,15 +4010,12 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; } dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -4137,15 +4029,12 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; } dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) @@ -4159,7 +4048,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; ret = NFS_ATTR_FATTR_SPACE_USED; @@ -4167,9 +4056,6 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); return ret; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static __be32 * @@ -4189,12 +4075,9 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) p = xdr_inline_decode(xdr, nfstime4_maxsz << 2); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_nfstime4(p, time); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) @@ -4265,19 +4148,19 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, if (likely(bitmap[2] & FATTR4_WORD2_SECURITY_LABEL)) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; lfs = be32_to_cpup(p++); p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; pi = be32_to_cpup(p++); p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p++); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; if (len < NFS4_MAXLABELLEN) { if (label) { memcpy(label->label, p, len); @@ -4295,10 +4178,6 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap, dprintk("%s: label=%s, len=%d, PI=%d, LFS=%d\n", __func__, (char *)label->label, label->len, label->pi, label->lfs); return status; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) @@ -4342,14 +4221,11 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c p = xdr_inline_decode(xdr, 20); if (unlikely(!p)) - goto out_overflow; + return -EIO; cinfo->atomic = be32_to_cpup(p++); p = xdr_decode_hyper(p, &cinfo->before); xdr_decode_hyper(p, &cinfo->after); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access) @@ -4363,24 +4239,19 @@ static int decode_access(struct xdr_stream *xdr, u32 *supported, u32 *access) return status; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; supp = be32_to_cpup(p++); acc = be32_to_cpup(p); *supported = supp; *access = acc; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) { ssize_t ret = xdr_stream_decode_opaque_fixed(xdr, buf, len); - if (unlikely(ret < 0)) { - print_overflow_msg(__func__, xdr); + if (unlikely(ret < 0)) return -EIO; - } return 0; } @@ -4460,13 +4331,11 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) return status; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; bmlen = be32_to_cpup(p); p = xdr_inline_decode(xdr, bmlen << 2); if (likely(p)) return 0; -out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -4574,13 +4443,10 @@ static int decode_threshold_hint(struct xdr_stream *xdr, if (likely(bitmap[0] & hint_bit)) { p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, res); } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_first_threshold_item4(struct xdr_stream *xdr, @@ -4593,10 +4459,8 @@ static int decode_first_threshold_item4(struct xdr_stream *xdr, /* layout type */ p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) { - print_overflow_msg(__func__, xdr); + if (unlikely(!p)) return -EIO; - } res->l_type = be32_to_cpup(p); /* thi_hintset bitmap */ @@ -4654,7 +4518,7 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr, return -EREMOTEIO; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; num = be32_to_cpup(p); if (num == 0) return 0; @@ -4667,9 +4531,6 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr, bitmap[2] &= ~FATTR4_WORD2_MDSTHRESHOLD; } return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, @@ -4857,7 +4718,7 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; fsinfo->nlayouttypes = be32_to_cpup(p); /* pNFS is not supported by the underlying file system */ @@ -4867,7 +4728,7 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr, /* Decode and set first layout type, move xdr->p past unused types */ p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; /* If we get too many, then just cap it at the max */ if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) { @@ -4879,9 +4740,6 @@ static int decode_pnfs_layout_types(struct xdr_stream *xdr, for(i = 0; i < fsinfo->nlayouttypes; ++i) fsinfo->layouttype[i] = be32_to_cpup(p++); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } /* @@ -4915,10 +4773,8 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, *res = 0; if (bitmap[2] & FATTR4_WORD2_LAYOUT_BLKSIZE) { p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) { - print_overflow_msg(__func__, xdr); + if (unlikely(!p)) return -EIO; - } *res = be32_to_cpup(p); bitmap[2] &= ~FATTR4_WORD2_LAYOUT_BLKSIZE; } @@ -4937,10 +4793,8 @@ static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap, *res = 0; if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) { p = xdr_inline_decode(xdr, 4); - if (unlikely(!p)) { - print_overflow_msg(__func__, xdr); + if (unlikely(!p)) return -EIO; - } *res = be32_to_cpup(p); bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE; } @@ -5016,19 +4870,16 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); if (len > NFS4_FHSIZE) return -EIO; fh->size = len; p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; memcpy(fh->data, p, len); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -5052,7 +4903,7 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) p = xdr_inline_decode(xdr, 32); /* read 32 bytes */ if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &offset); /* read 2 8-byte long words */ p = xdr_decode_hyper(p, &length); type = be32_to_cpup(p++); /* 4 byte read */ @@ -5069,11 +4920,9 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */ namelen = be32_to_cpup(p); /* read 4 bytes */ /* have read all 32 bytes now */ p = xdr_inline_decode(xdr, namelen); /* variable size field */ - if (likely(p)) - return -NFS4ERR_DENIED; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; + if (likely(!p)) + return -EIO; + return -NFS4ERR_DENIED; } static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) @@ -5142,7 +4991,7 @@ static int decode_space_limit(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EIO; limit_type = be32_to_cpup(p++); switch (limit_type) { case NFS4_LIMIT_SIZE: @@ -5156,9 +5005,6 @@ static int decode_space_limit(struct xdr_stream *xdr, maxsize >>= PAGE_SHIFT; *pagemod_limit = min_t(u64, maxsize, ULONG_MAX); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_rw_delegation(struct xdr_stream *xdr, @@ -5173,7 +5019,7 @@ static int decode_rw_delegation(struct xdr_stream *xdr, return status; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->do_recall = be32_to_cpup(p); switch (delegation_type) { @@ -5186,9 +5032,6 @@ static int decode_rw_delegation(struct xdr_stream *xdr, return -EIO; } return decode_ace(xdr, NULL); -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res) @@ -5198,7 +5041,7 @@ static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; why_no_delegation = be32_to_cpup(p); switch (why_no_delegation) { case WND4_CONTENTION: @@ -5207,9 +5050,6 @@ static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res) /* Ignore for now */ } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) @@ -5219,7 +5059,7 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; delegation_type = be32_to_cpup(p); res->delegation_type = 0; switch (delegation_type) { @@ -5232,9 +5072,6 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) return decode_no_delegation(xdr, res); } return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) @@ -5256,7 +5093,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->rflags = be32_to_cpup(p++); bmlen = be32_to_cpup(p); if (bmlen > 10) @@ -5264,7 +5101,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) p = xdr_inline_decode(xdr, bmlen << 2); if (unlikely(!p)) - goto out_overflow; + return -EIO; savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); for (i = 0; i < savewords; ++i) res->attrset[i] = be32_to_cpup(p++); @@ -5275,9 +5112,6 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) xdr_error: dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); return -EIO; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) @@ -5326,7 +5160,7 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, return status; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; eof = be32_to_cpup(p++); count = be32_to_cpup(p); recvd = xdr_read_pages(xdr, count); @@ -5339,9 +5173,6 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, res->eof = eof; res->count = count; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) @@ -5374,7 +5205,7 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) /* Convert length of symlink */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); if (len >= rcvbuf->page_len || len <= 0) { dprintk("nfs: server returned giant symlink!\n"); @@ -5395,9 +5226,6 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) */ xdr_terminate_string(rcvbuf, len); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -5500,7 +5328,6 @@ static int decode_setattr(struct xdr_stream *xdr) return status; if (decode_bitmap4(xdr, NULL, 0) >= 0) return 0; - print_overflow_msg(__func__, xdr); return -EIO; } @@ -5512,7 +5339,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; opnum = be32_to_cpup(p++); if (opnum != OP_SETCLIENTID) { dprintk("nfs: decode_setclientid: Server returned operation" @@ -5523,7 +5350,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re if (nfserr == NFS_OK) { p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->clientid); memcpy(res->confirm.data, p, NFS4_VERIFIER_SIZE); } else if (nfserr == NFSERR_CLID_INUSE) { @@ -5532,28 +5359,25 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs4_setclientid_re /* skip netid string */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; /* skip uaddr string */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); p = xdr_inline_decode(xdr, len); if (unlikely(!p)) - goto out_overflow; + return -EIO; return -NFSERR_CLID_INUSE; } else return nfs4_stat_to_errno(nfserr); return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_setclientid_confirm(struct xdr_stream *xdr) @@ -5572,13 +5396,10 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res) p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->count = be32_to_cpup(p++); res->verf->committed = be32_to_cpup(p++); return decode_write_verifier(xdr, &res->verf->verifier); -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_delegreturn(struct xdr_stream *xdr) @@ -5594,30 +5415,24 @@ static int decode_secinfo_gss(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; oid_len = be32_to_cpup(p); if (oid_len > GSS_OID_MAX_LEN) - goto out_err; + return -EINVAL; p = xdr_inline_decode(xdr, oid_len); if (unlikely(!p)) - goto out_overflow; + return -EIO; memcpy(flavor->flavor_info.oid.data, p, oid_len); flavor->flavor_info.oid.len = oid_len; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; flavor->flavor_info.qop = be32_to_cpup(p++); flavor->flavor_info.service = be32_to_cpup(p); return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; -out_err: - return -EINVAL; } static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) @@ -5629,7 +5444,7 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->flavors->num_flavors = 0; num_flavors = be32_to_cpup(p); @@ -5641,7 +5456,7 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; sec_flavor->flavor = be32_to_cpup(p); if (sec_flavor->flavor == RPC_AUTH_GSS) { @@ -5655,9 +5470,6 @@ static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res status = 0; out: return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) @@ -5711,11 +5523,11 @@ static int decode_exchange_id(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; xdr_decode_hyper(p, &res->clientid); p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->seqid = be32_to_cpup(p++); res->flags = be32_to_cpup(p++); @@ -5739,7 +5551,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, /* server_owner4.so_minor_id */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->server_owner->minor_id); /* server_owner4.so_major_id */ @@ -5759,7 +5571,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, /* Implementation Id */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; impl_id_count = be32_to_cpup(p++); if (impl_id_count) { @@ -5778,16 +5590,13 @@ static int decode_exchange_id(struct xdr_stream *xdr, /* nii_date */ p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EIO; p = xdr_decode_hyper(p, &res->impl_id->date.seconds); res->impl_id->date.nseconds = be32_to_cpup(p); /* if there's more than one entry, ignore the rest */ } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_chan_attrs(struct xdr_stream *xdr, @@ -5798,7 +5607,7 @@ static int decode_chan_attrs(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 28); if (unlikely(!p)) - goto out_overflow; + return -EIO; val = be32_to_cpup(p++); /* headerpadsz */ if (val) return -EINVAL; /* no support for header padding yet */ @@ -5816,12 +5625,9 @@ static int decode_chan_attrs(struct xdr_stream *xdr, if (nr_attrs == 1) { p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */ if (unlikely(!p)) - goto out_overflow; + return -EIO; } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) @@ -5844,7 +5650,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr, /* dir flags, rdma mode bool */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->dir = be32_to_cpup(p++); if (res->dir == 0 || res->dir > NFS4_CDFS4_BOTH) @@ -5855,9 +5661,6 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr, res->use_conn_in_rdma_mode = true; return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_create_session(struct xdr_stream *xdr, @@ -5875,7 +5678,7 @@ static int decode_create_session(struct xdr_stream *xdr, /* seqid, flags */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->seqid = be32_to_cpup(p++); res->flags = be32_to_cpup(p); @@ -5884,9 +5687,6 @@ static int decode_create_session(struct xdr_stream *xdr, if (!status) status = decode_chan_attrs(xdr, &res->bc_attrs); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) @@ -5967,7 +5767,6 @@ out_err: res->sr_status = status; return status; out_overflow: - print_overflow_msg(__func__, xdr); status = -EIO; goto out_err; #else /* CONFIG_NFS_V4_1 */ @@ -5995,7 +5794,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, if (status == -ETOOSMALL) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; pdev->mincount = be32_to_cpup(p); dprintk("%s: Min count too small. mincnt = %u\n", __func__, pdev->mincount); @@ -6005,7 +5804,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; type = be32_to_cpup(p++); if (type != pdev->layout_type) { dprintk("%s: layout mismatch req: %u pdev: %u\n", @@ -6019,19 +5818,19 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, */ pdev->mincount = be32_to_cpup(p); if (xdr_read_pages(xdr, pdev->mincount) != pdev->mincount) - goto out_overflow; + return -EIO; /* Parse notification bitmap, verifying that it is zero. */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; len = be32_to_cpup(p); if (len) { uint32_t i; p = xdr_inline_decode(xdr, 4 * len); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->notification = be32_to_cpup(p++); for (i = 1; i < len; i++) { @@ -6043,9 +5842,6 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, } } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req, @@ -6115,7 +5911,6 @@ out: res->status = status; return status; out_overflow: - print_overflow_msg(__func__, xdr); status = -EIO; goto out; } @@ -6131,16 +5926,13 @@ static int decode_layoutreturn(struct xdr_stream *xdr, return status; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->lrs_present = be32_to_cpup(p); if (res->lrs_present) status = decode_layout_stateid(xdr, &res->stateid); else nfs4_stateid_copy(&res->stateid, &invalid_stateid); return status; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_layoutcommit(struct xdr_stream *xdr, @@ -6158,19 +5950,16 @@ static int decode_layoutcommit(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; sizechanged = be32_to_cpup(p); if (sizechanged) { /* throw away new size */ p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) - goto out_overflow; + return -EIO; } return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; } static int decode_test_stateid(struct xdr_stream *xdr, @@ -6186,21 +5975,17 @@ static int decode_test_stateid(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; num_res = be32_to_cpup(p++); if (num_res != 1) - goto out; + return -EIO; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EIO; res->status = be32_to_cpup(p++); return status; -out_overflow: - print_overflow_msg(__func__, xdr); -out: - return -EIO; } static int decode_free_stateid(struct xdr_stream *xdr, @@ -7570,11 +7355,11 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, uint64_t new_cookie; __be32 *p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; if (*p == xdr_zero) { p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; if (*p == xdr_zero) return -EAGAIN; entry->eof = 1; @@ -7583,13 +7368,13 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; p = xdr_decode_hyper(p, &new_cookie); entry->len = be32_to_cpup(p); p = xdr_inline_decode(xdr, entry->len); if (unlikely(!p)) - goto out_overflow; + return -EAGAIN; entry->name = (const char *) p; /* @@ -7601,14 +7386,14 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->fattr->valid = 0; if (decode_attr_bitmap(xdr, bitmap) < 0) - goto out_overflow; + return -EAGAIN; if (decode_attr_length(xdr, &len, &savep) < 0) - goto out_overflow; + return -EAGAIN; if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, NULL, entry->label, entry->server) < 0) - goto out_overflow; + return -EAGAIN; if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) entry->ino = entry->fattr->mounted_on_fileid; else if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) @@ -7622,10 +7407,6 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, entry->cookie = new_cookie; return 0; - -out_overflow: - print_overflow_msg(__func__, xdr); - return -EAGAIN; } /* @@ -7791,6 +7572,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(COPY, enc_copy, dec_copy), PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel), PROC(LOOKUPP, enc_lookupp, dec_lookupp), + PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c index b60d5fbd7727..a90b363500c2 100644 --- a/fs/nfs/nfstrace.c +++ b/fs/nfs/nfstrace.c @@ -11,3 +11,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter); EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_xdr_status); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index bd60f8d1e181..a0d6910aa03a 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -969,6 +969,91 @@ TRACE_EVENT(nfs_commit_done, ) ); +TRACE_DEFINE_ENUM(NFS_OK); +TRACE_DEFINE_ENUM(NFSERR_PERM); +TRACE_DEFINE_ENUM(NFSERR_NOENT); +TRACE_DEFINE_ENUM(NFSERR_IO); +TRACE_DEFINE_ENUM(NFSERR_NXIO); +TRACE_DEFINE_ENUM(NFSERR_ACCES); +TRACE_DEFINE_ENUM(NFSERR_EXIST); +TRACE_DEFINE_ENUM(NFSERR_XDEV); +TRACE_DEFINE_ENUM(NFSERR_NODEV); +TRACE_DEFINE_ENUM(NFSERR_NOTDIR); +TRACE_DEFINE_ENUM(NFSERR_ISDIR); +TRACE_DEFINE_ENUM(NFSERR_INVAL); +TRACE_DEFINE_ENUM(NFSERR_FBIG); +TRACE_DEFINE_ENUM(NFSERR_NOSPC); +TRACE_DEFINE_ENUM(NFSERR_ROFS); +TRACE_DEFINE_ENUM(NFSERR_MLINK); +TRACE_DEFINE_ENUM(NFSERR_NAMETOOLONG); +TRACE_DEFINE_ENUM(NFSERR_NOTEMPTY); +TRACE_DEFINE_ENUM(NFSERR_DQUOT); +TRACE_DEFINE_ENUM(NFSERR_STALE); +TRACE_DEFINE_ENUM(NFSERR_REMOTE); +TRACE_DEFINE_ENUM(NFSERR_WFLUSH); +TRACE_DEFINE_ENUM(NFSERR_BADHANDLE); +TRACE_DEFINE_ENUM(NFSERR_NOT_SYNC); +TRACE_DEFINE_ENUM(NFSERR_BAD_COOKIE); +TRACE_DEFINE_ENUM(NFSERR_NOTSUPP); +TRACE_DEFINE_ENUM(NFSERR_TOOSMALL); +TRACE_DEFINE_ENUM(NFSERR_SERVERFAULT); +TRACE_DEFINE_ENUM(NFSERR_BADTYPE); +TRACE_DEFINE_ENUM(NFSERR_JUKEBOX); + +#define nfs_show_status(x) \ + __print_symbolic(x, \ + { NFS_OK, "OK" }, \ + { NFSERR_PERM, "PERM" }, \ + { NFSERR_NOENT, "NOENT" }, \ + { NFSERR_IO, "IO" }, \ + { NFSERR_NXIO, "NXIO" }, \ + { NFSERR_ACCES, "ACCES" }, \ + { NFSERR_EXIST, "EXIST" }, \ + { NFSERR_XDEV, "XDEV" }, \ + { NFSERR_NODEV, "NODEV" }, \ + { NFSERR_NOTDIR, "NOTDIR" }, \ + { NFSERR_ISDIR, "ISDIR" }, \ + { NFSERR_INVAL, "INVAL" }, \ + { NFSERR_FBIG, "FBIG" }, \ + { NFSERR_NOSPC, "NOSPC" }, \ + { NFSERR_ROFS, "ROFS" }, \ + { NFSERR_MLINK, "MLINK" }, \ + { NFSERR_NAMETOOLONG, "NAMETOOLONG" }, \ + { NFSERR_NOTEMPTY, "NOTEMPTY" }, \ + { NFSERR_DQUOT, "DQUOT" }, \ + { NFSERR_STALE, "STALE" }, \ + { NFSERR_REMOTE, "REMOTE" }, \ + { NFSERR_WFLUSH, "WFLUSH" }, \ + { NFSERR_BADHANDLE, "BADHANDLE" }, \ + { NFSERR_NOT_SYNC, "NOTSYNC" }, \ + { NFSERR_BAD_COOKIE, "BADCOOKIE" }, \ + { NFSERR_NOTSUPP, "NOTSUPP" }, \ + { NFSERR_TOOSMALL, "TOOSMALL" }, \ + { NFSERR_SERVERFAULT, "REMOTEIO" }, \ + { NFSERR_BADTYPE, "BADTYPE" }, \ + { NFSERR_JUKEBOX, "JUKEBOX" }) + +TRACE_EVENT(nfs_xdr_status, + TP_PROTO( + int error + ), + + TP_ARGS(error), + + TP_STRUCT__entry( + __field(int, error) + ), + + TP_fast_assign( + __entry->error = error; + ), + + TP_printk( + "error=%d (%s)", + __entry->error, nfs_show_status(__entry->error) + ) +); + #endif /* _TRACE_NFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e54d899c1848..e9f39fa5964b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -350,7 +350,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, /** * nfs_unlock_request - Unlock request and wake up sleepers. - * @req: + * @req: pointer to request */ void nfs_unlock_request(struct nfs_page *req) { @@ -368,7 +368,7 @@ void nfs_unlock_request(struct nfs_page *req) /** * nfs_unlock_and_release_request - Unlock request and release the nfs_page - * @req: + * @req: pointer to request */ void nfs_unlock_and_release_request(struct nfs_page *req) { @@ -531,7 +531,6 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); * nfs_pgio_rpcsetup - Set up arguments for a pageio call * @hdr: The pageio hdr * @count: Number of bytes to read - * @offset: Initial offset * @how: How to commit data (writes only) * @cinfo: Commit information for the call (writes only) */ @@ -634,7 +633,6 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio); /** * nfs_pgio_error - Clean up from a pageio error - * @desc: IO descriptor * @hdr: pageio header */ static void nfs_pgio_error(struct nfs_pgio_header *hdr) @@ -768,8 +766,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, pageused = 0; while (!list_empty(head)) { req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_list_add_request(req, &hdr->pages); + nfs_list_move_request(req, &hdr->pages); if (!last_page || last_page != req->wb_page) { pageused++; @@ -893,6 +890,7 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1, * nfs_can_coalesce_requests - test two requests for compatibility * @prev: pointer to nfs_page * @req: pointer to nfs_page + * @pgio: pointer to nfs_pagio_descriptor * * The nfs_page structures 'prev' and 'req' are compared to ensure that the * page data area they describe is contiguous, and that their RPC @@ -961,8 +959,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, } if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; - nfs_list_remove_request(req); - nfs_list_add_request(req, &mirror->pg_list); + nfs_list_move_request(req, &mirror->pg_list); mirror->pg_count += req->wb_bytes; return 1; } @@ -988,6 +985,16 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) } } +static void +nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc, + struct nfs_page *req) +{ + LIST_HEAD(head); + + nfs_list_move_request(req, &head); + desc->pg_completion_ops->error_cleanup(&head, desc->pg_error); +} + /** * nfs_pageio_add_request - Attempt to coalesce a request into a page list. * @desc: destination io descriptor @@ -1025,10 +1032,8 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, nfs_page_group_unlock(req); desc->pg_moreio = 1; nfs_pageio_doio(desc); - if (desc->pg_error < 0) - return 0; - if (mirror->pg_recoalesce) - return 0; + if (desc->pg_error < 0 || mirror->pg_recoalesce) + goto out_cleanup_subreq; /* retry add_request for this subreq */ nfs_page_group_lock(req); continue; @@ -1061,6 +1066,10 @@ err_ptr: desc->pg_error = PTR_ERR(subreq); nfs_page_group_unlock(req); return 0; +out_cleanup_subreq: + if (req != subreq) + nfs_pageio_cleanup_request(desc, subreq); + return 0; } static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) @@ -1079,7 +1088,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) struct nfs_page *req; req = list_first_entry(&head, struct nfs_page, wb_list); - nfs_list_remove_request(req); if (__nfs_pageio_add_request(desc, req)) continue; if (desc->pg_error < 0) { @@ -1120,7 +1128,8 @@ static void nfs_pageio_error_cleanup(struct nfs_pageio_descriptor *desc) for (midx = 0; midx < desc->pg_mirror_count; midx++) { mirror = &desc->pg_mirrors[midx]; - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); + desc->pg_completion_ops->error_cleanup(&mirror->pg_list, + desc->pg_error); } } @@ -1168,11 +1177,14 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (nfs_pgio_has_mirroring(desc)) desc->pg_mirror_idx = midx; if (!nfs_pageio_add_request_mirror(desc, dupreq)) - goto out_failed; + goto out_cleanup_subreq; } return 1; +out_cleanup_subreq: + if (req != dupreq) + nfs_pageio_cleanup_request(desc, dupreq); out_failed: nfs_pageio_error_cleanup(desc); return 0; @@ -1194,7 +1206,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, desc->pg_mirror_idx = mirror_idx; for (;;) { nfs_pageio_doio(desc); - if (!mirror->pg_recoalesce) + if (desc->pg_error < 0 || !mirror->pg_recoalesce) break; if (!nfs_do_recoalesce(desc)) break; @@ -1222,9 +1234,8 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, while (!list_empty(&hdr->pages)) { struct nfs_page *req = nfs_list_entry(hdr->pages.next); - nfs_list_remove_request(req); if (!nfs_pageio_add_request(desc, req)) - nfs_list_add_request(req, &failed); + nfs_list_move_request(req, &failed); } nfs_pageio_complete(desc); if (!list_empty(&failed)) { diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 53726da5c010..7066cd7c7aff 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -758,22 +758,35 @@ static int pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, struct nfs_server *server, struct list_head *layout_list) + __must_hold(&clp->cl_lock) + __must_hold(RCU) { struct pnfs_layout_hdr *lo, *next; struct inode *inode; list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { - if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) + if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || + test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) || + !list_empty(&lo->plh_bulk_destroy)) continue; + /* If the sb is being destroyed, just bail */ + if (!nfs_sb_active(server->super)) + break; inode = igrab(lo->plh_inode); - if (inode == NULL) - continue; - list_del_init(&lo->plh_layouts); - if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) - continue; - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - iput(inode); + if (inode != NULL) { + list_del_init(&lo->plh_layouts); + if (pnfs_layout_add_bulk_destroy_list(inode, + layout_list)) + continue; + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + } else { + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags); + } + nfs_sb_deactive(server->super); spin_lock(&clp->cl_lock); rcu_read_lock(); return -EAGAIN; @@ -811,7 +824,7 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, /* Free all lsegs that are attached to commit buckets */ nfs_commit_inode(inode, 0); pnfs_put_layout_hdr(lo); - iput(inode); + nfs_iput_and_deactive(inode); } return ret; } @@ -1876,7 +1889,7 @@ lookup_again: atomic_read(&lo->plh_outstanding) != 0) { spin_unlock(&ino->i_lock); lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding, - atomic_read(&lo->plh_outstanding))); + !atomic_read(&lo->plh_outstanding))); if (IS_ERR(lseg) || !list_empty(&lo->plh_segs)) goto out_put_layout_hdr; pnfs_put_layout_hdr(lo); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 5e80a07b7bea..c0420b979d88 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -104,6 +104,7 @@ enum { NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ + NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */ }; enum layoutdriver_policy_flags { @@ -349,6 +350,7 @@ void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nf void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *, const struct nfs4_deviceid *); bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); +void nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node); void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node); bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); void nfs4_deviceid_purge_client(const struct nfs_client *); diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 7fb59487ee90..537b80d693f1 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -284,10 +284,22 @@ nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node); void +nfs4_mark_deviceid_available(struct nfs4_deviceid_node *node) +{ + if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) { + clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + smp_mb__after_atomic(); + } +} +EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_available); + +void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node) { node->timestamp_unavailable = jiffies; + smp_mb__before_atomic(); set_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + smp_mb__after_atomic(); } EXPORT_SYMBOL_GPL(nfs4_mark_deviceid_unavailable); @@ -302,6 +314,7 @@ nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node) if (time_in_range(node->timestamp_unavailable, start, end)) return true; clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); + smp_mb__after_atomic(); } return false; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f9f19784db82..1d95a60b2586 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -205,7 +205,7 @@ static void nfs_initiate_read(struct nfs_pgio_header *hdr, } static void -nfs_async_read_error(struct list_head *head) +nfs_async_read_error(struct list_head *head, int error) { struct nfs_page *req; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0570391eaa16..23790c7b2289 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1919,7 +1919,7 @@ static int nfs_parse_devname(const char *dev_name, /* kill possible hostname list: not supported */ comma = strchr(dev_name, ','); if (comma != NULL && comma < end) - *comma = 0; + len = comma - dev_name; } if (len > maxnamlen) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 79b97b3c4427..52d533967485 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -39,6 +39,7 @@ nfs_free_unlinkdata(struct nfs_unlinkdata *data) /** * nfs_async_unlink_done - Sillydelete post-processing * @task: rpc_task of the sillydelete + * @calldata: pointer to nfs_unlinkdata * * Do the directory attribute update. */ @@ -54,7 +55,7 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) /** * nfs_async_unlink_release - Release the sillydelete data. - * @task: rpc_task of the sillydelete + * @calldata: struct nfs_unlinkdata to release * * We need to call nfs_put_unlinkdata as a 'tk_release' task since the * rpc_task would be freed too. @@ -159,8 +160,8 @@ static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nf /** * nfs_async_unlink - asynchronous unlinking of a file - * @dir: parent directory of dentry - * @dentry: dentry to unlink + * @dentry: parent directory of dentry + * @name: name of dentry to unlink */ static int nfs_async_unlink(struct dentry *dentry, const struct qstr *name) @@ -324,6 +325,7 @@ static const struct rpc_call_ops nfs_rename_ops = { * @new_dir: target directory for the rename * @old_dentry: original dentry to be renamed * @new_dentry: dentry to which the old_dentry should be renamed + * @complete: Function to run on successful completion * * It's expected that valid references to the dentries and inodes are held */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d09c9f878141..f3ebabaa291d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -26,6 +26,7 @@ #include <linux/iversion.h> #include <linux/uaccess.h> +#include <linux/sched/mm.h> #include "delegation.h" #include "internal.h" @@ -712,11 +713,13 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct nfs_pageio_descriptor pgio; - struct nfs_io_completion *ioc = nfs_io_completion_alloc(GFP_NOFS); + struct nfs_io_completion *ioc; + unsigned int pflags = memalloc_nofs_save(); int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); + ioc = nfs_io_completion_alloc(GFP_NOFS); if (ioc) nfs_io_completion_init(ioc, nfs_io_completion_commit, inode); @@ -727,6 +730,8 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) nfs_pageio_complete(&pgio); nfs_io_completion_put(ioc); + memalloc_nofs_restore(pflags); + if (err < 0) goto out_err; err = pgio.pg_error; @@ -865,7 +870,6 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); /** * nfs_request_add_commit_list - add request to a commit list * @req: pointer to a struct nfs_page - * @dst: commit list head * @cinfo: holds list lock and accounting info * * This sets the PG_CLEAN bit, updates the cinfo count of @@ -1412,20 +1416,27 @@ static void nfs_redirty_request(struct nfs_page *req) nfs_release_request(req); } -static void nfs_async_write_error(struct list_head *head) +static void nfs_async_write_error(struct list_head *head, int error) { struct nfs_page *req; while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); + if (nfs_error_is_fatal(error)) { + nfs_context_set_write_error(req->wb_context, error); + if (nfs_error_is_fatal_on_server(error)) { + nfs_write_error_remove_page(req); + continue; + } + } nfs_redirty_request(req); } } static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr) { - nfs_async_write_error(&hdr->pages); + nfs_async_write_error(&hdr->pages, 0); filemap_fdatawrite_range(hdr->inode->i_mapping, hdr->args.offset, hdr->args.offset + hdr->args.count - 1); } diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9eb8086ea841..8f933e84cec1 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -463,8 +463,19 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) &resp->common, nfs3svc_encode_entry); memcpy(resp->verf, argp->verf, 8); resp->count = resp->buffer - argp->buffer; - if (resp->offset) - xdr_encode_hyper(resp->offset, argp->cookie); + if (resp->offset) { + loff_t offset = argp->cookie; + + if (unlikely(resp->offset1)) { + /* we ended up with offset on a page boundary */ + *resp->offset = htonl(offset >> 32); + *resp->offset1 = htonl(offset & 0xffffffff); + resp->offset1 = NULL; + } else { + xdr_encode_hyper(resp->offset, offset); + } + resp->offset = NULL; + } RETURN_STATUS(nfserr); } @@ -533,6 +544,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) } else { xdr_encode_hyper(resp->offset, offset); } + resp->offset = NULL; } RETURN_STATUS(nfserr); @@ -576,7 +588,7 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp) resp->f_wtmax = max_blocksize; resp->f_wtpref = max_blocksize; resp->f_wtmult = PAGE_SIZE; - resp->f_dtpref = PAGE_SIZE; + resp->f_dtpref = max_blocksize; resp->f_maxfilesize = ~(u32) 0; resp->f_properties = NFS3_FSF_DEFAULT; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9b973f4f7d01..93fea246f676 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -573,6 +573,8 @@ int nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readdirargs *args = rqstp->rq_argp; + u32 max_blocksize = svc_max_payload(rqstp); + p = decode_fh(p, &args->fh); if (!p) return 0; @@ -580,7 +582,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) args->verf = p; p += 2; args->dircount = ~0; args->count = ntohl(*p++); - args->count = min_t(u32, args->count, PAGE_SIZE); + args->count = min_t(u32, args->count, max_blocksize); args->buffer = page_address(*(rqstp->rq_next_page++)); return xdr_argsize_check(rqstp, p); @@ -921,6 +923,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, } else { xdr_encode_hyper(cd->offset, offset64); } + cd->offset = NULL; } /* diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c74e4538d0eb..d219159b98af 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -60,16 +60,6 @@ struct nfs4_cb_compound_hdr { int status; }; -/* - * Handle decode buffer overflows out-of-line. - */ -static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) -{ - dprintk("NFS: %s prematurely hit the end of our receive buffer. " - "Remaining buffer length is %tu words.\n", - func, xdr->end - xdr->p); -} - static __be32 *xdr_encode_empty_array(__be32 *p) { *p++ = xdr_zero; @@ -240,7 +230,6 @@ static int decode_cb_op_status(struct xdr_stream *xdr, *status = nfs_cb_stat_to_errno(be32_to_cpup(p)); return 0; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; out_unexpected: dprintk("NFSD: Callback server returned operation %d but " @@ -309,7 +298,6 @@ static int decode_cb_compound4res(struct xdr_stream *xdr, hdr->nops = be32_to_cpup(p); return 0; out_overflow: - print_overflow_msg(__func__, xdr); return -EIO; } @@ -437,7 +425,6 @@ out: cb->cb_seq_status = status; return status; out_overflow: - print_overflow_msg(__func__, xdr); status = -EIO; goto out; } @@ -913,9 +900,9 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c return PTR_ERR(client); } cred = get_backchannel_cred(clp, client, ses); - if (IS_ERR(cred)) { + if (!cred) { rpc_shutdown_client(client); - return PTR_ERR(cred); + return -ENOMEM; } clp->cl_cb_client = client; clp->cl_cb_cred = cred; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fb3c9844c82a..6a45fb00c5fc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1544,16 +1544,16 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) { u32 slotsize = slot_bytes(ca); u32 num = ca->maxreqs; - int avail; + unsigned long avail, total_avail; spin_lock(&nfsd_drc_lock); - avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, - nfsd_drc_max_mem - nfsd_drc_mem_used); + total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; + avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail); /* * Never use more than a third of the remaining memory, * unless it's the only way to give this client a slot: */ - avail = clamp_t(int, avail, slotsize, avail/3); + avail = clamp_t(int, avail, slotsize, total_avail/3); num = min_t(int, num, avail / slotsize); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 72a7681f4046..f2feb2d11bae 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1126,7 +1126,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) case 'Y': case 'y': case '1': - if (nn->nfsd_serv) + if (!nn->nfsd_serv) return -EBUSY; nfsd4_end_grace(nn); break; diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index f2129a5d9f23..4391fd3abd8f 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -189,7 +189,7 @@ retry: */ if (!err) return 0; - else if (err != -EEXIST) + else if (err != -EBUSY) goto failed_unlock; err = invalidate_inode_pages2_range(btnc, newkey, newkey); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 56992b32c6bb..a90bb19dcfa2 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -208,6 +208,7 @@ static int copy_fid_to_user(struct fanotify_event *event, char __user *buf) { struct fanotify_event_info_fid info = { }; struct file_handle handle = { }; + unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh; size_t fh_len = event->fh_len; size_t len = fanotify_event_info_len(event); @@ -233,7 +234,16 @@ static int copy_fid_to_user(struct fanotify_event *event, char __user *buf) buf += sizeof(handle); len -= sizeof(handle); - if (copy_to_user(buf, fanotify_event_fh(event), fh_len)) + /* + * For an inline fh, copy through stack to exclude the copy from + * usercopy hardening protections. + */ + fh = fanotify_event_fh(event); + if (fh_len <= FANOTIFY_INLINE_FH_LEN) { + memcpy(bounce, fh, fh_len); + fh = bounce; + } + if (copy_to_user(buf, fh, fh_len)) return -EFAULT; /* Pad with 0's */ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index e2901fbb9f76..7b53598c8804 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -519,8 +519,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) return -ENOENT; - else if (create) - return -EEXIST; + else if (create) { + ret = -EEXIST; + goto out; + } i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); @@ -548,6 +550,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, /* return the wd */ ret = i_mark->wd; +out: /* match the get from fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index f038235c64bd..c3334eca18c7 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -261,11 +261,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, generic_fillattr(inode, stat); /* override block size reported to stat */ - if (request_mask & STATX_SIZE) - stat->result_mask = STATX_BASIC_STATS; - else - stat->result_mask = STATX_BASIC_STATS & - ~STATX_SIZE; + if (!(request_mask & STATX_SIZE)) + stat->result_mask &= ~STATX_SIZE; stat->attributes_mask = STATX_ATTR_IMMUTABLE | STATX_ATTR_APPEND; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 9e62dcf06fc4..68b3303e4b46 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -443,6 +443,24 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) { int err; + /* + * Copy up data first and then xattrs. Writing data after + * xattrs will remove security.capability xattr automatically. + */ + if (S_ISREG(c->stat.mode) && !c->metacopy) { + struct path upperpath, datapath; + + ovl_path_upper(c->dentry, &upperpath); + if (WARN_ON(upperpath.dentry != NULL)) + return -EIO; + upperpath.dentry = temp; + + ovl_path_lowerdata(c->dentry, &datapath); + err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); + if (err) + return err; + } + err = ovl_copy_xattr(c->lowerpath.dentry, temp); if (err) return err; @@ -460,19 +478,6 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) return err; } - if (S_ISREG(c->stat.mode) && !c->metacopy) { - struct path upperpath, datapath; - - ovl_path_upper(c->dentry, &upperpath); - BUG_ON(upperpath.dentry != NULL); - upperpath.dentry = temp; - - ovl_path_lowerdata(c->dentry, &datapath); - err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); - if (err) - return err; - } - if (c->metacopy) { err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY, NULL, 0, -EOPNOTSUPP); @@ -737,6 +742,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) { struct path upperpath, datapath; int err; + char *capability = NULL; + ssize_t uninitialized_var(cap_size); ovl_path_upper(c->dentry, &upperpath); if (WARN_ON(upperpath.dentry == NULL)) @@ -746,15 +753,37 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) if (WARN_ON(datapath.dentry == NULL)) return -EIO; + if (c->stat.size) { + err = cap_size = ovl_getxattr(upperpath.dentry, XATTR_NAME_CAPS, + &capability, 0); + if (err < 0 && err != -ENODATA) + goto out; + } + err = ovl_copy_up_data(&datapath, &upperpath, c->stat.size); if (err) - return err; + goto out_free; + + /* + * Writing to upper file will clear security.capability xattr. We + * don't want that to happen for normal copy-up operation. + */ + if (capability) { + err = ovl_do_setxattr(upperpath.dentry, XATTR_NAME_CAPS, + capability, cap_size, 0); + if (err) + goto out_free; + } + err = vfs_removexattr(upperpath.dentry, OVL_XATTR_METACOPY); if (err) - return err; + goto out_free; ovl_set_upperdata(d_inode(c->dentry)); +out_free: + kfree(capability); +out: return err; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 5e45cb3630a0..9c6018287d57 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -277,6 +277,8 @@ int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); int ovl_check_metacopy_xattr(struct dentry *dentry); bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct dentry *dentry, int padding); +ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, + size_t padding); static inline bool ovl_is_impuredir(struct dentry *dentry) { diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 7c01327b1852..4035e640f402 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -863,28 +863,49 @@ bool ovl_is_metacopy_dentry(struct dentry *dentry) return (oe->numlower > 1); } -char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) +ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value, + size_t padding) { - int res; - char *s, *next, *buf = NULL; + ssize_t res; + char *buf = NULL; - res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0); + res = vfs_getxattr(dentry, name, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) - return NULL; + return -ENODATA; goto fail; } - buf = kzalloc(res + padding + 1, GFP_KERNEL); - if (!buf) - return ERR_PTR(-ENOMEM); + if (res != 0) { + buf = kzalloc(res + padding, GFP_KERNEL); + if (!buf) + return -ENOMEM; - if (res == 0) - goto invalid; + res = vfs_getxattr(dentry, name, buf, res); + if (res < 0) + goto fail; + } + *value = buf; + + return res; + +fail: + pr_warn_ratelimited("overlayfs: failed to get xattr %s: err=%zi)\n", + name, res); + kfree(buf); + return res; +} - res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res); +char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) +{ + int res; + char *s, *next, *buf = NULL; + + res = ovl_getxattr(dentry, OVL_XATTR_REDIRECT, &buf, padding + 1); + if (res == -ENODATA) + return NULL; if (res < 0) - goto fail; + return ERR_PTR(res); if (res == 0) goto invalid; @@ -900,15 +921,9 @@ char *ovl_get_redirect_xattr(struct dentry *dentry, int padding) } return buf; - -err_free: - kfree(buf); - return ERR_PTR(res); -fail: - pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res); - goto err_free; invalid: pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf); res = -EINVAL; - goto err_free; + kfree(buf); + return ERR_PTR(res); } diff --git a/fs/pipe.c b/fs/pipe.c index 51d5fd8840ab..070aad543382 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -225,8 +225,15 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe, } EXPORT_SYMBOL(generic_pipe_buf_release); +/* New data written to a pipe may be appended to a buffer with this type. */ static const struct pipe_buf_operations anon_pipe_buf_ops = { - .can_merge = 1, + .confirm = generic_pipe_buf_confirm, + .release = anon_pipe_buf_release, + .steal = anon_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = { .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, @@ -234,13 +241,32 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { }; static const struct pipe_buf_operations packet_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, .get = generic_pipe_buf_get, }; +/** + * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable + * @buf: the buffer to mark + * + * Description: + * This function ensures that no future writes will be merged into the + * given &struct pipe_buffer. This is necessary when multiple pipe buffers + * share the same backing page. + */ +void pipe_buf_mark_unmergeable(struct pipe_buffer *buf) +{ + if (buf->ops == &anon_pipe_buf_ops) + buf->ops = &anon_pipe_buf_nomerge_ops; +} + +static bool pipe_buf_can_merge(struct pipe_buffer *buf) +{ + return buf->ops == &anon_pipe_buf_ops; +} + static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { @@ -378,7 +404,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) struct pipe_buffer *buf = pipe->bufs + lastbuf; int offset = buf->offset + buf->len; - if (buf->ops->can_merge && offset + chars <= PAGE_SIZE) { + if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) { ret = pipe_buf_confirm(pipe, buf); if (ret) goto out; diff --git a/fs/pnode.c b/fs/pnode.c index 1100e810d855..7ea6cfb65077 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -214,7 +214,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin) } /* all accesses are serialized by namespace_sem */ -static struct user_namespace *user_ns; static struct mount *last_dest, *first_source, *last_source, *dest_master; static struct mountpoint *mp; static struct hlist_head *list; @@ -260,9 +259,6 @@ static int propagate_one(struct mount *m) type |= CL_MAKE_SHARED; } - /* Notice when we are propagating across user namespaces */ - if (m->mnt_ns->user_ns != user_ns) - type |= CL_UNPRIVILEGED; child = copy_tree(last_source, last_source->mnt.mnt_root, type); if (IS_ERR(child)) return PTR_ERR(child); @@ -303,7 +299,6 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, * propagate_one(); everything is serialized by namespace_sem, * so globals will do just fine. */ - user_ns = current->nsproxy->mnt_ns->user_ns; last_dest = dest_mnt; first_source = source_mnt; last_source = source_mnt; diff --git a/fs/pnode.h b/fs/pnode.h index dc87e65becd2..3960a83666cf 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -27,8 +27,7 @@ #define CL_MAKE_SHARED 0x08 #define CL_PRIVATE 0x10 #define CL_SHARED_TO_SLAVE 0x20 -#define CL_UNPRIVILEGED 0x40 -#define CL_COPY_MNT_NS_FILE 0x80 +#define CL_COPY_MNT_NS_FILE 0x40 #define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE) diff --git a/fs/proc/base.c b/fs/proc/base.c index 5ab1849971b4..ddef482f1334 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -59,6 +59,7 @@ #include <linux/capability.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/generic-radix-tree.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/namei.h> @@ -92,7 +93,6 @@ #include <linux/sched/coredump.h> #include <linux/sched/debug.h> #include <linux/sched/stat.h> -#include <linux/flex_array.h> #include <linux/posix-timers.h> #include <trace/events/oom.h> #include "internal.h" @@ -2142,11 +2142,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) struct task_struct *task; struct mm_struct *mm; unsigned long nr_files, pos, i; - struct flex_array *fa = NULL; - struct map_files_info info; + GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; + genradix_init(&fa); + ret = -ENOENT; task = get_proc_task(file_inode(file)); if (!task) @@ -2178,35 +2179,22 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) */ for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { - if (vma->vm_file && ++pos > ctx->pos) - nr_files++; - } + if (!vma->vm_file) + continue; + if (++pos <= ctx->pos) + continue; - if (nr_files) { - fa = flex_array_alloc(sizeof(info), nr_files, - GFP_KERNEL); - if (!fa || flex_array_prealloc(fa, 0, nr_files, - GFP_KERNEL)) { + p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL); + if (!p) { ret = -ENOMEM; - if (fa) - flex_array_free(fa); up_read(&mm->mmap_sem); mmput(mm); goto out_put_task; } - for (i = 0, vma = mm->mmap, pos = 2; vma; - vma = vma->vm_next) { - if (!vma->vm_file) - continue; - if (++pos <= ctx->pos) - continue; - info.start = vma->vm_start; - info.end = vma->vm_end; - info.mode = vma->vm_file->f_mode; - if (flex_array_put(fa, i++, &info, GFP_KERNEL)) - BUG(); - } + p->start = vma->vm_start; + p->end = vma->vm_end; + p->mode = vma->vm_file->f_mode; } up_read(&mm->mmap_sem); mmput(mm); @@ -2215,7 +2203,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ unsigned int len; - p = flex_array_get(fa, i); + p = genradix_ptr(&fa, i); len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); if (!proc_fill_cache(file, ctx, buf, len, @@ -2225,12 +2213,11 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) break; ctx->pos++; } - if (fa) - flex_array_free(fa); out_put_task: put_task_struct(task); out: + genradix_free(&fa); return ret; } @@ -2459,11 +2446,10 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry, static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, - const struct pid_entry *ents, - unsigned int nents) + const struct pid_entry *p, + const struct pid_entry *end) { struct task_struct *task = get_proc_task(dir); - const struct pid_entry *p, *last; struct dentry *res = ERR_PTR(-ENOENT); if (!task) @@ -2473,8 +2459,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir, * Yes, it does not scale. And it should not. Don't add * new entries into /proc/<tgid>/ without very good reasons. */ - last = &ents[nents]; - for (p = ents; p < last; p++) { + for (; p < end; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) { @@ -2610,7 +2595,7 @@ static struct dentry *proc_##LSM##_attr_dir_lookup(struct inode *dir, \ { \ return proc_pident_lookup(dir, dentry, \ LSM##_attr_dir_stuff, \ - ARRAY_SIZE(LSM##_attr_dir_stuff)); \ + LSM##_attr_dir_stuff + ARRAY_SIZE(LSM##_attr_dir_stuff)); \ } \ \ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \ @@ -2655,7 +2640,8 @@ static struct dentry *proc_attr_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, - attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); + attr_dir_stuff, + attr_dir_stuff + ARRAY_SIZE(attr_dir_stuff)); } static const struct inode_operations proc_attr_dir_inode_operations = { @@ -3088,10 +3074,20 @@ static const struct file_operations proc_tgid_base_operations = { .llseek = generic_file_llseek, }; +struct pid *tgid_pidfd_to_pid(const struct file *file) +{ + if (!d_is_dir(file->f_path.dentry) || + (file->f_op != &proc_tgid_base_operations)) + return ERR_PTR(-EBADF); + + return proc_pid(file_inode(file)); +} + static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, - tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); + tgid_base_stuff, + tgid_base_stuff + ARRAY_SIZE(tgid_base_stuff)); } static const struct inode_operations proc_tgid_base_inode_operations = { @@ -3463,7 +3459,8 @@ static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return proc_pident_lookup(dir, dentry, - tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); + tid_base_stuff, + tid_base_stuff + ARRAY_SIZE(tid_base_stuff)); } static const struct file_operations proc_tid_base_operations = { diff --git a/fs/proc/inode.c b/fs/proc/inode.c index da649ccd6804..fc7e38def174 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -24,7 +24,6 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/mount.h> -#include <linux/magic.h> #include <linux/uaccess.h> @@ -122,13 +121,12 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) return 0; } -static const struct super_operations proc_sops = { +const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .drop_inode = generic_delete_inode, .evict_inode = proc_evict_inode, .statfs = simple_statfs, - .remount_fs = proc_remount, .show_options = proc_show_options, }; @@ -488,51 +486,3 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) pde_put(de); return inode; } - -int proc_fill_super(struct super_block *s, void *data, int silent) -{ - struct pid_namespace *ns = get_pid_ns(s->s_fs_info); - struct inode *root_inode; - int ret; - - if (!proc_parse_options(data, ns)) - return -EINVAL; - - /* User space would break if executables or devices appear on proc */ - s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; - s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; - s->s_blocksize = 1024; - s->s_blocksize_bits = 10; - s->s_magic = PROC_SUPER_MAGIC; - s->s_op = &proc_sops; - s->s_time_gran = 1; - - /* - * procfs isn't actually a stacking filesystem; however, there is - * too much magic going on inside it to permit stacking things on - * top of it - */ - s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; - - /* procfs dentries and inodes don't require IO to create */ - s->s_shrink.seeks = 0; - - pde_get(&proc_root); - root_inode = proc_get_inode(s, &proc_root); - if (!root_inode) { - pr_err("proc_fill_super: get root inode failed\n"); - return -ENOMEM; - } - - s->s_root = d_make_root(root_inode); - if (!s->s_root) { - pr_err("proc_fill_super: allocate dentry failed\n"); - return -ENOMEM; - } - - ret = proc_setup_self(s); - if (ret) { - return ret; - } - return proc_setup_thread_self(s); -} diff --git a/fs/proc/internal.h b/fs/proc/internal.h index ea575375f210..d1671e97f7fe 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -207,13 +207,12 @@ struct pde_opener { struct completion *c; } __randomize_layout; extern const struct inode_operations proc_link_inode_operations; - extern const struct inode_operations proc_pid_link_inode_operations; +extern const struct super_operations proc_sops; void proc_init_kmemcache(void); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); -extern int proc_fill_super(struct super_block *, void *data, int flags); extern void proc_entry_rundown(struct proc_dir_entry *); /* @@ -271,10 +270,8 @@ static inline void proc_tty_init(void) {} * root.c */ extern struct proc_dir_entry proc_root; -extern int proc_parse_options(char *options, struct pid_namespace *pid); extern void proc_self_init(void); -extern int proc_remount(struct super_block *, int *, char *); /* * task_[no]mmu.c diff --git a/fs/proc/root.c b/fs/proc/root.c index 621e6ec322ca..8b145e7b9661 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -19,86 +19,178 @@ #include <linux/module.h> #include <linux/bitops.h> #include <linux/user_namespace.h> +#include <linux/fs_context.h> #include <linux/mount.h> #include <linux/pid_namespace.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> #include <linux/cred.h> +#include <linux/magic.h> +#include <linux/slab.h> #include "internal.h" -enum { - Opt_gid, Opt_hidepid, Opt_err, +struct proc_fs_context { + struct pid_namespace *pid_ns; + unsigned int mask; + int hidepid; + int gid; }; -static const match_table_t tokens = { - {Opt_hidepid, "hidepid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_err, NULL}, +enum proc_param { + Opt_gid, + Opt_hidepid, }; -int proc_parse_options(char *options, struct pid_namespace *pid) +static const struct fs_parameter_spec proc_param_specs[] = { + fsparam_u32("gid", Opt_gid), + fsparam_u32("hidepid", Opt_hidepid), + {} +}; + +static const struct fs_parameter_description proc_fs_parameters = { + .name = "proc", + .specs = proc_param_specs, +}; + +static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - args[0].to = args[0].from = NULL; - token = match_token(p, tokens, args); - switch (token) { - case Opt_gid: - if (match_int(&args[0], &option)) - return 0; - pid->pid_gid = make_kgid(current_user_ns(), option); - break; - case Opt_hidepid: - if (match_int(&args[0], &option)) - return 0; - if (option < HIDEPID_OFF || - option > HIDEPID_INVISIBLE) { - pr_err("proc: hidepid value must be between 0 and 2.\n"); - return 0; - } - pid->hide_pid = option; - break; - default: - pr_err("proc: unrecognized mount option \"%s\" " - "or missing value\n", p); - return 0; - } + struct proc_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, &proc_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_gid: + ctx->gid = result.uint_32; + break; + + case Opt_hidepid: + ctx->hidepid = result.uint_32; + if (ctx->hidepid < HIDEPID_OFF || + ctx->hidepid > HIDEPID_INVISIBLE) + return invalf(fc, "proc: hidepid value must be between 0 and 2.\n"); + break; + + default: + return -EINVAL; } - return 1; + ctx->mask |= 1 << opt; + return 0; } -int proc_remount(struct super_block *sb, int *flags, char *data) +static void proc_apply_options(struct super_block *s, + struct fs_context *fc, + struct pid_namespace *pid_ns, + struct user_namespace *user_ns) { + struct proc_fs_context *ctx = fc->fs_private; + + if (ctx->mask & (1 << Opt_gid)) + pid_ns->pid_gid = make_kgid(user_ns, ctx->gid); + if (ctx->mask & (1 << Opt_hidepid)) + pid_ns->hide_pid = ctx->hidepid; +} + +static int proc_fill_super(struct super_block *s, struct fs_context *fc) +{ + struct pid_namespace *pid_ns = get_pid_ns(s->s_fs_info); + struct inode *root_inode; + int ret; + + proc_apply_options(s, fc, pid_ns, current_user_ns()); + + /* User space would break if executables or devices appear on proc */ + s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; + s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = PROC_SUPER_MAGIC; + s->s_op = &proc_sops; + s->s_time_gran = 1; + + /* + * procfs isn't actually a stacking filesystem; however, there is + * too much magic going on inside it to permit stacking things on + * top of it + */ + s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; + + /* procfs dentries and inodes don't require IO to create */ + s->s_shrink.seeks = 0; + + pde_get(&proc_root); + root_inode = proc_get_inode(s, &proc_root); + if (!root_inode) { + pr_err("proc_fill_super: get root inode failed\n"); + return -ENOMEM; + } + + s->s_root = d_make_root(root_inode); + if (!s->s_root) { + pr_err("proc_fill_super: allocate dentry failed\n"); + return -ENOMEM; + } + + ret = proc_setup_self(s); + if (ret) { + return ret; + } + return proc_setup_thread_self(s); +} + +static int proc_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; struct pid_namespace *pid = sb->s_fs_info; sync_filesystem(sb); - return !proc_parse_options(data, pid); + + proc_apply_options(sb, fc, pid, current_user_ns()); + return 0; } -static struct dentry *proc_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int proc_get_tree(struct fs_context *fc) { - struct pid_namespace *ns; + struct proc_fs_context *ctx = fc->fs_private; - if (flags & SB_KERNMOUNT) { - ns = data; - data = NULL; - } else { - ns = task_active_pid_ns(current); - } + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); + fc->s_fs_info = ctx->pid_ns; + return vfs_get_super(fc, vfs_get_keyed_super, proc_fill_super); +} - return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super); +static void proc_fs_context_free(struct fs_context *fc) +{ + struct proc_fs_context *ctx = fc->fs_private; + + if (ctx->pid_ns) + put_pid_ns(ctx->pid_ns); + kfree(ctx); +} + +static const struct fs_context_operations proc_fs_context_ops = { + .free = proc_fs_context_free, + .parse_param = proc_parse_param, + .get_tree = proc_get_tree, + .reconfigure = proc_reconfigure, +}; + +static int proc_init_fs_context(struct fs_context *fc) +{ + struct proc_fs_context *ctx; + + ctx = kzalloc(sizeof(struct proc_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->pid_ns = get_pid_ns(task_active_pid_ns(current)); + fc->fs_private = ctx; + fc->ops = &proc_fs_context_ops; + return 0; } static void proc_kill_sb(struct super_block *sb) @@ -115,10 +207,11 @@ static void proc_kill_sb(struct super_block *sb) } static struct file_system_type proc_fs_type = { - .name = "proc", - .mount = proc_mount, - .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "proc", + .init_fs_context = proc_init_fs_context, + .parameters = &proc_fs_parameters, + .kill_sb = proc_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; void __init proc_root_init(void) @@ -156,7 +249,7 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr { if (!proc_pid_lookup(dentry, flags)) return NULL; - + return proc_lookup(dir, dentry, flags); } @@ -209,9 +302,28 @@ struct proc_dir_entry proc_root = { int pid_ns_prepare_proc(struct pid_namespace *ns) { + struct proc_fs_context *ctx; + struct fs_context *fc; struct vfsmount *mnt; - mnt = kern_mount_data(&proc_fs_type, ns); + fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + if (fc->user_ns != ns->user_ns) { + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ns->user_ns); + } + + ctx = fc->fs_private; + if (ctx->pid_ns != ns) { + put_pid_ns(ctx->pid_ns); + get_pid_ns(ns); + ctx->pid_ns = ns; + } + + mnt = fc_mount(fc); + put_fs_context(fc); if (IS_ERR(mnt)) return PTR_ERR(mnt); diff --git a/fs/read_write.c b/fs/read_write.c index 30df848b7451..177ccc3d405a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -478,8 +478,8 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t return ret; } -ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, - loff_t *pos) +static ssize_t __vfs_write(struct file *file, const char __user *p, + size_t count, loff_t *pos) { if (file->f_op->write) return file->f_op->write(file, p, count, pos); @@ -1238,6 +1238,9 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct compat_iovec __user *,vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { + if (pos == -1) + return do_compat_readv(fd, vec, vlen, flags); + return do_compat_preadv64(fd, vec, vlen, pos, flags); } #endif @@ -1344,6 +1347,9 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct compat_iovec __user *,vec, unsigned long, vlen, loff_t, pos, rwf_t, flags) { + if (pos == -1) + return do_compat_writev(fd, vec, vlen, flags); + return do_compat_pwritev64(fd, vec, vlen, pos, flags); } #endif diff --git a/fs/splice.c b/fs/splice.c index 6489fb9436e4..3ee7e82df48f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -138,7 +138,6 @@ error: } const struct pipe_buf_operations page_cache_pipe_buf_ops = { - .can_merge = 0, .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, @@ -156,7 +155,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, } static const struct pipe_buf_operations user_page_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, @@ -326,7 +324,6 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, EXPORT_SYMBOL(generic_file_splice_read); const struct pipe_buf_operations default_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -341,7 +338,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, /* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { - .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_nosteal, @@ -1606,6 +1602,8 @@ retry: */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + pipe_buf_mark_unmergeable(obuf); + obuf->len = len; opipe->nrbufs++; ibuf->offset += obuf->len; @@ -1680,6 +1678,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, */ obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + pipe_buf_mark_unmergeable(obuf); + if (obuf->len > len) obuf->len = len; diff --git a/fs/stat.c b/fs/stat.c index adbfcd86c81b..c38e4c2e1221 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -45,11 +45,6 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) stat->ctime = inode->i_ctime; stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; - - if (IS_NOATIME(inode)) - stat->result_mask &= ~STATX_ATIME; - if (IS_AUTOMOUNT(inode)) - stat->attributes |= STATX_ATTR_AUTOMOUNT; } EXPORT_SYMBOL(generic_fillattr); @@ -75,6 +70,13 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, stat->result_mask |= STATX_BASIC_STATS; request_mask &= STATX_ALL; query_flags &= KSTAT_QUERY_FLAGS; + + /* allow the fs to override these if it really wants to */ + if (IS_NOATIME(inode)) + stat->result_mask &= ~STATX_ATIME; + if (IS_AUTOMOUNT(inode)) + stat->attributes |= STATX_ATTR_AUTOMOUNT; + if (inode->i_op->getattr) return inode->i_op->getattr(path, stat, request_mask, query_flags); diff --git a/fs/super.c b/fs/super.c index 48e25eba8465..583a0124bc39 100644 --- a/fs/super.c +++ b/fs/super.c @@ -35,6 +35,7 @@ #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> +#include <linux/fs_context.h> #include <uapi/linux/mount.h> #include "internal.h" @@ -476,6 +477,94 @@ void generic_shutdown_super(struct super_block *sb) EXPORT_SYMBOL(generic_shutdown_super); /** + * sget_fc - Find or create a superblock + * @fc: Filesystem context. + * @test: Comparison callback + * @set: Setup callback + * + * Find or create a superblock using the parameters stored in the filesystem + * context and the two callback functions. + * + * If an extant superblock is matched, then that will be returned with an + * elevated reference count that the caller must transfer or discard. + * + * If no match is made, a new superblock will be allocated and basic + * initialisation will be performed (s_type, s_fs_info and s_id will be set and + * the set() callback will be invoked), the superblock will be published and it + * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE + * as yet unset. + */ +struct super_block *sget_fc(struct fs_context *fc, + int (*test)(struct super_block *, struct fs_context *), + int (*set)(struct super_block *, struct fs_context *)) +{ + struct super_block *s = NULL; + struct super_block *old; + struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; + int err; + + if (!(fc->sb_flags & SB_KERNMOUNT) && + fc->purpose != FS_CONTEXT_FOR_SUBMOUNT) { + /* Don't allow mounting unless the caller has CAP_SYS_ADMIN + * over the namespace. + */ + if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + } else { + if (!ns_capable(fc->user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + } + } + +retry: + spin_lock(&sb_lock); + if (test) { + hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { + if (test(old, fc)) + goto share_extant_sb; + } + } + if (!s) { + spin_unlock(&sb_lock); + s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); + if (!s) + return ERR_PTR(-ENOMEM); + goto retry; + } + + s->s_fs_info = fc->s_fs_info; + err = set(s, fc); + if (err) { + s->s_fs_info = NULL; + spin_unlock(&sb_lock); + destroy_unused_super(s); + return ERR_PTR(err); + } + fc->s_fs_info = NULL; + s->s_type = fc->fs_type; + strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id)); + list_add_tail(&s->s_list, &super_blocks); + hlist_add_head(&s->s_instances, &s->s_type->fs_supers); + spin_unlock(&sb_lock); + get_filesystem(s->s_type); + register_shrinker_prepared(&s->s_shrink); + return s; + +share_extant_sb: + if (user_ns != old->s_user_ns) { + spin_unlock(&sb_lock); + destroy_unused_super(s); + return ERR_PTR(-EBUSY); + } + if (!grab_super(old)) + goto retry; + destroy_unused_super(s); + return old; +} +EXPORT_SYMBOL(sget_fc); + +/** * sget_userns - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback @@ -835,28 +924,35 @@ rescan: } /** - * do_remount_sb - asks filesystem to change mount options. - * @sb: superblock in question - * @sb_flags: revised superblock flags - * @data: the rest of options - * @force: whether or not to force the change + * reconfigure_super - asks filesystem to change superblock parameters + * @fc: The superblock and configuration * - * Alters the mount options of a mounted file system. + * Alters the configuration parameters of a live superblock. */ -int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) +int reconfigure_super(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; int retval; - int remount_ro; + bool remount_ro = false; + bool force = fc->sb_flags & SB_FORCE; + if (fc->sb_flags_mask & ~MS_RMT_MASK) + return -EINVAL; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; + retval = security_sb_remount(sb, fc->security); + if (retval) + return retval; + + if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK - if (!(sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev)) - return -EACCES; + if (!(fc->sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev)) + return -EACCES; #endif - remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb); + remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); + } if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { @@ -867,13 +963,14 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; - remount_ro = (sb_flags & SB_RDONLY) && !sb_rdonly(sb); + remount_ro = !sb_rdonly(sb); } } shrink_dcache_sb(sb); - /* If we are remounting RDONLY and current sb is read/write, - make sure there are no rw files opened */ + /* If we are reconfiguring to RDONLY and current sb is read/write, + * make sure there are no files open for writing. + */ if (remount_ro) { if (force) { sb->s_readonly_remount = 1; @@ -885,8 +982,8 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) } } - if (sb->s_op->remount_fs) { - retval = sb->s_op->remount_fs(sb, &sb_flags, data); + if (fc->ops->reconfigure) { + retval = fc->ops->reconfigure(fc); if (retval) { if (!force) goto cancel_readonly; @@ -895,7 +992,9 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force) sb->s_type->name, retval); } } - sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (sb_flags & MS_RMT_MASK); + + WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | + (fc->sb_flags & fc->sb_flags_mask))); /* Needs to be ordered wrt mnt_is_readonly() */ smp_wmb(); sb->s_readonly_remount = 0; @@ -922,10 +1021,15 @@ static void do_emergency_remount_callback(struct super_block *sb) down_write(&sb->s_umount); if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && !sb_rdonly(sb)) { - /* - * What lock protects sb->s_flags?? - */ - do_remount_sb(sb, SB_RDONLY, NULL, 1); + struct fs_context *fc; + + fc = fs_context_for_reconfigure(sb->s_root, + SB_RDONLY | SB_FORCE, SB_RDONLY); + if (!IS_ERR(fc)) { + if (parse_monolithic_mount_data(fc, NULL) == 0) + (void)reconfigure_super(fc); + put_fs_context(fc); + } } up_write(&sb->s_umount); } @@ -1087,6 +1191,89 @@ struct dentry *mount_ns(struct file_system_type *fs_type, EXPORT_SYMBOL(mount_ns); +int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) +{ + return set_anon_super(sb, NULL); +} +EXPORT_SYMBOL(set_anon_super_fc); + +static int test_keyed_super(struct super_block *sb, struct fs_context *fc) +{ + return sb->s_fs_info == fc->s_fs_info; +} + +static int test_single_super(struct super_block *s, struct fs_context *fc) +{ + return 1; +} + +/** + * vfs_get_super - Get a superblock with a search key set in s_fs_info. + * @fc: The filesystem context holding the parameters + * @keying: How to distinguish superblocks + * @fill_super: Helper to initialise a new superblock + * + * Search for a superblock and create a new one if not found. The search + * criterion is controlled by @keying. If the search fails, a new superblock + * is created and @fill_super() is called to initialise it. + * + * @keying can take one of a number of values: + * + * (1) vfs_get_single_super - Only one superblock of this type may exist on the + * system. This is typically used for special system filesystems. + * + * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have + * distinct keys (where the key is in s_fs_info). Searching for the same + * key again will turn up the superblock for that key. + * + * (3) vfs_get_independent_super - Multiple superblocks may exist and are + * unkeyed. Each call will get a new superblock. + * + * A permissions check is made by sget_fc() unless we're getting a superblock + * for a kernel-internal mount or a submount. + */ +int vfs_get_super(struct fs_context *fc, + enum vfs_get_super_keying keying, + int (*fill_super)(struct super_block *sb, + struct fs_context *fc)) +{ + int (*test)(struct super_block *, struct fs_context *); + struct super_block *sb; + + switch (keying) { + case vfs_get_single_super: + test = test_single_super; + break; + case vfs_get_keyed_super: + test = test_keyed_super; + break; + case vfs_get_independent_super: + test = NULL; + break; + default: + BUG(); + } + + sb = sget_fc(fc, test, set_anon_super_fc); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + int err = fill_super(sb, fc); + if (err) { + deactivate_locked_super(sb); + return err; + } + + sb->s_flags |= SB_ACTIVE; + } + + BUG_ON(fc->root); + fc->root = dget(sb->s_root); + return 0; +} +EXPORT_SYMBOL(vfs_get_super); + #ifdef CONFIG_BLOCK static int set_bdev_super(struct super_block *s, void *data) { @@ -1212,6 +1399,31 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_nodev); +static int reconfigure_single(struct super_block *s, + int flags, void *data) +{ + struct fs_context *fc; + int ret; + + /* The caller really need to be passing fc down into mount_single(), + * then a chunk of this can be removed. [Bollocks -- AV] + * Better yet, reconfiguration shouldn't happen, but rather the second + * mount should be rejected if the parameters are not compatible. + */ + fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK); + if (IS_ERR(fc)) + return PTR_ERR(fc); + + ret = parse_monolithic_mount_data(fc, data); + if (ret < 0) + goto out; + + ret = reconfigure_super(fc); +out: + put_fs_context(fc); + return ret; +} + static int compare_single(struct super_block *s, void *p) { return 1; @@ -1229,41 +1441,64 @@ struct dentry *mount_single(struct file_system_type *fs_type, return ERR_CAST(s); if (!s->s_root) { error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - s->s_flags |= SB_ACTIVE; + if (!error) + s->s_flags |= SB_ACTIVE; } else { - do_remount_sb(s, flags, data, 0); + error = reconfigure_single(s, flags, data); + } + if (unlikely(error)) { + deactivate_locked_super(s); + return ERR_PTR(error); } return dget(s->s_root); } EXPORT_SYMBOL(mount_single); -struct dentry * -mount_fs(struct file_system_type *type, int flags, const char *name, void *data) +/** + * vfs_get_tree - Get the mountable root + * @fc: The superblock configuration context. + * + * The filesystem is invoked to get or create a superblock which can then later + * be used for mounting. The filesystem places a pointer to the root to be + * used for mounting in @fc->root. + */ +int vfs_get_tree(struct fs_context *fc) { - struct dentry *root; struct super_block *sb; - int error = -ENOMEM; - void *sec_opts = NULL; + int error; - if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { - error = security_sb_eat_lsm_opts(data, &sec_opts); - if (error) - return ERR_PTR(error); + if (fc->fs_type->fs_flags & FS_REQUIRES_DEV && !fc->source) { + errorf(fc, "Filesystem requires source device"); + return -ENOENT; } - root = type->mount(type, flags, name, data); - if (IS_ERR(root)) { - error = PTR_ERR(root); - goto out_free_secdata; + if (fc->root) + return -EBUSY; + + /* Get the mountable root in fc->root, with a ref on the root and a ref + * on the superblock. + */ + error = fc->ops->get_tree(fc); + if (error < 0) + return error; + + if (!fc->root) { + pr_err("Filesystem %s get_tree() didn't set fc->root\n", + fc->fs_type->name); + /* We don't know what the locking state of the superblock is - + * if there is a superblock. + */ + BUG(); } - sb = root->d_sb; - BUG_ON(!sb); + + sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); + if (fc->subtype && !sb->s_subtype) { + sb->s_subtype = fc->subtype; + fc->subtype = NULL; + } + /* * Write barrier is for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is the @@ -1273,14 +1508,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) smp_wmb(); sb->s_flags |= SB_BORN; - error = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL); - if (error) - goto out_sb; - - if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT))) { - error = security_sb_kern_mount(sb); - if (error) - goto out_sb; + error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); + if (unlikely(error)) { + fc_drop_locked(fc); + return error; } /* @@ -1290,18 +1521,11 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " - "negative value (%lld)\n", type->name, sb->s_maxbytes); + "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes); - up_write(&sb->s_umount); - security_free_mnt_opts(&sec_opts); - return root; -out_sb: - dput(root); - deactivate_locked_super(sb); -out_free_secdata: - security_free_mnt_opts(&sec_opts); - return ERR_PTR(error); + return 0; } +EXPORT_SYMBOL(vfs_get_tree); /* * Setup private BDI for given superblock. It gets automatically cleaned up diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 92682fcc41f6..1b56686ab178 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -13,34 +13,71 @@ #include <linux/magic.h> #include <linux/mount.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/fs_context.h> +#include <net/net_namespace.h> #include "sysfs.h" static struct kernfs_root *sysfs_root; struct kernfs_node *sysfs_root_kn; -static struct dentry *sysfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int sysfs_get_tree(struct fs_context *fc) { - struct dentry *root; - void *ns; - bool new_sb = false; + struct kernfs_fs_context *kfc = fc->fs_private; + int ret; - if (!(flags & SB_KERNMOUNT)) { + ret = kernfs_get_tree(fc); + if (ret) + return ret; + + if (kfc->new_sb_created) + fc->root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; + return 0; +} + +static void sysfs_fs_context_free(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + if (kfc->ns_tag) + kobj_ns_drop(KOBJ_NS_TYPE_NET, kfc->ns_tag); + kernfs_free_fs_context(fc); + kfree(kfc); +} + +static const struct fs_context_operations sysfs_fs_context_ops = { + .free = sysfs_fs_context_free, + .get_tree = sysfs_get_tree, +}; + +static int sysfs_init_fs_context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc; + struct net *netns; + + if (!(fc->sb_flags & SB_KERNMOUNT)) { if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) - return ERR_PTR(-EPERM); + return -EPERM; } - ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); - root = kernfs_mount_ns(fs_type, flags, sysfs_root, - SYSFS_MAGIC, &new_sb, ns); - if (!new_sb) - kobj_ns_drop(KOBJ_NS_TYPE_NET, ns); - else if (!IS_ERR(root)) - root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE; + kfc = kzalloc(sizeof(struct kernfs_fs_context), GFP_KERNEL); + if (!kfc) + return -ENOMEM; - return root; + kfc->ns_tag = netns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); + kfc->root = sysfs_root; + kfc->magic = SYSFS_MAGIC; + fc->fs_private = kfc; + fc->ops = &sysfs_fs_context_ops; + if (netns) { + if (fc->user_ns) + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(netns->user_ns); + } + fc->global = true; + return 0; } static void sysfs_kill_sb(struct super_block *sb) @@ -52,10 +89,10 @@ static void sysfs_kill_sb(struct super_block *sb) } static struct file_system_type sysfs_fs_type = { - .name = "sysfs", - .mount = sysfs_mount, - .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .name = "sysfs", + .init_fs_context = sysfs_init_fs_context, + .kill_sb = sysfs_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; int __init sysfs_init(void) diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 0f9c362a3402..82e4e6a30b04 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -28,6 +28,11 @@ #include <linux/mount.h> #include "ubifs.h" +/* Need to be kept consistent with checked flags in ioctl2ubifs() */ +#define UBIFS_SUPPORTED_IOCTL_FLAGS \ + (FS_COMPR_FL | FS_SYNC_FL | FS_APPEND_FL | \ + FS_IMMUTABLE_FL | FS_DIRSYNC_FL) + /** * ubifs_set_inode_flags - set VFS inode flags. * @inode: VFS inode to set flags for @@ -169,6 +174,9 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; + if (flags & ~UBIFS_SUPPORTED_IOCTL_FLAGS) + return -EOPNOTSUPP; + if (!S_ISDIR(inode->i_mode)) flags &= ~FS_DIRSYNC_FL; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index ae796e10f68b..e7276932e433 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1242,8 +1242,10 @@ set_size: truncate_setsize(inode, newsize); down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); - udf_truncate_extents(inode); + err = udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); + if (err) + return err; } update_time: inode->i_mtime = inode->i_ctime = current_time(inode); diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index b647f0bd150c..63a47f1e1d52 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -199,7 +199,7 @@ static void udf_update_alloc_ext_desc(struct inode *inode, * for making file shorter. For making file longer, udf_extend_file() has to * be used. */ -void udf_truncate_extents(struct inode *inode) +int udf_truncate_extents(struct inode *inode) { struct extent_position epos; struct kernel_lb_addr eloc, neloc = {}; @@ -224,7 +224,7 @@ void udf_truncate_extents(struct inode *inode) if (etype == -1) { /* We should extend the file? */ WARN_ON(byte_offset); - return; + return 0; } epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); @@ -260,6 +260,9 @@ void udf_truncate_extents(struct inode *inode) epos.block = eloc; epos.bh = udf_tread(sb, udf_get_lb_pblock(sb, &eloc, 0)); + /* Error reading indirect block? */ + if (!epos.bh) + return -EIO; if (elen) indirect_ext_len = (elen + sb->s_blocksize - 1) >> @@ -283,4 +286,5 @@ void udf_truncate_extents(struct inode *inode) iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); + return 0; } diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index ee246769dee4..d89ef71887fc 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -235,7 +235,7 @@ extern struct inode *udf_new_inode(struct inode *, umode_t); /* truncate.c */ extern void udf_truncate_tail_extent(struct inode *); extern void udf_discard_prealloc(struct inode *); -extern void udf_truncate_extents(struct inode *); +extern int udf_truncate_extents(struct inode *); /* balloc.c */ extern void udf_free_blocks(struct super_block *, struct inode *, diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 9a3767818c50..9c2a0a13ed61 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -563,43 +563,40 @@ xfs_dir3_leaf_find_entry( */ int /* error */ xfs_dir2_leaf_addname( - xfs_da_args_t *args) /* operation arguments */ + struct xfs_da_args *args) /* operation arguments */ { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_trans *tp = args->trans; __be16 *bestsp; /* freespace table in leaf */ - int compact; /* need to compact leaves */ - xfs_dir2_data_hdr_t *hdr; /* data block header */ + __be16 *tagp; /* end of data entry */ struct xfs_buf *dbp; /* data block buffer */ - xfs_dir2_data_entry_t *dep; /* data block entry */ - xfs_inode_t *dp; /* incore directory inode */ - xfs_dir2_data_unused_t *dup; /* data unused entry */ + struct xfs_buf *lbp; /* leaf's buffer */ + struct xfs_dir2_leaf *leaf; /* leaf structure */ + struct xfs_inode *dp = args->dp; /* incore directory inode */ + struct xfs_dir2_data_hdr *hdr; /* data block header */ + struct xfs_dir2_data_entry *dep; /* data block entry */ + struct xfs_dir2_leaf_entry *lep; /* leaf entry table pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir2_data_unused *dup; /* data unused entry */ + struct xfs_dir2_leaf_tail *ltp; /* leaf tail pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + int compact; /* need to compact leaves */ int error; /* error return value */ int grown; /* allocated new data block */ - int highstale; /* index of next stale leaf */ + int highstale = 0; /* index of next stale leaf */ int i; /* temporary, index */ int index; /* leaf table position */ - struct xfs_buf *lbp; /* leaf's buffer */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ int length; /* length of new entry */ - xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ int lfloglow; /* low leaf logging index */ int lfloghigh; /* high leaf logging index */ - int lowstale; /* index of prev stale leaf */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ + int lowstale = 0; /* index of prev stale leaf */ int needbytes; /* leaf block bytes needed */ int needlog; /* need to log data header */ int needscan; /* need to rescan data free */ - __be16 *tagp; /* end of data entry */ - xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t use_block; /* data block number */ - struct xfs_dir2_data_free *bf; /* bestfree table */ - struct xfs_dir2_leaf_entry *ents; - struct xfs_dir3_icleaf_hdr leafhdr; trace_xfs_dir2_leaf_addname(args); - dp = args->dp; - tp = args->trans; - error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 3b03703c5c3d..16731d2d684b 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -426,24 +426,22 @@ xfs_dir2_leaf_to_node( static int /* error */ xfs_dir2_leafn_add( struct xfs_buf *bp, /* leaf buffer */ - xfs_da_args_t *args, /* operation arguments */ + struct xfs_da_args *args, /* operation arguments */ int index) /* insertion pt for new entry */ { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_inode *dp = args->dp; + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *lep; + struct xfs_dir2_leaf_entry *ents; int compact; /* compacting stale leaves */ - xfs_inode_t *dp; /* incore directory inode */ - int highstale; /* next stale entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + int highstale = 0; /* next stale entry */ int lfloghigh; /* high leaf entry logging */ int lfloglow; /* low leaf entry logging */ - int lowstale; /* previous stale entry */ - struct xfs_dir3_icleaf_hdr leafhdr; - struct xfs_dir2_leaf_entry *ents; + int lowstale = 0; /* previous stale entry */ trace_xfs_dir2_leafn_add(args, index); - dp = args->dp; - leaf = bp->b_addr; dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); ents = dp->d_ops->leaf_ents_p(leaf); |