diff options
Diffstat (limited to 'fs')
219 files changed, 7986 insertions, 3654 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index e622f0f10502..0429c8ee58f1 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -210,12 +210,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; - continue; - } - v9ses->debug = option; + } else { + v9ses->debug = option; #ifdef CONFIG_NET_9P_DEBUG - p9_debug_level = option; + p9_debug_level = option; #endif + } break; case Opt_dfltuid: @@ -231,7 +231,6 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "uid field, but not a uid?\n"); ret = -EINVAL; - continue; } break; case Opt_dfltgid: @@ -247,7 +246,6 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "gid field, but not a gid?\n"); ret = -EINVAL; - continue; } break; case Opt_afid: @@ -256,9 +254,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; - continue; + } else { + v9ses->afid = option; } - v9ses->afid = option; break; case Opt_uname: kfree(v9ses->uname); @@ -306,13 +304,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) "problem allocating copy of cache arg\n"); goto free_and_return; } - ret = get_cache_mode(s); - if (ret == -EINVAL) { - kfree(s); - goto free_and_return; - } + r = get_cache_mode(s); + if (r < 0) + ret = r; + else + v9ses->cache = r; - v9ses->cache = ret; kfree(s); break; @@ -341,14 +338,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) pr_info("Unknown access argument %s\n", s); kfree(s); - goto free_and_return; + continue; } v9ses->uid = make_kuid(current_user_ns(), uid); if (!uid_valid(v9ses->uid)) { ret = -EINVAL; pr_info("Uknown uid %s\n", s); - kfree(s); - goto free_and_return; } } diff --git a/fs/Kconfig b/fs/Kconfig index ac4ac908f001..ab2d96d1abee 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -38,6 +38,7 @@ config FS_DAX bool "Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) + select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED) select FS_IOMAP select DAX help @@ -108,6 +109,7 @@ source "fs/notify/Kconfig" source "fs/quota/Kconfig" +source "fs/autofs/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" source "fs/overlayfs/Kconfig" @@ -203,6 +205,9 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +config MEMFD_CREATE + def_bool TMPFS || HUGETLBFS + config ARCH_HAS_GIGANTIC_PAGE bool diff --git a/fs/Makefile b/fs/Makefile index c9375fd2c8c4..2e005525cc19 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -102,6 +102,7 @@ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ obj-$(CONFIG_QNX4FS_FS) += qnx4/ obj-$(CONFIG_QNX6FS_FS) += qnx6/ +obj-$(CONFIG_AUTOFS_FS) += autofs/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c index 7587fb665ff1..2c46c46f3a6d 100644 --- a/fs/afs/addr_list.c +++ b/fs/afs/addr_list.c @@ -43,8 +43,7 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, _enter("%u,%u,%u", nr, service, port); - alist = kzalloc(sizeof(*alist) + sizeof(alist->addrs[0]) * nr, - GFP_KERNEL); + alist = kzalloc(struct_size(alist, addrs, nr), GFP_KERNEL); if (!alist) return NULL; diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index e065bc0768e6..1faef56b12bd 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -310,6 +310,10 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc) case -ETIME: _debug("no conn"); goto iterate_address; + + case -ECONNRESET: + _debug("call reset"); + goto failed; } restart_from_beginning: @@ -1434,7 +1434,23 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb) req->ki_flags = iocb_flags(req->ki_filp); if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; - req->ki_hint = file_write_hint(req->ki_filp); + req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp)); + if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { + /* + * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then + * aio_reqprio is interpreted as an I/O scheduling + * class and priority. + */ + ret = ioprio_check_cap(iocb->aio_reqprio); + if (ret) { + pr_debug("aio ioprio check cap error: %d\n", ret); + return ret; + } + + req->ki_ioprio = iocb->aio_reqprio; + } else + req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); if (unlikely(ret)) fput(req->ki_filp); diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig new file mode 100644 index 000000000000..6a2064eb3b27 --- /dev/null +++ b/fs/autofs/Kconfig @@ -0,0 +1,20 @@ +config AUTOFS_FS + tristate "Kernel automounter support (supports v3, v4 and v5)" + default n + help + The automounter is a tool to automatically mount remote file systems + on demand. This implementation is partially kernel-based to reduce + overhead in the already-mounted case; this is unlike the BSD + automounter (amd), which is a pure user space daemon. + + To use the automounter you need the user-space tools from + <https://www.kernel.org/pub/linux/daemons/autofs/>; you also want + to answer Y to "NFS file system support", below. + + To compile this support as a module, choose M here: the module will be + called autofs. + + If you are not a part of a fairly large, distributed network or + don't have a laptop which needs to dynamically reconfigure to the + local network, you probably do not need an automounter, and can say + N here. diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile new file mode 100644 index 000000000000..43fedde15c26 --- /dev/null +++ b/fs/autofs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux autofs-filesystem routines. +# + +obj-$(CONFIG_AUTOFS_FS) += autofs.o + +autofs-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o diff --git a/fs/autofs4/autofs_i.h b/fs/autofs/autofs_i.h index 4737615f0eaa..9400a9f6318a 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -9,7 +9,7 @@ /* Internal header file for autofs */ -#include <linux/auto_fs4.h> +#include <linux/auto_fs.h> #include <linux/auto_dev-ioctl.h> #include <linux/kernel.h> @@ -25,7 +25,7 @@ #include <linux/spinlock.h> #include <linux/list.h> #include <linux/completion.h> -#include <asm/current.h> +#include <linux/file.h> /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY @@ -122,44 +122,44 @@ struct autofs_sb_info { struct rcu_head rcu; }; -static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) +static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb) { return (struct autofs_sb_info *)(sb->s_fs_info); } -static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) +static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry) { return (struct autofs_info *)(dentry->d_fsdata); } -/* autofs4_oz_mode(): do we see the man behind the curtain? (The +/* autofs_oz_mode(): do we see the man behind the curtain? (The * processes which do manipulations for us in user space sees the raw * filesystem without "magic".) */ -static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) +static inline int autofs_oz_mode(struct autofs_sb_info *sbi) { return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } -struct inode *autofs4_get_inode(struct super_block *, umode_t); -void autofs4_free_ino(struct autofs_info *); +struct inode *autofs_get_inode(struct super_block *, umode_t); +void autofs_free_ino(struct autofs_info *); /* Expiration */ -int is_autofs4_dentry(struct dentry *); -int autofs4_expire_wait(const struct path *path, int rcu_walk); -int autofs4_expire_run(struct super_block *, struct vfsmount *, - struct autofs_sb_info *, - struct autofs_packet_expire __user *); -int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int when); -int autofs4_expire_multi(struct super_block *, struct vfsmount *, - struct autofs_sb_info *, int __user *); -struct dentry *autofs4_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, int how); -struct dentry *autofs4_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, int how); +int is_autofs_dentry(struct dentry *); +int autofs_expire_wait(const struct path *path, int rcu_walk); +int autofs_expire_run(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, + struct autofs_packet_expire __user *); +int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when); +int autofs_expire_multi(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, int __user *); +struct dentry *autofs_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); +struct dentry *autofs_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); /* Device node initialization */ @@ -168,11 +168,11 @@ void autofs_dev_ioctl_exit(void); /* Operations structures */ -extern const struct inode_operations autofs4_symlink_inode_operations; -extern const struct inode_operations autofs4_dir_inode_operations; -extern const struct file_operations autofs4_dir_operations; -extern const struct file_operations autofs4_root_operations; -extern const struct dentry_operations autofs4_dentry_operations; +extern const struct inode_operations autofs_symlink_inode_operations; +extern const struct inode_operations autofs_dir_inode_operations; +extern const struct file_operations autofs_dir_operations; +extern const struct file_operations autofs_root_operations; +extern const struct dentry_operations autofs_dentry_operations; /* VFS automount flags management functions */ static inline void __managed_dentry_set_managed(struct dentry *dentry) @@ -201,9 +201,9 @@ static inline void managed_dentry_clear_managed(struct dentry *dentry) /* Initializing function */ -int autofs4_fill_super(struct super_block *, void *, int); -struct autofs_info *autofs4_new_ino(struct autofs_sb_info *); -void autofs4_clean_ino(struct autofs_info *); +int autofs_fill_super(struct super_block *, void *, int); +struct autofs_info *autofs_new_ino(struct autofs_sb_info *); +void autofs_clean_ino(struct autofs_info *); static inline int autofs_prepare_pipe(struct file *pipe) { @@ -218,25 +218,25 @@ static inline int autofs_prepare_pipe(struct file *pipe) /* Queue management functions */ -int autofs4_wait(struct autofs_sb_info *, +int autofs_wait(struct autofs_sb_info *, const struct path *, enum autofs_notify); -int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); -void autofs4_catatonic_mode(struct autofs_sb_info *); +int autofs_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); +void autofs_catatonic_mode(struct autofs_sb_info *); -static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) +static inline u32 autofs_get_dev(struct autofs_sb_info *sbi) { return new_encode_dev(sbi->sb->s_dev); } -static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) +static inline u64 autofs_get_ino(struct autofs_sb_info *sbi) { return d_inode(sbi->sb->s_root)->i_ino; } -static inline void __autofs4_add_expiring(struct dentry *dentry) +static inline void __autofs_add_expiring(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); if (ino) { if (list_empty(&ino->expiring)) @@ -244,10 +244,10 @@ static inline void __autofs4_add_expiring(struct dentry *dentry) } } -static inline void autofs4_add_expiring(struct dentry *dentry) +static inline void autofs_add_expiring(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); @@ -257,10 +257,10 @@ static inline void autofs4_add_expiring(struct dentry *dentry) } } -static inline void autofs4_del_expiring(struct dentry *dentry) +static inline void autofs_del_expiring(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); @@ -270,4 +270,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry) } } -void autofs4_kill_sb(struct super_block *); +void autofs_kill_sb(struct super_block *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 26f6b4f41ce6..ea4ca1445ab7 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -7,23 +7,10 @@ * option, any later version, incorporated herein by reference. */ -#include <linux/module.h> -#include <linux/vmalloc.h> #include <linux/miscdevice.h> -#include <linux/init.h> -#include <linux/wait.h> -#include <linux/namei.h> -#include <linux/fcntl.h> -#include <linux/file.h> -#include <linux/fdtable.h> -#include <linux/sched.h> -#include <linux/cred.h> #include <linux/compat.h> #include <linux/syscalls.h> #include <linux/magic.h> -#include <linux/dcache.h> -#include <linux/uaccess.h> -#include <linux/slab.h> #include "autofs_i.h" @@ -166,7 +153,7 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) if (f) { inode = file_inode(f); - sbi = autofs4_sbi(inode->i_sb); + sbi = autofs_sbi(inode->i_sb); } return sbi; } @@ -236,7 +223,7 @@ static int test_by_dev(const struct path *path, void *p) static int test_by_type(const struct path *path, void *p) { - struct autofs_info *ino = autofs4_dentry_ino(path->dentry); + struct autofs_info *ino = autofs_dentry_ino(path->dentry); return ino && ino->sbi->type & *(unsigned *)p; } @@ -324,7 +311,7 @@ static int autofs_dev_ioctl_ready(struct file *fp, autofs_wqt_t token; token = (autofs_wqt_t) param->ready.token; - return autofs4_wait_release(sbi, token, 0); + return autofs_wait_release(sbi, token, 0); } /* @@ -340,7 +327,7 @@ static int autofs_dev_ioctl_fail(struct file *fp, token = (autofs_wqt_t) param->fail.token; status = param->fail.status < 0 ? param->fail.status : -ENOENT; - return autofs4_wait_release(sbi, token, status); + return autofs_wait_release(sbi, token, status); } /* @@ -412,7 +399,7 @@ static int autofs_dev_ioctl_catatonic(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); return 0; } @@ -459,10 +446,10 @@ static int autofs_dev_ioctl_requester(struct file *fp, if (err) goto out; - ino = autofs4_dentry_ino(path.dentry); + ino = autofs_dentry_ino(path.dentry); if (ino) { err = 0; - autofs4_expire_wait(&path, 0); + autofs_expire_wait(&path, 0); spin_lock(&sbi->fs_lock); param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); @@ -489,7 +476,7 @@ static int autofs_dev_ioctl_expire(struct file *fp, how = param->expire.how; mnt = fp->f_path.mnt; - return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how); + return autofs_do_expire_multi(sbi->sb, mnt, sbi, how); } /* Check if autofs mount point is in use */ @@ -686,7 +673,7 @@ static int _autofs_dev_ioctl(unsigned int command, * Admin needs to be able to set the mount catatonic in * order to be able to perform the re-open. */ - if (!autofs4_oz_mode(sbi) && + if (!autofs_oz_mode(sbi) && cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { err = -EACCES; fput(fp); diff --git a/fs/autofs4/expire.c b/fs/autofs/expire.c index 57725d4a8c59..b332d3f6e730 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs/expire.c @@ -13,10 +13,10 @@ static unsigned long now; /* Check if a dentry can be expired */ -static inline int autofs4_can_expire(struct dentry *dentry, - unsigned long timeout, int do_now) +static inline int autofs_can_expire(struct dentry *dentry, + unsigned long timeout, int do_now) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino = autofs_dentry_ino(dentry); /* dentry in the process of being deleted */ if (ino == NULL) @@ -31,7 +31,7 @@ static inline int autofs4_can_expire(struct dentry *dentry, } /* Check a mount point for busyness */ -static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) +static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry) { struct dentry *top = dentry; struct path path = {.mnt = mnt, .dentry = dentry}; @@ -44,8 +44,8 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) if (!follow_down_one(&path)) goto done; - if (is_autofs4_dentry(path.dentry)) { - struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb); + if (is_autofs_dentry(path.dentry)) { + struct autofs_sb_info *sbi = autofs_sbi(path.dentry->d_sb); /* This is an autofs submount, we can't expire it */ if (autofs_type_indirect(sbi->type)) @@ -56,7 +56,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) if (!may_umount_tree(path.mnt)) { struct autofs_info *ino; - ino = autofs4_dentry_ino(top); + ino = autofs_dentry_ino(top); ino->last_used = jiffies; goto done; } @@ -74,7 +74,7 @@ done: static struct dentry *get_next_positive_subdir(struct dentry *prev, struct dentry *root) { - struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct list_head *next; struct dentry *q; @@ -121,7 +121,7 @@ cont: static struct dentry *get_next_positive_dentry(struct dentry *prev, struct dentry *root) { - struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct list_head *next; struct dentry *p, *ret; @@ -184,10 +184,10 @@ again: * The tree is not busy iff no mountpoints are busy and there are no * autofs submounts. */ -static int autofs4_direct_busy(struct vfsmount *mnt, - struct dentry *top, - unsigned long timeout, - int do_now) +static int autofs_direct_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) { pr_debug("top %p %pd\n", top, top); @@ -195,14 +195,14 @@ static int autofs4_direct_busy(struct vfsmount *mnt, if (!may_umount_tree(mnt)) { struct autofs_info *ino; - ino = autofs4_dentry_ino(top); + ino = autofs_dentry_ino(top); if (ino) ino->last_used = jiffies; return 1; } /* Timeout of a direct mount is determined by its top dentry */ - if (!autofs4_can_expire(top, timeout, do_now)) + if (!autofs_can_expire(top, timeout, do_now)) return 1; return 0; @@ -212,12 +212,12 @@ static int autofs4_direct_busy(struct vfsmount *mnt, * Check a directory tree of mount points for busyness * The tree is not busy iff no mountpoints are busy */ -static int autofs4_tree_busy(struct vfsmount *mnt, - struct dentry *top, - unsigned long timeout, - int do_now) +static int autofs_tree_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) { - struct autofs_info *top_ino = autofs4_dentry_ino(top); + struct autofs_info *top_ino = autofs_dentry_ino(top); struct dentry *p; pr_debug("top %p %pd\n", top, top); @@ -237,13 +237,13 @@ static int autofs4_tree_busy(struct vfsmount *mnt, * If the fs is busy update the expiry counter. */ if (d_mountpoint(p)) { - if (autofs4_mount_busy(mnt, p)) { + if (autofs_mount_busy(mnt, p)) { top_ino->last_used = jiffies; dput(p); return 1; } } else { - struct autofs_info *ino = autofs4_dentry_ino(p); + struct autofs_info *ino = autofs_dentry_ino(p); unsigned int ino_count = atomic_read(&ino->count); /* allow for dget above and top is already dgot */ @@ -261,16 +261,16 @@ static int autofs4_tree_busy(struct vfsmount *mnt, } /* Timeout of a tree mount is ultimately determined by its top dentry */ - if (!autofs4_can_expire(top, timeout, do_now)) + if (!autofs_can_expire(top, timeout, do_now)) return 1; return 0; } -static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, - struct dentry *parent, - unsigned long timeout, - int do_now) +static struct dentry *autofs_check_leaves(struct vfsmount *mnt, + struct dentry *parent, + unsigned long timeout, + int do_now) { struct dentry *p; @@ -282,11 +282,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, if (d_mountpoint(p)) { /* Can we umount this guy */ - if (autofs4_mount_busy(mnt, p)) + if (autofs_mount_busy(mnt, p)) continue; /* Can we expire this guy */ - if (autofs4_can_expire(p, timeout, do_now)) + if (autofs_can_expire(p, timeout, do_now)) return p; } } @@ -294,10 +294,10 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, } /* Check if we can expire a direct mount (possibly a tree) */ -struct dentry *autofs4_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +struct dentry *autofs_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = dget(sb->s_root); @@ -310,9 +310,9 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, now = jiffies; timeout = sbi->exp_timeout; - if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + if (!autofs_direct_busy(mnt, root, timeout, do_now)) { spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(root); + ino = autofs_dentry_ino(root); /* No point expiring a pending mount */ if (ino->flags & AUTOFS_INF_PENDING) { spin_unlock(&sbi->fs_lock); @@ -321,7 +321,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + if (!autofs_direct_busy(mnt, root, timeout, do_now)) { spin_lock(&sbi->fs_lock); ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); @@ -350,7 +350,7 @@ static struct dentry *should_expire(struct dentry *dentry, { int do_now = how & AUTOFS_EXP_IMMEDIATE; int exp_leaves = how & AUTOFS_EXP_LEAVES; - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino = autofs_dentry_ino(dentry); unsigned int ino_count; /* No point expiring a pending mount */ @@ -367,11 +367,11 @@ static struct dentry *should_expire(struct dentry *dentry, pr_debug("checking mountpoint %p %pd\n", dentry, dentry); /* Can we umount this guy */ - if (autofs4_mount_busy(mnt, dentry)) + if (autofs_mount_busy(mnt, dentry)) return NULL; /* Can we expire this guy */ - if (autofs4_can_expire(dentry, timeout, do_now)) + if (autofs_can_expire(dentry, timeout, do_now)) return dentry; return NULL; } @@ -382,7 +382,7 @@ static struct dentry *should_expire(struct dentry *dentry, * A symlink can't be "busy" in the usual sense so * just check last used for expire timeout. */ - if (autofs4_can_expire(dentry, timeout, do_now)) + if (autofs_can_expire(dentry, timeout, do_now)) return dentry; return NULL; } @@ -397,7 +397,7 @@ static struct dentry *should_expire(struct dentry *dentry, if (d_count(dentry) > ino_count) return NULL; - if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) + if (!autofs_tree_busy(mnt, dentry, timeout, do_now)) return dentry; /* * Case 3: pseudo direct mount, expire individual leaves @@ -411,7 +411,7 @@ static struct dentry *should_expire(struct dentry *dentry, if (d_count(dentry) > ino_count) return NULL; - expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); + expired = autofs_check_leaves(mnt, dentry, timeout, do_now); if (expired) { if (expired == dentry) dput(dentry); @@ -427,10 +427,10 @@ static struct dentry *should_expire(struct dentry *dentry, * - it is unused by any user process * - it has been unused for exp_timeout time */ -struct dentry *autofs4_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +struct dentry *autofs_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = sb->s_root; @@ -450,7 +450,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, int flags = how; spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { spin_unlock(&sbi->fs_lock); continue; @@ -462,7 +462,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, continue; spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(expired); + ino = autofs_dentry_ino(expired); ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); @@ -498,11 +498,11 @@ found: return expired; } -int autofs4_expire_wait(const struct path *path, int rcu_walk) +int autofs_expire_wait(const struct path *path, int rcu_walk) { struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); int status; int state; @@ -529,7 +529,7 @@ retry: pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); - status = autofs4_wait(sbi, path, NFY_NONE); + status = autofs_wait(sbi, path, NFY_NONE); wait_for_completion(&ino->expire_complete); pr_debug("expire done status=%d\n", status); @@ -545,10 +545,10 @@ retry: } /* Perform an expiry operation */ -int autofs4_expire_run(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - struct autofs_packet_expire __user *pkt_p) +int autofs_expire_run(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + struct autofs_packet_expire __user *pkt_p) { struct autofs_packet_expire pkt; struct autofs_info *ino; @@ -560,7 +560,7 @@ int autofs4_expire_run(struct super_block *sb, pkt.hdr.proto_version = sbi->version; pkt.hdr.type = autofs_ptype_expire; - dentry = autofs4_expire_indirect(sb, mnt, sbi, 0); + dentry = autofs_expire_indirect(sb, mnt, sbi, 0); if (!dentry) return -EAGAIN; @@ -573,7 +573,7 @@ int autofs4_expire_run(struct super_block *sb, ret = -EFAULT; spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); /* avoid rapid-fire expire attempts if expiry fails */ ino->last_used = now; ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); @@ -583,25 +583,25 @@ int autofs4_expire_run(struct super_block *sb, return ret; } -int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int when) +int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when) { struct dentry *dentry; int ret = -EAGAIN; if (autofs_type_trigger(sbi->type)) - dentry = autofs4_expire_direct(sb, mnt, sbi, when); + dentry = autofs_expire_direct(sb, mnt, sbi, when); else - dentry = autofs4_expire_indirect(sb, mnt, sbi, when); + dentry = autofs_expire_indirect(sb, mnt, sbi, when); if (dentry) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino = autofs_dentry_ino(dentry); const struct path path = { .mnt = mnt, .dentry = dentry }; /* This is synchronous because it makes the daemon a * little easier */ - ret = autofs4_wait(sbi, &path, NFY_EXPIRE); + ret = autofs_wait(sbi, &path, NFY_EXPIRE); spin_lock(&sbi->fs_lock); /* avoid rapid-fire expire attempts if expiry fails */ @@ -619,7 +619,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, * Call repeatedly until it returns -EAGAIN, meaning there's nothing * more to be done. */ -int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, +int autofs_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int __user *arg) { int do_now = 0; @@ -627,6 +627,5 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (arg && get_user(do_now, arg)) return -EFAULT; - return autofs4_do_expire_multi(sb, mnt, sbi, do_now); + return autofs_do_expire_multi(sb, mnt, sbi, do_now); } - diff --git a/fs/autofs4/init.c b/fs/autofs/init.c index 8cf0e63389ae..16fb61315843 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs/init.c @@ -13,18 +13,18 @@ static struct dentry *autofs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_nodev(fs_type, flags, data, autofs4_fill_super); + return mount_nodev(fs_type, flags, data, autofs_fill_super); } static struct file_system_type autofs_fs_type = { .owner = THIS_MODULE, .name = "autofs", .mount = autofs_mount, - .kill_sb = autofs4_kill_sb, + .kill_sb = autofs_kill_sb, }; MODULE_ALIAS_FS("autofs"); -static int __init init_autofs4_fs(void) +static int __init init_autofs_fs(void) { int err; @@ -37,12 +37,12 @@ static int __init init_autofs4_fs(void) return err; } -static void __exit exit_autofs4_fs(void) +static void __exit exit_autofs_fs(void) { autofs_dev_ioctl_exit(); unregister_filesystem(&autofs_fs_type); } -module_init(init_autofs4_fs) -module_exit(exit_autofs4_fs) +module_init(init_autofs_fs) +module_exit(exit_autofs_fs) MODULE_LICENSE("GPL"); diff --git a/fs/autofs4/inode.c b/fs/autofs/inode.c index 09e7d68dff02..b51980fc274e 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs/inode.c @@ -7,18 +7,14 @@ * option, any later version, incorporated herein by reference. */ -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/file.h> #include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/parser.h> -#include <linux/bitops.h> #include <linux/magic.h> + #include "autofs_i.h" -#include <linux/module.h> -struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) +struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) { struct autofs_info *ino; @@ -32,21 +28,21 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) return ino; } -void autofs4_clean_ino(struct autofs_info *ino) +void autofs_clean_ino(struct autofs_info *ino) { ino->uid = GLOBAL_ROOT_UID; ino->gid = GLOBAL_ROOT_GID; ino->last_used = jiffies; } -void autofs4_free_ino(struct autofs_info *ino) +void autofs_free_ino(struct autofs_info *ino) { kfree(ino); } -void autofs4_kill_sb(struct super_block *sb) +void autofs_kill_sb(struct super_block *sb) { - struct autofs_sb_info *sbi = autofs4_sbi(sb); + struct autofs_sb_info *sbi = autofs_sbi(sb); /* * In the event of a failure in get_sb_nodev the superblock @@ -56,7 +52,7 @@ void autofs4_kill_sb(struct super_block *sb) */ if (sbi) { /* Free wait queues, close pipe */ - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); put_pid(sbi->oz_pgrp); } @@ -66,9 +62,9 @@ void autofs4_kill_sb(struct super_block *sb) kfree_rcu(sbi, rcu); } -static int autofs4_show_options(struct seq_file *m, struct dentry *root) +static int autofs_show_options(struct seq_file *m, struct dentry *root) { - struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct inode *root_inode = d_inode(root->d_sb->s_root); if (!sbi) @@ -101,16 +97,16 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void autofs4_evict_inode(struct inode *inode) +static void autofs_evict_inode(struct inode *inode) { clear_inode(inode); kfree(inode->i_private); } -static const struct super_operations autofs4_sops = { +static const struct super_operations autofs_sops = { .statfs = simple_statfs, - .show_options = autofs4_show_options, - .evict_inode = autofs4_evict_inode, + .show_options = autofs_show_options, + .evict_inode = autofs_evict_inode, }; enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, @@ -206,7 +202,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, return (*pipefd < 0); } -int autofs4_fill_super(struct super_block *s, void *data, int silent) +int autofs_fill_super(struct super_block *s, void *data, int silent) { struct inode *root_inode; struct dentry *root; @@ -246,19 +242,19 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = AUTOFS_SUPER_MAGIC; - s->s_op = &autofs4_sops; - s->s_d_op = &autofs4_dentry_operations; + s->s_op = &autofs_sops; + s->s_d_op = &autofs_dentry_operations; s->s_time_gran = 1; /* * Get the root inode and dentry, but defer checking for errors. */ - ino = autofs4_new_ino(sbi); + ino = autofs_new_ino(sbi); if (!ino) { ret = -ENOMEM; goto fail_free; } - root_inode = autofs4_get_inode(s, S_IFDIR | 0755); + root_inode = autofs_get_inode(s, S_IFDIR | 0755); root = d_make_root(root_inode); if (!root) goto fail_ino; @@ -305,8 +301,8 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (autofs_type_trigger(sbi->type)) __managed_dentry_set_managed(root); - root_inode->i_fop = &autofs4_root_operations; - root_inode->i_op = &autofs4_dir_inode_operations; + root_inode->i_fop = &autofs_root_operations; + root_inode->i_op = &autofs_dir_inode_operations; pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); @@ -340,14 +336,14 @@ fail_dput: dput(root); goto fail_free; fail_ino: - autofs4_free_ino(ino); + autofs_free_ino(ino); fail_free: kfree(sbi); s->s_fs_info = NULL; return ret; } -struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) +struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); @@ -364,10 +360,10 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) if (S_ISDIR(mode)) { set_nlink(inode, 2); - inode->i_op = &autofs4_dir_inode_operations; - inode->i_fop = &autofs4_dir_operations; + inode->i_op = &autofs_dir_inode_operations; + inode->i_fop = &autofs_dir_operations; } else if (S_ISLNK(mode)) { - inode->i_op = &autofs4_symlink_inode_operations; + inode->i_op = &autofs_symlink_inode_operations; } else WARN_ON(1); diff --git a/fs/autofs4/root.c b/fs/autofs/root.c index b12e37f27530..a3d414150578 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs/root.c @@ -9,72 +9,66 @@ */ #include <linux/capability.h> -#include <linux/errno.h> -#include <linux/stat.h> -#include <linux/slab.h> -#include <linux/param.h> -#include <linux/time.h> #include <linux/compat.h> -#include <linux/mutex.h> #include "autofs_i.h" -static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *); -static int autofs4_dir_unlink(struct inode *, struct dentry *); -static int autofs4_dir_rmdir(struct inode *, struct dentry *); -static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t); -static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long); +static int autofs_dir_symlink(struct inode *, struct dentry *, const char *); +static int autofs_dir_unlink(struct inode *, struct dentry *); +static int autofs_dir_rmdir(struct inode *, struct dentry *); +static int autofs_dir_mkdir(struct inode *, struct dentry *, umode_t); +static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT -static long autofs4_root_compat_ioctl(struct file *, - unsigned int, unsigned long); +static long autofs_root_compat_ioctl(struct file *, + unsigned int, unsigned long); #endif -static int autofs4_dir_open(struct inode *inode, struct file *file); -static struct dentry *autofs4_lookup(struct inode *, - struct dentry *, unsigned int); -static struct vfsmount *autofs4_d_automount(struct path *); -static int autofs4_d_manage(const struct path *, bool); -static void autofs4_dentry_release(struct dentry *); - -const struct file_operations autofs4_root_operations = { +static int autofs_dir_open(struct inode *inode, struct file *file); +static struct dentry *autofs_lookup(struct inode *, + struct dentry *, unsigned int); +static struct vfsmount *autofs_d_automount(struct path *); +static int autofs_d_manage(const struct path *, bool); +static void autofs_dentry_release(struct dentry *); + +const struct file_operations autofs_root_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .read = generic_read_dir, .iterate_shared = dcache_readdir, .llseek = dcache_dir_lseek, - .unlocked_ioctl = autofs4_root_ioctl, + .unlocked_ioctl = autofs_root_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = autofs4_root_compat_ioctl, + .compat_ioctl = autofs_root_compat_ioctl, #endif }; -const struct file_operations autofs4_dir_operations = { - .open = autofs4_dir_open, +const struct file_operations autofs_dir_operations = { + .open = autofs_dir_open, .release = dcache_dir_close, .read = generic_read_dir, .iterate_shared = dcache_readdir, .llseek = dcache_dir_lseek, }; -const struct inode_operations autofs4_dir_inode_operations = { - .lookup = autofs4_lookup, - .unlink = autofs4_dir_unlink, - .symlink = autofs4_dir_symlink, - .mkdir = autofs4_dir_mkdir, - .rmdir = autofs4_dir_rmdir, +const struct inode_operations autofs_dir_inode_operations = { + .lookup = autofs_lookup, + .unlink = autofs_dir_unlink, + .symlink = autofs_dir_symlink, + .mkdir = autofs_dir_mkdir, + .rmdir = autofs_dir_rmdir, }; -const struct dentry_operations autofs4_dentry_operations = { - .d_automount = autofs4_d_automount, - .d_manage = autofs4_d_manage, - .d_release = autofs4_dentry_release, +const struct dentry_operations autofs_dentry_operations = { + .d_automount = autofs_d_automount, + .d_manage = autofs_d_manage, + .d_release = autofs_dentry_release, }; -static void autofs4_add_active(struct dentry *dentry) +static void autofs_add_active(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct autofs_info *ino; - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); if (!ino->active_count) { @@ -86,12 +80,12 @@ static void autofs4_add_active(struct dentry *dentry) } } -static void autofs4_del_active(struct dentry *dentry) +static void autofs_del_active(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct autofs_info *ino; - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); ino->active_count--; @@ -103,14 +97,14 @@ static void autofs4_del_active(struct dentry *dentry) } } -static int autofs4_dir_open(struct inode *inode, struct file *file) +static int autofs_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry); - if (autofs4_oz_mode(sbi)) + if (autofs_oz_mode(sbi)) goto out; /* @@ -133,10 +127,10 @@ out: return dcache_dir_open(inode, file); } -static void autofs4_dentry_release(struct dentry *de) +static void autofs_dentry_release(struct dentry *de) { - struct autofs_info *ino = autofs4_dentry_ino(de); - struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); + struct autofs_info *ino = autofs_dentry_ino(de); + struct autofs_sb_info *sbi = autofs_sbi(de->d_sb); pr_debug("releasing %p\n", de); @@ -152,12 +146,12 @@ static void autofs4_dentry_release(struct dentry *de) spin_unlock(&sbi->lookup_lock); } - autofs4_free_ino(ino); + autofs_free_ino(ino); } -static struct dentry *autofs4_lookup_active(struct dentry *dentry) +static struct dentry *autofs_lookup_active(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct dentry *parent = dentry->d_parent; const struct qstr *name = &dentry->d_name; unsigned int len = name->len; @@ -209,10 +203,10 @@ next: return NULL; } -static struct dentry *autofs4_lookup_expiring(struct dentry *dentry, - bool rcu_walk) +static struct dentry *autofs_lookup_expiring(struct dentry *dentry, + bool rcu_walk) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct dentry *parent = dentry->d_parent; const struct qstr *name = &dentry->d_name; unsigned int len = name->len; @@ -269,17 +263,17 @@ next: return NULL; } -static int autofs4_mount_wait(const struct path *path, bool rcu_walk) +static int autofs_mount_wait(const struct path *path, bool rcu_walk) { - struct autofs_sb_info *sbi = autofs4_sbi(path->dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(path->dentry); + struct autofs_sb_info *sbi = autofs_sbi(path->dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(path->dentry); int status = 0; if (ino->flags & AUTOFS_INF_PENDING) { if (rcu_walk) return -ECHILD; pr_debug("waiting for mount name=%pd\n", path->dentry); - status = autofs4_wait(sbi, path, NFY_MOUNT); + status = autofs_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); } ino->last_used = jiffies; @@ -291,11 +285,11 @@ static int do_expire_wait(const struct path *path, bool rcu_walk) struct dentry *dentry = path->dentry; struct dentry *expiring; - expiring = autofs4_lookup_expiring(dentry, rcu_walk); + expiring = autofs_lookup_expiring(dentry, rcu_walk); if (IS_ERR(expiring)) return PTR_ERR(expiring); if (!expiring) - return autofs4_expire_wait(path, rcu_walk); + return autofs_expire_wait(path, rcu_walk); else { const struct path this = { .mnt = path->mnt, .dentry = expiring }; /* @@ -303,17 +297,17 @@ static int do_expire_wait(const struct path *path, bool rcu_walk) * be quite complete, but the directory has been removed * so it must have been successful, just wait for it. */ - autofs4_expire_wait(&this, 0); - autofs4_del_expiring(expiring); + autofs_expire_wait(&this, 0); + autofs_del_expiring(expiring); dput(expiring); } return 0; } -static struct dentry *autofs4_mountpoint_changed(struct path *path) +static struct dentry *autofs_mountpoint_changed(struct path *path) { struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); /* * If this is an indirect mount the dentry could have gone away @@ -327,7 +321,7 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; - ino = autofs4_dentry_ino(new); + ino = autofs_dentry_ino(new); ino->last_used = jiffies; dput(path->dentry); path->dentry = new; @@ -335,17 +329,17 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) return path->dentry; } -static struct vfsmount *autofs4_d_automount(struct path *path) +static struct vfsmount *autofs_d_automount(struct path *path) { struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); int status; pr_debug("dentry=%p %pd\n", dentry, dentry); /* The daemon never triggers a mount. */ - if (autofs4_oz_mode(sbi)) + if (autofs_oz_mode(sbi)) return NULL; /* @@ -364,7 +358,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) spin_lock(&sbi->fs_lock); if (ino->flags & AUTOFS_INF_PENDING) { spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(path, 0); + status = autofs_mount_wait(path, 0); if (status) return ERR_PTR(status); goto done; @@ -405,7 +399,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) } ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(path, 0); + status = autofs_mount_wait(path, 0); spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_PENDING; if (status) { @@ -416,24 +410,24 @@ static struct vfsmount *autofs4_d_automount(struct path *path) spin_unlock(&sbi->fs_lock); done: /* Mount succeeded, check if we ended up with a new dentry */ - dentry = autofs4_mountpoint_changed(path); + dentry = autofs_mountpoint_changed(path); if (!dentry) return ERR_PTR(-ENOENT); return NULL; } -static int autofs4_d_manage(const struct path *path, bool rcu_walk) +static int autofs_d_manage(const struct path *path, bool rcu_walk) { struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); int status; pr_debug("dentry=%p %pd\n", dentry, dentry); /* The daemon never waits. */ - if (autofs4_oz_mode(sbi)) { + if (autofs_oz_mode(sbi)) { if (!path_is_mountpoint(path)) return -EISDIR; return 0; @@ -447,7 +441,7 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk) * This dentry may be under construction so wait on mount * completion. */ - status = autofs4_mount_wait(path, rcu_walk); + status = autofs_mount_wait(path, rcu_walk); if (status) return status; @@ -500,8 +494,8 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk) } /* Lookups in the root directory */ -static struct dentry *autofs4_lookup(struct inode *dir, - struct dentry *dentry, unsigned int flags) +static struct dentry *autofs_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) { struct autofs_sb_info *sbi; struct autofs_info *ino; @@ -513,13 +507,13 @@ static struct dentry *autofs4_lookup(struct inode *dir, if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - sbi = autofs4_sbi(dir->i_sb); + sbi = autofs_sbi(dir->i_sb); pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", current->pid, task_pgrp_nr(current), sbi->catatonic, - autofs4_oz_mode(sbi)); + autofs_oz_mode(sbi)); - active = autofs4_lookup_active(dentry); + active = autofs_lookup_active(dentry); if (active) return active; else { @@ -529,7 +523,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, * can return fail immediately. The daemon however does need * to create directories within the file system. */ - if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) + if (!autofs_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) return ERR_PTR(-ENOENT); /* Mark entries in the root as mount triggers */ @@ -537,24 +531,24 @@ static struct dentry *autofs4_lookup(struct inode *dir, autofs_type_indirect(sbi->type)) __managed_dentry_set_managed(dentry); - ino = autofs4_new_ino(sbi); + ino = autofs_new_ino(sbi); if (!ino) return ERR_PTR(-ENOMEM); dentry->d_fsdata = ino; ino->dentry = dentry; - autofs4_add_active(dentry); + autofs_add_active(dentry); } return NULL; } -static int autofs4_dir_symlink(struct inode *dir, +static int autofs_dir_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; size_t size = strlen(symname); @@ -562,14 +556,14 @@ static int autofs4_dir_symlink(struct inode *dir, pr_debug("%s <- %pd\n", symname, dentry); - if (!autofs4_oz_mode(sbi)) + if (!autofs_oz_mode(sbi)) return -EACCES; BUG_ON(!ino); - autofs4_clean_ino(ino); + autofs_clean_ino(ino); - autofs4_del_active(dentry); + autofs_del_active(dentry); cp = kmalloc(size + 1, GFP_KERNEL); if (!cp) @@ -577,7 +571,7 @@ static int autofs4_dir_symlink(struct inode *dir, strcpy(cp, symname); - inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555); + inode = autofs_get_inode(dir->i_sb, S_IFLNK | 0555); if (!inode) { kfree(cp); return -ENOMEM; @@ -588,7 +582,7 @@ static int autofs4_dir_symlink(struct inode *dir, dget(dentry); atomic_inc(&ino->count); - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); @@ -610,20 +604,20 @@ static int autofs4_dir_symlink(struct inode *dir, * If a process is blocked on the dentry waiting for the expire to finish, * it will invalidate the dentry and try to mount with a new one. * - * Also see autofs4_dir_rmdir().. + * Also see autofs_dir_rmdir().. */ -static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) +static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; /* This allows root to remove symlinks */ - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && !IS_ROOT(dentry)) atomic_dec(&p_ino->count); } @@ -635,7 +629,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) dir->i_mtime = current_time(dir); spin_lock(&sbi->lookup_lock); - __autofs4_add_expiring(dentry); + __autofs_add_expiring(dentry); d_drop(dentry); spin_unlock(&sbi->lookup_lock); @@ -692,15 +686,15 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry) managed_dentry_set_managed(parent); } -static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) +static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; pr_debug("dentry %p, removing %pd\n", dentry, dentry); - if (!autofs4_oz_mode(sbi)) + if (!autofs_oz_mode(sbi)) return -EACCES; spin_lock(&sbi->lookup_lock); @@ -708,7 +702,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) spin_unlock(&sbi->lookup_lock); return -ENOTEMPTY; } - __autofs4_add_expiring(dentry); + __autofs_add_expiring(dentry); d_drop(dentry); spin_unlock(&sbi->lookup_lock); @@ -716,7 +710,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) autofs_clear_leaf_automount_flags(dentry); if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) atomic_dec(&p_ino->count); } @@ -730,26 +724,26 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs4_dir_mkdir(struct inode *dir, - struct dentry *dentry, umode_t mode) +static int autofs_dir_mkdir(struct inode *dir, + struct dentry *dentry, umode_t mode) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; - if (!autofs4_oz_mode(sbi)) + if (!autofs_oz_mode(sbi)) return -EACCES; pr_debug("dentry %p, creating %pd\n", dentry, dentry); BUG_ON(!ino); - autofs4_clean_ino(ino); + autofs_clean_ino(ino); - autofs4_del_active(dentry); + autofs_del_active(dentry); - inode = autofs4_get_inode(dir->i_sb, S_IFDIR | mode); + inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode); if (!inode) return -ENOMEM; d_add(dentry, inode); @@ -759,7 +753,7 @@ static int autofs4_dir_mkdir(struct inode *dir, dget(dentry); atomic_inc(&ino->count); - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); inc_nlink(dir); @@ -770,7 +764,7 @@ static int autofs4_dir_mkdir(struct inode *dir, /* Get/set timeout ioctl() operation */ #ifdef CONFIG_COMPAT -static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, +static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi, compat_ulong_t __user *p) { unsigned long ntimeout; @@ -795,7 +789,7 @@ error: } #endif -static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, +static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi, unsigned long __user *p) { unsigned long ntimeout; @@ -820,14 +814,14 @@ error: } /* Return protocol version */ -static inline int autofs4_get_protover(struct autofs_sb_info *sbi, +static inline int autofs_get_protover(struct autofs_sb_info *sbi, int __user *p) { return put_user(sbi->version, p); } /* Return protocol sub version */ -static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, +static inline int autofs_get_protosubver(struct autofs_sb_info *sbi, int __user *p) { return put_user(sbi->sub_version, p); @@ -836,7 +830,7 @@ static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, /* * Tells the daemon whether it can umount the autofs mount. */ -static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) +static inline int autofs_ask_umount(struct vfsmount *mnt, int __user *p) { int status = 0; @@ -850,14 +844,14 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) return status; } -/* Identify autofs4_dentries - this is so we can tell if there's +/* Identify autofs_dentries - this is so we can tell if there's * an extra dentry refcount or not. We only hold a refcount on the * dentry if its non-negative (ie, d_inode != NULL) */ -int is_autofs4_dentry(struct dentry *dentry) +int is_autofs_dentry(struct dentry *dentry) { return dentry && d_really_is_positive(dentry) && - dentry->d_op == &autofs4_dentry_operations && + dentry->d_op == &autofs_dentry_operations && dentry->d_fsdata != NULL; } @@ -865,10 +859,10 @@ int is_autofs4_dentry(struct dentry *dentry) * ioctl()'s on the root directory is the chief method for the daemon to * generate kernel reactions */ -static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, +static int autofs_root_ioctl_unlocked(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { - struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); + struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); void __user *p = (void __user *)arg; pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", @@ -878,64 +872,63 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) return -ENOTTY; - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EPERM; switch (cmd) { case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ - return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0); + return autofs_wait_release(sbi, (autofs_wqt_t) arg, 0); case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ - return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT); + return autofs_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT); case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); return 0; case AUTOFS_IOC_PROTOVER: /* Get protocol version */ - return autofs4_get_protover(sbi, p); + return autofs_get_protover(sbi, p); case AUTOFS_IOC_PROTOSUBVER: /* Get protocol sub version */ - return autofs4_get_protosubver(sbi, p); + return autofs_get_protosubver(sbi, p); case AUTOFS_IOC_SETTIMEOUT: - return autofs4_get_set_timeout(sbi, p); + return autofs_get_set_timeout(sbi, p); #ifdef CONFIG_COMPAT case AUTOFS_IOC_SETTIMEOUT32: - return autofs4_compat_get_set_timeout(sbi, p); + return autofs_compat_get_set_timeout(sbi, p); #endif case AUTOFS_IOC_ASKUMOUNT: - return autofs4_ask_umount(filp->f_path.mnt, p); + return autofs_ask_umount(filp->f_path.mnt, p); /* return a single thing to expire */ case AUTOFS_IOC_EXPIRE: - return autofs4_expire_run(inode->i_sb, - filp->f_path.mnt, sbi, p); + return autofs_expire_run(inode->i_sb, filp->f_path.mnt, sbi, p); /* same as above, but can send multiple expires through pipe */ case AUTOFS_IOC_EXPIRE_MULTI: - return autofs4_expire_multi(inode->i_sb, - filp->f_path.mnt, sbi, p); + return autofs_expire_multi(inode->i_sb, + filp->f_path.mnt, sbi, p); default: return -EINVAL; } } -static long autofs4_root_ioctl(struct file *filp, +static long autofs_root_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); - return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + return autofs_root_ioctl_unlocked(inode, filp, cmd, arg); } #ifdef CONFIG_COMPAT -static long autofs4_root_compat_ioctl(struct file *filp, +static long autofs_root_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); int ret; if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) - ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + ret = autofs_root_ioctl_unlocked(inode, filp, cmd, arg); else - ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, + ret = autofs_root_ioctl_unlocked(inode, filp, cmd, (unsigned long) compat_ptr(arg)); return ret; diff --git a/fs/autofs4/symlink.c b/fs/autofs/symlink.c index ab0b4285a202..aad3902c0cc1 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs/symlink.c @@ -8,22 +8,22 @@ #include "autofs_i.h" -static const char *autofs4_get_link(struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) +static const char *autofs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { struct autofs_sb_info *sbi; struct autofs_info *ino; if (!dentry) return ERR_PTR(-ECHILD); - sbi = autofs4_sbi(dentry->d_sb); - ino = autofs4_dentry_ino(dentry); - if (ino && !autofs4_oz_mode(sbi)) + sbi = autofs_sbi(dentry->d_sb); + ino = autofs_dentry_ino(dentry); + if (ino && !autofs_oz_mode(sbi)) ino->last_used = jiffies; return d_inode(dentry)->i_private; } -const struct inode_operations autofs4_symlink_inode_operations = { - .get_link = autofs4_get_link +const struct inode_operations autofs_symlink_inode_operations = { + .get_link = autofs_get_link }; diff --git a/fs/autofs4/waitq.c b/fs/autofs/waitq.c index be9c3dc048ab..f6385c6ef0a5 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs/waitq.c @@ -7,19 +7,15 @@ * option, any later version, incorporated herein by reference. */ -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/signal.h> #include <linux/sched/signal.h> -#include <linux/file.h> #include "autofs_i.h" /* We make this a static variable rather than a part of the superblock; it * is better if we don't reassign numbers easily even across filesystems */ -static autofs_wqt_t autofs4_next_wait_queue = 1; +static autofs_wqt_t autofs_next_wait_queue = 1; -void autofs4_catatonic_mode(struct autofs_sb_info *sbi) +void autofs_catatonic_mode(struct autofs_sb_info *sbi) { struct autofs_wait_queue *wq, *nwq; @@ -49,8 +45,8 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) mutex_unlock(&sbi->wq_mutex); } -static int autofs4_write(struct autofs_sb_info *sbi, - struct file *file, const void *addr, int bytes) +static int autofs_write(struct autofs_sb_info *sbi, + struct file *file, const void *addr, int bytes) { unsigned long sigpipe, flags; const char *data = (const char *)addr; @@ -82,7 +78,7 @@ static int autofs4_write(struct autofs_sb_info *sbi, return bytes == 0 ? 0 : wr < 0 ? wr : -EIO; } -static void autofs4_notify_daemon(struct autofs_sb_info *sbi, +static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq, int type) { @@ -167,23 +163,23 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); - switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { + switch (ret = autofs_write(sbi, pipe, &pkt, pktsz)) { case 0: break; case -ENOMEM: case -ERESTARTSYS: /* Just fail this one */ - autofs4_wait_release(sbi, wq->wait_queue_token, ret); + autofs_wait_release(sbi, wq->wait_queue_token, ret); break; default: - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); break; } fput(pipe); } -static int autofs4_getpath(struct autofs_sb_info *sbi, - struct dentry *dentry, char **name) +static int autofs_getpath(struct autofs_sb_info *sbi, + struct dentry *dentry, char *name) { struct dentry *root = sbi->sb->s_root; struct dentry *tmp; @@ -193,7 +189,7 @@ static int autofs4_getpath(struct autofs_sb_info *sbi, unsigned seq; rename_retry: - buf = *name; + buf = name; len = 0; seq = read_seqbegin(&rename_lock); @@ -228,7 +224,7 @@ rename_retry: } static struct autofs_wait_queue * -autofs4_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) +autofs_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) { struct autofs_wait_queue *wq; @@ -263,7 +259,7 @@ static int validate_request(struct autofs_wait_queue **wait, return -ENOENT; /* Wait in progress, continue; */ - wq = autofs4_find_wait(sbi, qstr); + wq = autofs_find_wait(sbi, qstr); if (wq) { *wait = wq; return 1; @@ -272,7 +268,7 @@ static int validate_request(struct autofs_wait_queue **wait, *wait = NULL; /* If we don't yet have any info this is a new request */ - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (!ino) return 1; @@ -297,7 +293,7 @@ static int validate_request(struct autofs_wait_queue **wait, if (sbi->catatonic) return -ENOENT; - wq = autofs4_find_wait(sbi, qstr); + wq = autofs_find_wait(sbi, qstr); if (wq) { *wait = wq; return 1; @@ -351,7 +347,7 @@ static int validate_request(struct autofs_wait_queue **wait, return 1; } -int autofs4_wait(struct autofs_sb_info *sbi, +int autofs_wait(struct autofs_sb_info *sbi, const struct path *path, enum autofs_notify notify) { struct dentry *dentry = path->dentry; @@ -399,7 +395,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) qstr.len = sprintf(name, "%p", dentry); else { - qstr.len = autofs4_getpath(sbi, dentry, &name); + qstr.len = autofs_getpath(sbi, dentry, name); if (!qstr.len) { kfree(name); return -ENOENT; @@ -430,15 +426,15 @@ int autofs4_wait(struct autofs_sb_info *sbi, return -ENOMEM; } - wq->wait_queue_token = autofs4_next_wait_queue; - if (++autofs4_next_wait_queue == 0) - autofs4_next_wait_queue = 1; + wq->wait_queue_token = autofs_next_wait_queue; + if (++autofs_next_wait_queue == 0) + autofs_next_wait_queue = 1; wq->next = sbi->queues; sbi->queues = wq; init_waitqueue_head(&wq->queue); memcpy(&wq->name, &qstr, sizeof(struct qstr)); - wq->dev = autofs4_get_dev(sbi); - wq->ino = autofs4_get_ino(sbi); + wq->dev = autofs_get_dev(sbi); + wq->ino = autofs_get_ino(sbi); wq->uid = current_uid(); wq->gid = current_gid(); wq->pid = pid; @@ -467,9 +463,9 @@ int autofs4_wait(struct autofs_sb_info *sbi, wq->name.name, notify); /* - * autofs4_notify_daemon() may block; it will unlock ->wq_mutex + * autofs_notify_daemon() may block; it will unlock ->wq_mutex */ - autofs4_notify_daemon(sbi, wq, type); + autofs_notify_daemon(sbi, wq, type); } else { wq->wait_ctr++; pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", @@ -500,12 +496,12 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *de = NULL; /* direct mount or browsable map */ - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (!ino) { /* If not lookup actual dentry used */ de = d_lookup(dentry->d_parent, &dentry->d_name); if (de) - ino = autofs4_dentry_ino(de); + ino = autofs_dentry_ino(de); } /* Set mount requester */ @@ -530,7 +526,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, } -int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status) +int autofs_wait_release(struct autofs_sb_info *sbi, + autofs_wqt_t wait_queue_token, int status) { struct autofs_wait_queue *wq, **wql; diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig index 44727bf18297..99fda4d6da25 100644 --- a/fs/autofs4/Kconfig +++ b/fs/autofs4/Kconfig @@ -1,5 +1,7 @@ config AUTOFS4_FS - tristate "Kernel automounter version 4 support (also supports v3)" + tristate "Kernel automounter version 4 support (also supports v3 and v5)" + default n + depends on AUTOFS_FS = n help The automounter is a tool to automatically mount remote file systems on demand. This implementation is partially kernel-based to reduce @@ -7,14 +9,38 @@ config AUTOFS4_FS automounter (amd), which is a pure user space daemon. To use the automounter you need the user-space tools from - <https://www.kernel.org/pub/linux/daemons/autofs/v4/>; you also - want to answer Y to "NFS file system support", below. + <https://www.kernel.org/pub/linux/daemons/autofs/>; you also want + to answer Y to "NFS file system support", below. - To compile this support as a module, choose M here: the module will be - called autofs4. You will need to add "alias autofs autofs4" to your - modules configuration file. + This module is in the process of being renamed from autofs4 to + autofs. Since autofs is now the only module that provides the + autofs file system the module is not version 4 specific. - If you are not a part of a fairly large, distributed network or - don't have a laptop which needs to dynamically reconfigure to the - local network, you probably do not need an automounter, and can say - N here. + The autofs4 module is now built from the source located in + fs/autofs. The autofs4 directory and its configuration entry + will be removed two kernel versions from the inclusion of this + change. + + Changes that will need to be made should be limited to: + - source include statments should be changed from autofs_fs4.h to + autofs_fs.h since these two header files have been merged. + - user space scripts that manually load autofs4.ko should be + changed to load autofs.ko. But since the module directory name + and the module name are the same as the file system name there + is no need to manually load module. + - any "alias autofs autofs4" will need to be removed. + - due to the autofs4 module directory name not being the same as + its file system name autoloading didn't work properly. Because + of this kernel configurations would often build the module into + the kernel. This may have resulted in selinux policies that will + prevent the autofs module from autoloading and will need to be + updated. + + Please configure AUTOFS_FS instead of AUTOFS4_FS from now on. + + NOTE: Since the modules autofs and autofs4 use the same file system + type name of "autofs" only one can be built. The "depends" + above will result in AUTOFS4_FS not appearing in .config for + any setting of AUTOFS_FS other than n and AUTOFS4_FS will + appear under the AUTOFS_FS entry otherwise which is intended + to draw attention to the module rename change. diff --git a/fs/autofs4/Makefile b/fs/autofs4/Makefile index a811c1f7d9ab..417dd726d9ef 100644 --- a/fs/autofs4/Makefile +++ b/fs/autofs4/Makefile @@ -4,4 +4,6 @@ obj-$(CONFIG_AUTOFS4_FS) += autofs4.o -autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o +autofs4-objs := ../autofs/init.o ../autofs/inode.o ../autofs/root.o \ + ../autofs/symlink.o ../autofs/waitq.o ../autofs/expire.o \ + ../autofs/dev-ioctl.o diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a41b48f82a70..4de191563261 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -387,8 +387,13 @@ static Node *create_entry(const char __user *buffer, size_t count) s = strchr(p, del); if (!s) goto einval; - *s++ = '\0'; - e->offset = simple_strtoul(p, &p, 10); + *s = '\0'; + if (p != s) { + int r = kstrtoint(p, 10, &e->offset); + if (r != 0 || e->offset < 0) + goto einval; + } + p = s; if (*p++) goto einval; pr_debug("register: offset: %#x\n", e->offset); @@ -428,7 +433,8 @@ static Node *create_entry(const char __user *buffer, size_t count) if (e->mask && string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) goto einval; - if (e->size + e->offset > BINPRM_BUF_SIZE) + if (e->size > BINPRM_BUF_SIZE || + BINPRM_BUF_SIZE - e->size < e->offset) goto einval; pr_debug("register: magic/mask length: %i\n", e->size); if (USE_DEBUG) { diff --git a/fs/block_dev.c b/fs/block_dev.c index bef6934b6189..05e12aea2404 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -216,6 +216,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; bio.bi_end_io = blkdev_bio_end_io_simple; + bio.bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(&bio, iter); if (unlikely(ret)) @@ -355,6 +356,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; + bio->bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) { diff --git a/fs/buffer.c b/fs/buffer.c index 249b83fafe48..cabc045f483d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3427,120 +3427,6 @@ int bh_submit_read(struct buffer_head *bh) } EXPORT_SYMBOL(bh_submit_read); -/* - * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. - * - * Returns the offset within the file on success, and -ENOENT otherwise. - */ -static loff_t -page_seek_hole_data(struct page *page, loff_t lastoff, int whence) -{ - loff_t offset = page_offset(page); - struct buffer_head *bh, *head; - bool seek_data = whence == SEEK_DATA; - - if (lastoff < offset) - lastoff = offset; - - bh = head = page_buffers(page); - do { - offset += bh->b_size; - if (lastoff >= offset) - continue; - - /* - * Unwritten extents that have data in the page cache covering - * them can be identified by the BH_Unwritten state flag. - * Pages with multiple buffers might have a mix of holes, data - * and unwritten extents - any buffer with valid data in it - * should have BH_Uptodate flag set on it. - */ - - if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data) - return lastoff; - - lastoff = offset; - } while ((bh = bh->b_this_page) != head); - return -ENOENT; -} - -/* - * Seek for SEEK_DATA / SEEK_HOLE in the page cache. - * - * Within unwritten extents, the page cache determines which parts are holes - * and which are data: unwritten and uptodate buffer heads count as data; - * everything else counts as a hole. - * - * Returns the resulting offset on successs, and -ENOENT otherwise. - */ -loff_t -page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, - int whence) -{ - pgoff_t index = offset >> PAGE_SHIFT; - pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); - loff_t lastoff = offset; - struct pagevec pvec; - - if (length <= 0) - return -ENOENT; - - pagevec_init(&pvec); - - do { - unsigned nr_pages, i; - - nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, - end - 1); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - * - * If current page offset is beyond where we've ended, - * we've found a hole. - */ - if (whence == SEEK_HOLE && - lastoff < page_offset(page)) - goto check_range; - - lock_page(page); - if (likely(page->mapping == inode->i_mapping) && - page_has_buffers(page)) { - lastoff = page_seek_hole_data(page, lastoff, whence); - if (lastoff >= 0) { - unlock_page(page); - goto check_range; - } - } - unlock_page(page); - lastoff = page_offset(page) + PAGE_SIZE; - } - pagevec_release(&pvec); - } while (index < end); - - /* When no page at lastoff and we are not done, we found a hole. */ - if (whence != SEEK_HOLE) - goto not_found; - -check_range: - if (lastoff < offset + length) - goto out; -not_found: - lastoff = -ENOENT; -out: - pagevec_release(&pvec); - return lastoff; -} - void __init buffer_init(void) { unsigned long nrpages; diff --git a/fs/compat.c b/fs/compat.c index 190b38b39d9e..4a0aaaf53217 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -16,79 +16,12 @@ */ #include <linux/compat.h> -#include <linux/ncp_mount.h> #include <linux/nfs4_mount.h> #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/uaccess.h> #include "internal.h" -struct compat_ncp_mount_data { - compat_int_t version; - compat_uint_t ncp_fd; - __compat_uid_t mounted_uid; - compat_pid_t wdog_pid; - unsigned char mounted_vol[NCP_VOLNAME_LEN + 1]; - compat_uint_t time_out; - compat_uint_t retry_count; - compat_uint_t flags; - __compat_uid_t uid; - __compat_gid_t gid; - compat_mode_t file_mode; - compat_mode_t dir_mode; -}; - -struct compat_ncp_mount_data_v4 { - compat_int_t version; - compat_ulong_t flags; - compat_ulong_t mounted_uid; - compat_long_t wdog_pid; - compat_uint_t ncp_fd; - compat_uint_t time_out; - compat_uint_t retry_count; - compat_ulong_t uid; - compat_ulong_t gid; - compat_ulong_t file_mode; - compat_ulong_t dir_mode; -}; - -static void *do_ncp_super_data_conv(void *raw_data) -{ - int version = *(unsigned int *)raw_data; - - if (version == 3) { - struct compat_ncp_mount_data *c_n = raw_data; - struct ncp_mount_data *n = raw_data; - - n->dir_mode = c_n->dir_mode; - n->file_mode = c_n->file_mode; - n->gid = c_n->gid; - n->uid = c_n->uid; - memmove (n->mounted_vol, c_n->mounted_vol, (sizeof (c_n->mounted_vol) + 3 * sizeof (unsigned int))); - n->wdog_pid = c_n->wdog_pid; - n->mounted_uid = c_n->mounted_uid; - } else if (version == 4) { - struct compat_ncp_mount_data_v4 *c_n = raw_data; - struct ncp_mount_data_v4 *n = raw_data; - - n->dir_mode = c_n->dir_mode; - n->file_mode = c_n->file_mode; - n->gid = c_n->gid; - n->uid = c_n->uid; - n->retry_count = c_n->retry_count; - n->time_out = c_n->time_out; - n->ncp_fd = c_n->ncp_fd; - n->wdog_pid = c_n->wdog_pid; - n->mounted_uid = c_n->mounted_uid; - n->flags = c_n->flags; - } else if (version != 5) { - return NULL; - } - - return raw_data; -} - - struct compat_nfs_string { compat_uint_t len; compat_uptr_t data; @@ -154,7 +87,6 @@ static int do_nfs4_super_data_conv(void *raw_data) return 0; } -#define NCPFS_NAME "ncpfs" #define NFS4_NAME "nfs4" COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, @@ -183,9 +115,7 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, goto out2; if (kernel_type && options) { - if (!strcmp(kernel_type, NCPFS_NAME)) { - do_ncp_super_data_conv(options); - } else if (!strcmp(kernel_type, NFS4_NAME)) { + if (!strcmp(kernel_type, NFS4_NAME)) { retval = -EINVAL; if (do_nfs4_super_data_conv(options)) goto out3; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index ef80085ed564..9907475b4226 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -38,8 +38,6 @@ #include <linux/ppp-ioctl.h> #include <linux/if_pppox.h> #include <linux/mtio.h> -#include <linux/auto_fs.h> -#include <linux/auto_fs4.h> #include <linux/tty.h> #include <linux/vt_kern.h> #include <linux/fb.h> diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index ce654526c0fb..243a269e6c5f 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -156,12 +156,8 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, } req = skcipher_request_alloc(tfm, gfp_flags); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", - __func__); + if (!req) return -ENOMEM; - } skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, @@ -178,9 +174,10 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res) { - printk_ratelimited(KERN_ERR - "%s: crypto_skcipher_encrypt() returned %d\n", - __func__, res); + fscrypt_err(inode->i_sb, + "%scryption failed for inode %lu, block %llu: %d", + (rw == FS_DECRYPT ? "de" : "en"), + inode->i_ino, lblk_num, res); return res; } return 0; @@ -326,7 +323,6 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - /* this should eventually be an flag in d_flags */ spin_lock(&dentry->d_lock); cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; spin_unlock(&dentry->d_lock); @@ -353,7 +349,6 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) const struct dentry_operations fscrypt_d_ops = { .d_revalidate = fscrypt_d_revalidate, }; -EXPORT_SYMBOL(fscrypt_d_ops); void fscrypt_restore_control_page(struct page *page) { @@ -422,13 +417,43 @@ fail: return res; } +void fscrypt_msg(struct super_block *sb, const char *level, + const char *fmt, ...) +{ + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + struct va_format vaf; + va_list args; + + if (!__ratelimit(&rs)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (sb) + printk("%sfscrypt (%s): %pV\n", level, sb->s_id, &vaf); + else + printk("%sfscrypt: %pV\n", level, &vaf); + va_end(args); +} + /** * fscrypt_init() - Set up for fs encryption. */ static int __init fscrypt_init(void) { + /* + * Use an unbound workqueue to allow bios to be decrypted in parallel + * even when they happen to complete on the same CPU. This sacrifices + * locality, but it's worthwhile since decryption is CPU-intensive. + * + * Also use a high-priority workqueue to prioritize decryption work, + * which blocks reads from completing, over regular application tasks. + */ fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue", - WQ_HIGHPRI, 0); + WQ_UNBOUND | WQ_HIGHPRI, + num_online_cpus()); if (!fscrypt_read_workqueue) goto fail; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index e33f3d3c5ade..d7a0f682ca12 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -59,11 +59,8 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname, /* Set up the encryption request */ req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: skcipher_request_alloc() failed\n", __func__); + if (!req) return -ENOMEM; - } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); @@ -74,8 +71,9 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname, res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res < 0) { - printk_ratelimited(KERN_ERR - "%s: Error (error code %d)\n", __func__, res); + fscrypt_err(inode->i_sb, + "Filename encryption failed for inode %lu: %d", + inode->i_ino, res); return res; } @@ -96,23 +94,14 @@ static int fname_decrypt(struct inode *inode, struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; - struct fscrypt_info *ci = inode->i_crypt_info; - struct crypto_skcipher *tfm = ci->ci_ctfm; + struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm; int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; - unsigned lim; - - lim = inode->i_sb->s_cop->max_namelen(inode); - if (iname->len <= 0 || iname->len > lim) - return -EIO; /* Allocate request */ req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", __func__); + if (!req) return -ENOMEM; - } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); @@ -127,8 +116,9 @@ static int fname_decrypt(struct inode *inode, res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); skcipher_request_free(req); if (res < 0) { - printk_ratelimited(KERN_ERR - "%s: Error (error code %d)\n", __func__, res); + fscrypt_err(inode->i_sb, + "Filename decryption failed for inode %lu: %d", + inode->i_ino, res); return res; } @@ -341,12 +331,12 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; } ret = fscrypt_get_encryption_info(dir); - if (ret && ret != -EOPNOTSUPP) + if (ret) return ret; if (dir->i_crypt_info) { if (!fscrypt_fname_encrypted_size(dir, iname->len, - dir->i_sb->s_cop->max_namelen(dir), + dir->i_sb->s_cop->max_namelen, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index ad6722bae8b7..37562394c5de 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -18,15 +18,7 @@ /* Encryption parameters */ #define FS_IV_SIZE 16 -#define FS_AES_128_ECB_KEY_SIZE 16 -#define FS_AES_128_CBC_KEY_SIZE 16 -#define FS_AES_128_CTS_KEY_SIZE 16 -#define FS_AES_256_GCM_KEY_SIZE 32 -#define FS_AES_256_CBC_KEY_SIZE 32 -#define FS_AES_256_CTS_KEY_SIZE 32 -#define FS_AES_256_XTS_KEY_SIZE 64 - -#define FS_KEY_DERIVATION_NONCE_SIZE 16 +#define FS_KEY_DERIVATION_NONCE_SIZE 16 /** * Encryption context for inode @@ -91,6 +83,10 @@ static inline bool fscrypt_valid_enc_modes(u32 contents_mode, filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) return true; + if (contents_mode == FS_ENCRYPTION_MODE_SPECK128_256_XTS && + filenames_mode == FS_ENCRYPTION_MODE_SPECK128_256_CTS) + return true; + return false; } @@ -106,6 +102,15 @@ extern int fscrypt_do_page_crypto(const struct inode *inode, gfp_t gfp_flags); extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags); +extern const struct dentry_operations fscrypt_d_ops; + +extern void __printf(3, 4) __cold +fscrypt_msg(struct super_block *sb, const char *level, const char *fmt, ...); + +#define fscrypt_warn(sb, fmt, ...) \ + fscrypt_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__) +#define fscrypt_err(sb, fmt, ...) \ + fscrypt_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__) /* fname.c */ extern int fname_encrypt(struct inode *inode, const struct qstr *iname, diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index bec06490fb13..926e5df20ec3 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -39,8 +39,9 @@ int fscrypt_file_open(struct inode *inode, struct file *filp) dir = dget_parent(file_dentry(filp)); if (IS_ENCRYPTED(d_inode(dir)) && !fscrypt_has_permitted_context(d_inode(dir), inode)) { - pr_warn_ratelimited("fscrypt: inconsistent encryption contexts: %lu/%lu", - d_inode(dir)->i_ino, inode->i_ino); + fscrypt_warn(inode->i_sb, + "inconsistent encryption contexts: %lu/%lu", + d_inode(dir)->i_ino, inode->i_ino); err = -EPERM; } dput(dir); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 05f5ee1f0705..e997ca51192f 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -19,17 +19,16 @@ static struct crypto_shash *essiv_hash_tfm; -/** - * derive_key_aes() - Derive a key using AES-128-ECB - * @deriving_key: Encryption key used for derivation. - * @source_key: Source key to which to apply derivation. - * @derived_raw_key: Derived raw key. +/* + * Key derivation function. This generates the derived key by encrypting the + * master key with AES-128-ECB using the inode's nonce as the AES key. * - * Return: Zero on success; non-zero otherwise. + * The master key must be at least as long as the derived key. If the master + * key is longer, then only the first 'derived_keysize' bytes are used. */ -static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], - const struct fscrypt_key *source_key, - u8 derived_raw_key[FS_MAX_KEY_SIZE]) +static int derive_key_aes(const u8 *master_key, + const struct fscrypt_context *ctx, + u8 *derived_key, unsigned int derived_keysize) { int res = 0; struct skcipher_request *req = NULL; @@ -51,14 +50,13 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); - res = crypto_skcipher_setkey(tfm, deriving_key, - FS_AES_128_ECB_KEY_SIZE); + res = crypto_skcipher_setkey(tfm, ctx->nonce, sizeof(ctx->nonce)); if (res < 0) goto out; - sg_init_one(&src_sg, source_key->raw, source_key->size); - sg_init_one(&dst_sg, derived_raw_key, source_key->size); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, + sg_init_one(&src_sg, master_key, derived_keysize); + sg_init_one(&dst_sg, derived_key, derived_keysize); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize, NULL); res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); out: @@ -67,101 +65,147 @@ out: return res; } -static int validate_user_key(struct fscrypt_info *crypt_info, - struct fscrypt_context *ctx, u8 *raw_key, - const char *prefix, int min_keysize) +/* + * Search the current task's subscribed keyrings for a "logon" key with + * description prefix:descriptor, and if found acquire a read lock on it and + * return a pointer to its validated payload in *payload_ret. + */ +static struct key * +find_and_lock_process_key(const char *prefix, + const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE], + unsigned int min_keysize, + const struct fscrypt_key **payload_ret) { char *description; - struct key *keyring_key; - struct fscrypt_key *master_key; + struct key *key; const struct user_key_payload *ukp; - int res; + const struct fscrypt_key *payload; description = kasprintf(GFP_NOFS, "%s%*phN", prefix, - FS_KEY_DESCRIPTOR_SIZE, - ctx->master_key_descriptor); + FS_KEY_DESCRIPTOR_SIZE, descriptor); if (!description) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - keyring_key = request_key(&key_type_logon, description, NULL); + key = request_key(&key_type_logon, description, NULL); kfree(description); - if (IS_ERR(keyring_key)) - return PTR_ERR(keyring_key); - down_read(&keyring_key->sem); - - if (keyring_key->type != &key_type_logon) { - printk_once(KERN_WARNING - "%s: key type must be logon\n", __func__); - res = -ENOKEY; - goto out; - } - ukp = user_key_payload_locked(keyring_key); - if (!ukp) { - /* key was revoked before we acquired its semaphore */ - res = -EKEYREVOKED; - goto out; + if (IS_ERR(key)) + return key; + + down_read(&key->sem); + ukp = user_key_payload_locked(key); + + if (!ukp) /* was the key revoked before we acquired its semaphore? */ + goto invalid; + + payload = (const struct fscrypt_key *)ukp->data; + + if (ukp->datalen != sizeof(struct fscrypt_key) || + payload->size < 1 || payload->size > FS_MAX_KEY_SIZE) { + fscrypt_warn(NULL, + "key with description '%s' has invalid payload", + key->description); + goto invalid; } - if (ukp->datalen != sizeof(struct fscrypt_key)) { - res = -EINVAL; - goto out; + + if (payload->size < min_keysize) { + fscrypt_warn(NULL, + "key with description '%s' is too short (got %u bytes, need %u+ bytes)", + key->description, payload->size, min_keysize); + goto invalid; } - master_key = (struct fscrypt_key *)ukp->data; - BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - - if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE - || master_key->size % AES_BLOCK_SIZE != 0) { - printk_once(KERN_WARNING - "%s: key size incorrect: %d\n", - __func__, master_key->size); - res = -ENOKEY; - goto out; + + *payload_ret = payload; + return key; + +invalid: + up_read(&key->sem); + key_put(key); + return ERR_PTR(-ENOKEY); +} + +/* Find the master key, then derive the inode's actual encryption key */ +static int find_and_derive_key(const struct inode *inode, + const struct fscrypt_context *ctx, + u8 *derived_key, unsigned int derived_keysize) +{ + struct key *key; + const struct fscrypt_key *payload; + int err; + + key = find_and_lock_process_key(FS_KEY_DESC_PREFIX, + ctx->master_key_descriptor, + derived_keysize, &payload); + if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) { + key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix, + ctx->master_key_descriptor, + derived_keysize, &payload); } - res = derive_key_aes(ctx->nonce, master_key, raw_key); -out: - up_read(&keyring_key->sem); - key_put(keyring_key); - return res; + if (IS_ERR(key)) + return PTR_ERR(key); + err = derive_key_aes(payload->raw, ctx, derived_key, derived_keysize); + up_read(&key->sem); + key_put(key); + return err; } -static const struct { +static struct fscrypt_mode { + const char *friendly_name; const char *cipher_str; int keysize; + bool logged_impl_name; } available_modes[] = { - [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", - FS_AES_256_XTS_KEY_SIZE }, - [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", - FS_AES_256_CTS_KEY_SIZE }, - [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", - FS_AES_128_CBC_KEY_SIZE }, - [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", - FS_AES_128_CTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_256_XTS] = { + .friendly_name = "AES-256-XTS", + .cipher_str = "xts(aes)", + .keysize = 64, + }, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { + .friendly_name = "AES-256-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 32, + }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { + .friendly_name = "AES-128-CBC", + .cipher_str = "cbc(aes)", + .keysize = 16, + }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { + .friendly_name = "AES-128-CTS-CBC", + .cipher_str = "cts(cbc(aes))", + .keysize = 16, + }, + [FS_ENCRYPTION_MODE_SPECK128_256_XTS] = { + .friendly_name = "Speck128/256-XTS", + .cipher_str = "xts(speck128)", + .keysize = 64, + }, + [FS_ENCRYPTION_MODE_SPECK128_256_CTS] = { + .friendly_name = "Speck128/256-CTS-CBC", + .cipher_str = "cts(cbc(speck128))", + .keysize = 32, + }, }; -static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, - const char **cipher_str_ret, int *keysize_ret) +static struct fscrypt_mode * +select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode) { - u32 mode; - if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { - pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n", - inode->i_ino, - ci->ci_data_mode, ci->ci_filename_mode); - return -EINVAL; + fscrypt_warn(inode->i_sb, + "inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)", + inode->i_ino, ci->ci_data_mode, + ci->ci_filename_mode); + return ERR_PTR(-EINVAL); } - if (S_ISREG(inode->i_mode)) { - mode = ci->ci_data_mode; - } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { - mode = ci->ci_filename_mode; - } else { - WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", - inode->i_ino, (inode->i_mode & S_IFMT)); - return -EINVAL; - } + if (S_ISREG(inode->i_mode)) + return &available_modes[ci->ci_data_mode]; + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + return &available_modes[ci->ci_filename_mode]; - *cipher_str_ret = available_modes[mode].cipher_str; - *keysize_ret = available_modes[mode].keysize; - return 0; + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return ERR_PTR(-EINVAL); } static void put_crypt_info(struct fscrypt_info *ci) @@ -184,8 +228,9 @@ static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) tfm = crypto_alloc_shash("sha256", 0, 0); if (IS_ERR(tfm)) { - pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n", - PTR_ERR(tfm)); + fscrypt_warn(NULL, + "error allocating SHA-256 transform: %ld", + PTR_ERR(tfm)); return PTR_ERR(tfm); } prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); @@ -245,8 +290,7 @@ int fscrypt_get_encryption_info(struct inode *inode) struct fscrypt_info *crypt_info; struct fscrypt_context ctx; struct crypto_skcipher *ctfm; - const char *cipher_str; - int keysize; + struct fscrypt_mode *mode; u8 *raw_key = NULL; int res; @@ -290,57 +334,59 @@ int fscrypt_get_encryption_info(struct inode *inode) memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); - res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize); - if (res) + mode = select_encryption_mode(crypt_info, inode); + if (IS_ERR(mode)) { + res = PTR_ERR(mode); goto out; + } /* * This cannot be a stack buffer because it is passed to the scatterlist * crypto API as part of key derivation. */ res = -ENOMEM; - raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS); + raw_key = kmalloc(mode->keysize, GFP_NOFS); if (!raw_key) goto out; - res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX, - keysize); - if (res && inode->i_sb->s_cop->key_prefix) { - int res2 = validate_user_key(crypt_info, &ctx, raw_key, - inode->i_sb->s_cop->key_prefix, - keysize); - if (res2) { - if (res2 == -ENOKEY) - res = -ENOKEY; - goto out; - } - } else if (res) { + res = find_and_derive_key(inode, &ctx, raw_key, mode->keysize); + if (res) goto out; - } - ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); - if (!ctfm || IS_ERR(ctfm)) { - res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n", - __func__, res, inode->i_ino); + + ctfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0); + if (IS_ERR(ctfm)) { + res = PTR_ERR(ctfm); + fscrypt_warn(inode->i_sb, + "error allocating '%s' transform for inode %lu: %d", + mode->cipher_str, inode->i_ino, res); goto out; } + if (unlikely(!mode->logged_impl_name)) { + /* + * fscrypt performance can vary greatly depending on which + * crypto algorithm implementation is used. Help people debug + * performance problems by logging the ->cra_driver_name the + * first time a mode is used. Note that multiple threads can + * race here, but it doesn't really matter. + */ + mode->logged_impl_name = true; + pr_info("fscrypt: %s using implementation \"%s\"\n", + mode->friendly_name, + crypto_skcipher_alg(ctfm)->base.cra_driver_name); + } crypt_info->ci_ctfm = ctfm; - crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); - /* - * if the provided key is longer than keysize, we use the first - * keysize bytes of the derived key only - */ - res = crypto_skcipher_setkey(ctfm, raw_key, keysize); + res = crypto_skcipher_setkey(ctfm, raw_key, mode->keysize); if (res) goto out; if (S_ISREG(inode->i_mode) && crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) { - res = init_essiv_generator(crypt_info, raw_key, keysize); + res = init_essiv_generator(crypt_info, raw_key, mode->keysize); if (res) { - pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n", - __func__, res, inode->i_ino); + fscrypt_warn(inode->i_sb, + "error initializing ESSIV generator for inode %lu: %d", + inode->i_ino, res); goto out; } } @@ -351,6 +351,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping, } } +static struct page *dax_busy_page(void *entry) +{ + unsigned long pfn; + + for_each_mapped_pfn(entry, pfn) { + struct page *page = pfn_to_page(pfn); + + if (page_ref_count(page) > 1) + return page; + } + return NULL; +} + /* * Find radix tree entry at given index. If it points to an exceptional entry, * return it with the radix tree entry locked. If the radix tree doesn't @@ -492,6 +505,90 @@ restart: return entry; } +/** + * dax_layout_busy_page - find first pinned page in @mapping + * @mapping: address space to scan for a page with ref count > 1 + * + * DAX requires ZONE_DEVICE mapped pages. These pages are never + * 'onlined' to the page allocator so they are considered idle when + * page->count == 1. A filesystem uses this interface to determine if + * any page in the mapping is busy, i.e. for DMA, or other + * get_user_pages() usages. + * + * It is expected that the filesystem is holding locks to block the + * establishment of new mappings in this address_space. I.e. it expects + * to be able to run unmap_mapping_range() and subsequently not race + * mapping_mapped() becoming true. + */ +struct page *dax_layout_busy_page(struct address_space *mapping) +{ + pgoff_t indices[PAGEVEC_SIZE]; + struct page *page = NULL; + struct pagevec pvec; + pgoff_t index, end; + unsigned i; + + /* + * In the 'limited' case get_user_pages() for dax is disabled. + */ + if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) + return NULL; + + if (!dax_mapping(mapping) || !mapping_mapped(mapping)) + return NULL; + + pagevec_init(&pvec); + index = 0; + end = -1; + + /* + * If we race get_user_pages_fast() here either we'll see the + * elevated page count in the pagevec_lookup and wait, or + * get_user_pages_fast() will see that the page it took a reference + * against is no longer mapped in the page tables and bail to the + * get_user_pages() slow path. The slow path is protected by + * pte_lock() and pmd_lock(). New references are not taken without + * holding those locks, and unmap_mapping_range() will not zero the + * pte or pmd without holding the respective lock, so we are + * guaranteed to either see new references or prevent new + * references from being established. + */ + unmap_mapping_range(mapping, 0, 0, 1); + + while (index < end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *pvec_ent = pvec.pages[i]; + void *entry; + + index = indices[i]; + if (index >= end) + break; + + if (!radix_tree_exceptional_entry(pvec_ent)) + continue; + + xa_lock_irq(&mapping->i_pages); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + if (entry) + page = dax_busy_page(entry); + put_unlocked_mapping_entry(mapping, index, entry); + xa_unlock_irq(&mapping->i_pages); + if (page) + break; + } + pagevec_remove_exceptionals(&pvec); + pagevec_release(&pvec); + index++; + + if (page) + break; + } + return page; +} +EXPORT_SYMBOL_GPL(dax_layout_busy_page); + static int __dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index, bool trunc) { @@ -905,14 +1002,13 @@ out: * If this page is ever written to we will re-fault and change the mapping to * point to real DAX storage instead. */ -static int dax_load_hole(struct address_space *mapping, void *entry, +static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, struct vm_fault *vmf) { struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; - int ret = VM_FAULT_NOPAGE; + vm_fault_t ret = VM_FAULT_NOPAGE; struct page *zero_page; - void *entry2; pfn_t pfn; zero_page = ZERO_PAGE(0); @@ -922,14 +1018,9 @@ static int dax_load_hole(struct address_space *mapping, void *entry, } pfn = page_to_pfn_t(zero_page); - entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn, - RADIX_DAX_ZERO_PAGE, false); - if (IS_ERR(entry2)) { - ret = VM_FAULT_SIGBUS; - goto out; - } - - vm_insert_mixed(vmf->vma, vaddr, pfn); + dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE, + false); + ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); out: trace_dax_load_hole(inode, vmf, ret); return ret; @@ -991,6 +1082,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iov_iter *iter = data; loff_t end = pos + length, done = 0; ssize_t ret = 0; + size_t xfer; int id; if (iov_iter_rw(iter) == READ) { @@ -1054,18 +1146,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * vfs_write(), depending on which operation we are doing. */ if (iov_iter_rw(iter) == WRITE) - map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, + xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); else - map_len = copy_to_iter(kaddr, map_len, iter); - if (map_len <= 0) { - ret = map_len ? map_len : -EFAULT; - break; - } + xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, + map_len, iter); + + pos += xfer; + length -= xfer; + done += xfer; - pos += map_len; - length -= map_len; - done += map_len; + if (xfer == 0) + ret = -EFAULT; + if (xfer < map_len) + break; } dax_read_unlock(id); @@ -1112,7 +1206,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, } EXPORT_SYMBOL_GPL(dax_iomap_rw); -static int dax_fault_return(int error) +static vm_fault_t dax_fault_return(int error) { if (error == 0) return VM_FAULT_NOPAGE; @@ -1132,7 +1226,7 @@ static bool dax_fault_is_synchronous(unsigned long flags, && (iomap->flags & IOMAP_F_DIRTY); } -static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; @@ -1145,18 +1239,18 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int error, major = 0; bool write = vmf->flags & FAULT_FLAG_WRITE; bool sync; - int vmf_ret = 0; + vm_fault_t ret = 0; void *entry; pfn_t pfn; - trace_dax_pte_fault(inode, vmf, vmf_ret); + trace_dax_pte_fault(inode, vmf, ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ if (pos >= i_size_read(inode)) { - vmf_ret = VM_FAULT_SIGBUS; + ret = VM_FAULT_SIGBUS; goto out; } @@ -1165,7 +1259,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = grab_mapping_entry(mapping, vmf->pgoff, 0); if (IS_ERR(entry)) { - vmf_ret = dax_fault_return(PTR_ERR(entry)); + ret = dax_fault_return(PTR_ERR(entry)); goto out; } @@ -1176,7 +1270,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, * retried. */ if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { - vmf_ret = VM_FAULT_NOPAGE; + ret = VM_FAULT_NOPAGE; goto unlock_entry; } @@ -1189,7 +1283,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (iomap_errp) *iomap_errp = error; if (error) { - vmf_ret = dax_fault_return(error); + ret = dax_fault_return(error); goto unlock_entry; } if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { @@ -1219,9 +1313,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto error_finish_iomap; __SetPageUptodate(vmf->cow_page); - vmf_ret = finish_fault(vmf); - if (!vmf_ret) - vmf_ret = VM_FAULT_DONE_COW; + ret = finish_fault(vmf); + if (!ret) + ret = VM_FAULT_DONE_COW; goto finish_iomap; } @@ -1240,10 +1334,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, 0, write && !sync); - if (IS_ERR(entry)) { - error = PTR_ERR(entry); - goto error_finish_iomap; - } /* * If we are doing synchronous page fault and inode needs fsync, @@ -1257,23 +1347,20 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto error_finish_iomap; } *pfnp = pfn; - vmf_ret = VM_FAULT_NEEDDSYNC | major; + ret = VM_FAULT_NEEDDSYNC | major; goto finish_iomap; } trace_dax_insert_mapping(inode, vmf, entry); if (write) - error = vm_insert_mixed_mkwrite(vma, vaddr, pfn); + ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn); else - error = vm_insert_mixed(vma, vaddr, pfn); + ret = vmf_insert_mixed(vma, vaddr, pfn); - /* -EBUSY is fine, somebody else faulted on the same PTE */ - if (error == -EBUSY) - error = 0; - break; + goto finish_iomap; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!write) { - vmf_ret = dax_load_hole(mapping, entry, vmf); + ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } /*FALLTHRU*/ @@ -1284,12 +1371,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, } error_finish_iomap: - vmf_ret = dax_fault_return(error) | major; + ret = dax_fault_return(error); finish_iomap: if (ops->iomap_end) { int copied = PAGE_SIZE; - if (vmf_ret & VM_FAULT_ERROR) + if (ret & VM_FAULT_ERROR) copied = 0; /* * The fault is done by now and there's no way back (other @@ -1302,12 +1389,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, unlock_entry: put_locked_mapping_entry(mapping, vmf->pgoff); out: - trace_dax_pte_fault_done(inode, vmf, vmf_ret); - return vmf_ret; + trace_dax_pte_fault_done(inode, vmf, ret); + return ret | major; } #ifdef CONFIG_FS_DAX_PMD -static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, +static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; @@ -1327,8 +1414,6 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, pfn = page_to_pfn_t(zero_page); ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); - if (IS_ERR(ret)) - goto fallback; ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { @@ -1348,7 +1433,7 @@ fallback: return VM_FAULT_FALLBACK; } -static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; @@ -1358,7 +1443,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, bool sync; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; - int result = VM_FAULT_FALLBACK; + vm_fault_t result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; pgoff_t max_pgoff, pgoff; void *entry; @@ -1450,8 +1535,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_PMD, write && !sync); - if (IS_ERR(entry)) - goto finish_iomap; /* * If we are doing synchronous page fault and inode needs fsync, @@ -1509,7 +1592,7 @@ out: return result; } #else -static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; @@ -1529,7 +1612,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * has done all the necessary locking for page fault to proceed * successfully. */ -int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, +vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { switch (pe_size) { @@ -1553,14 +1636,14 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault); * DAX file. It takes care of marking corresponding radix tree entry as dirty * as well. */ -static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, +static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t pfn) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *entry, **slot; pgoff_t index = vmf->pgoff; - int vmf_ret, error; + vm_fault_t ret; xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); @@ -1579,21 +1662,20 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, xa_unlock_irq(&mapping->i_pages); switch (pe_size) { case PE_SIZE_PTE: - error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); - vmf_ret = dax_fault_return(error); + ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); break; #ifdef CONFIG_FS_DAX_PMD case PE_SIZE_PMD: - vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, true); break; #endif default: - vmf_ret = VM_FAULT_FALLBACK; + ret = VM_FAULT_FALLBACK; } put_locked_mapping_entry(mapping, index); - trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret); - return vmf_ret; + trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); + return ret; } /** @@ -1606,8 +1688,8 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, * stored persistently on the media and handles inserting of appropriate page * table entry. */ -int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - pfn_t pfn) +vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, + enum page_entry_size pe_size, pfn_t pfn) { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 1f99678ff5d3..4fce1da7db23 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -796,19 +796,13 @@ EXPORT_SYMBOL_GPL(debugfs_read_file_bool); ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { - char buf[32]; - size_t buf_size; bool bv; int r; bool *val = file->private_data; struct dentry *dentry = F_DENTRY(file); - buf_size = min(count, (sizeof(buf)-1)); - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - if (strtobool(buf, &bv) == 0) { + r = kstrtobool_from_user(user_buf, count, &bv); + if (!r) { r = debugfs_file_get(dentry); if (unlikely(r)) return r; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 13b01351dd1c..a913b12fc7f8 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -512,7 +512,9 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) if (unlikely(!inode)) return failed_creating(dentry); - inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + if (!parent) + parent = debugfs_mount->mnt_root; + inode->i_mode = S_IFDIR | ((d_inode(parent)->i_mode & 0770)); inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; diff --git a/fs/exec.c b/fs/exec.c index 2c3911612b22..2d4e0075bd24 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1706,14 +1706,13 @@ static int exec_binprm(struct linux_binprm *bprm) /* * sys_execve() executes a new program. */ -static int do_execveat_common(int fd, struct filename *filename, - struct user_arg_ptr argv, - struct user_arg_ptr envp, - int flags) +static int __do_execve_file(int fd, struct filename *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + int flags, struct file *file) { char *pathbuf = NULL; struct linux_binprm *bprm; - struct file *file; struct files_struct *displaced; int retval; @@ -1752,7 +1751,8 @@ static int do_execveat_common(int fd, struct filename *filename, check_unsafe_exec(bprm); current->in_execve = 1; - file = do_open_execat(fd, filename, flags); + if (!file) + file = do_open_execat(fd, filename, flags); retval = PTR_ERR(file); if (IS_ERR(file)) goto out_unmark; @@ -1760,7 +1760,9 @@ static int do_execveat_common(int fd, struct filename *filename, sched_exec(); bprm->file = file; - if (fd == AT_FDCWD || filename->name[0] == '/') { + if (!filename) { + bprm->filename = "none"; + } else if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') @@ -1827,7 +1829,8 @@ static int do_execveat_common(int fd, struct filename *filename, task_numa_free(current); free_bprm(bprm); kfree(pathbuf); - putname(filename); + if (filename) + putname(filename); if (displaced) put_files_struct(displaced); return retval; @@ -1850,10 +1853,27 @@ out_files: if (displaced) reset_files_struct(displaced); out_ret: - putname(filename); + if (filename) + putname(filename); return retval; } +static int do_execveat_common(int fd, struct filename *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + int flags) +{ + return __do_execve_file(fd, filename, argv, envp, flags, NULL); +} + +int do_execve_file(struct file *file, void *__argv, void *__envp) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + + return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file); +} + int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index de1694512f1f..c09289a42dc5 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -961,8 +961,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { - err = bdev_dax_supported(sb, blocksize); - if (err) { + if (!bdev_dax_supported(sb->s_bdev, blocksize)) { ext2_msg(sb, KERN_ERR, "DAX unsupported by block device. Turning off DAX."); sbi->s_mount_opt &= ~EXT2_MOUNT_DAX; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 508b905d744d..b00481c475cb 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -185,25 +185,15 @@ static int ext4_init_block_bitmap(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; int flex_bg = 0; - struct ext4_group_info *grp; J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - grp = ext4_get_group_info(sb, block_group); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, gdp); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT | + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } memset(bh->b_data, 0, sb->s_blocksize); @@ -375,7 +365,6 @@ static int ext4_validate_block_bitmap(struct super_block *sb, { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); - struct ext4_sb_info *sbi = EXT4_SB(sb); if (buffer_verified(bh)) return 0; @@ -387,10 +376,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, desc, bh))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSBADCRC; } blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); @@ -398,10 +385,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } set_buffer_verified(bh); @@ -436,6 +421,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid block bitmap block %llu in " "block_group %u", bitmap_blk, block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); @@ -514,6 +501,8 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, ext4_error(sb, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EIO; } clear_buffer_new(bh); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 229ea4da6785..df95412915ea 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2530,6 +2530,9 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); +extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t block_group, + unsigned int flags); extern __printf(4, 5) void __ext4_error(struct super_block *, const char *, unsigned int, @@ -2857,6 +2860,10 @@ struct ext4_group_info { #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ + (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 763ef185dd17..c4e6fb15101b 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -162,8 +162,7 @@ int __init ext4_init_es(void) void ext4_exit_es(void) { - if (ext4_es_cachep) - kmem_cache_destroy(ext4_es_cachep); + kmem_cache_destroy(ext4_es_cachep); } void ext4_es_init_tree(struct ext4_es_tree *tree) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index fb6f023622fe..7f8023340eb8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -277,10 +277,11 @@ out: } #ifdef CONFIG_FS_DAX -static int ext4_dax_huge_fault(struct vm_fault *vmf, +static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, enum page_entry_size pe_size) { - int result, error = 0; + int error = 0; + vm_fault_t result; int retries = 0; handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -335,7 +336,7 @@ retry: return result; } -static int ext4_dax_fault(struct vm_fault *vmf) +static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); } @@ -380,50 +381,64 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static int ext4_file_open(struct inode * inode, struct file * filp) +static int ext4_sample_last_mounted(struct super_block *sb, + struct vfsmount *mnt) { - struct super_block *sb = inode->i_sb; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct vfsmount *mnt = filp->f_path.mnt; + struct ext4_sb_info *sbi = EXT4_SB(sb); struct path path; char buf[64], *cp; + handle_t *handle; + int err; + + if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED)) + return 0; + + if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) + return 0; + + sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; + /* + * Sample where the filesystem has been mounted and + * store it in the superblock for sysadmin convenience + * when trying to sort through large numbers of block + * devices or filesystem images. + */ + memset(buf, 0, sizeof(buf)); + path.mnt = mnt; + path.dentry = mnt->mnt_root; + cp = d_path(&path, buf, sizeof(buf)); + err = 0; + if (IS_ERR(cp)) + goto out; + + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + err = PTR_ERR(handle); + if (IS_ERR(handle)) + goto out; + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out_journal; + strlcpy(sbi->s_es->s_last_mounted, cp, + sizeof(sbi->s_es->s_last_mounted)); + ext4_handle_dirty_super(handle, sb); +out_journal: + ext4_journal_stop(handle); +out: + sb_end_intwrite(sb); + return err; +} + +static int ext4_file_open(struct inode * inode, struct file * filp) +{ int ret; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; - if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && - !sb_rdonly(sb))) { - sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED; - /* - * Sample where the filesystem has been mounted and - * store it in the superblock for sysadmin convenience - * when trying to sort through large numbers of block - * devices or filesystem images. - */ - memset(buf, 0, sizeof(buf)); - path.mnt = mnt; - path.dentry = mnt->mnt_root; - cp = d_path(&path, buf, sizeof(buf)); - if (!IS_ERR(cp)) { - handle_t *handle; - int err; - - handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); - BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); - if (err) { - ext4_journal_stop(handle); - return err; - } - strlcpy(sbi->s_es->s_last_mounted, cp, - sizeof(sbi->s_es->s_last_mounted)); - ext4_handle_dirty_super(handle, sb); - ext4_journal_stop(handle); - } - } + ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); + if (ret) + return ret; ret = fscrypt_file_open(inode, filp); if (ret) diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index e871c4bf18e9..4b99e2db95b8 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -402,8 +402,8 @@ static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list) } /* Find all the fixed metadata in the filesystem. */ -int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, - struct list_head *meta_list) +static int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, + struct list_head *meta_list) { struct ext4_group_desc *gdp; ext4_group_t agno; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index df92e3ec9913..4d6e007f3569 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -83,7 +83,6 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, { ext4_fsblk_t blk; struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); - struct ext4_sb_info *sbi = EXT4_SB(sb); if (buffer_verified(bh)) return 0; @@ -97,14 +96,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, ext4_unlock_group(sb, block_group); ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " "inode_bitmap = %llu", block_group, blk); - grp = ext4_get_group_info(sb, block_group); - if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, desc); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } set_buffer_verified(bh); @@ -136,6 +129,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid inode bitmap blk %llu in " "block_group %u", bitmap_blk, block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); @@ -143,7 +138,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); - return ERR_PTR(-EIO); + return ERR_PTR(-ENOMEM); } if (bitmap_uptodate(bh)) goto verify; @@ -190,6 +185,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) ext4_error(sb, "Cannot read inode bitmap - " "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); return ERR_PTR(-EIO); } @@ -337,13 +334,8 @@ out: fatal = err; } else { ext4_error(sb, "bit already cleared for inode %lu", ino); - if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - int count; - count = ext4_free_inodes_count(sb, gdp); - percpu_counter_sub(&sbi->s_freeinodes_counter, - count); - } - set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); } error_return: @@ -914,6 +906,8 @@ repeat_in_this_group: if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto next_group; } @@ -1105,6 +1099,8 @@ got: err = -EIO; ext4_error(sb, "failed to insert inode %lu: doubly allocated?", inode->i_ino); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); goto out; } inode->i_generation = prandom_u32(); @@ -1206,11 +1202,8 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); - if (IS_ERR(bitmap_bh)) { - ext4_error(sb, "inode bitmap error %ld for orphan %lu", - ino, PTR_ERR(bitmap_bh)); + if (IS_ERR(bitmap_bh)) return (struct inode *) bitmap_bh; - } /* Having the inode bit set should be a 100% indicator that this * is a valid orphan (no e2fsck run on fs). Orphans also include diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index c32802c956d5..bf7fa1507e81 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -561,10 +561,16 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, unsigned epb = inode->i_sb->s_blocksize / sizeof(u32); int i; - /* Count number blocks in a subtree under 'partial' */ - count = 1; - for (i = 0; partial + i != chain + depth - 1; i++) - count *= epb; + /* + * Count number blocks in a subtree under 'partial'. At each + * level we count number of complete empty subtrees beyond + * current offset and then descend into the subtree only + * partially beyond current offset. + */ + count = 0; + for (i = partial - chain + 1; i < depth; i++) + count = count * epb + (epb - offsets[i] - 1); + count++; /* Fill in size of a hole we found */ map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, count); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 70cf4c7b268a..285ed1588730 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -144,6 +144,12 @@ int ext4_find_inline_data_nolock(struct inode *inode) goto out; if (!is.s.not_found) { + if (is.s.here->e_value_inum) { + EXT4_ERROR_INODE(inode, "inline data xattr refers " + "to an external xattr inode"); + error = -EFSCORRUPTED; + goto out; + } EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here - (void *)ext4_raw_inode(&is.iloc)); EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE + @@ -1835,8 +1841,8 @@ int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap) iomap->offset = 0; iomap->length = min_t(loff_t, ext4_get_inline_size(inode), i_size_read(inode)); - iomap->type = 0; - iomap->flags = IOMAP_F_DATA_INLINE; + iomap->type = IOMAP_INLINE; + iomap->flags = 0; out: up_read(&EXT4_I(inode)->xattr_sem); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1e50c5efae67..2ea07efbe016 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4298,28 +4298,28 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) EXT4_BLOCK_SIZE_BITS(sb); stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); - /* If there are no blocks to remove, return now */ - if (first_block >= stop_block) - goto out_stop; + /* If there are blocks to remove, do it */ + if (stop_block > first_block) { - down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_preallocations(inode); + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); - ret = ext4_es_remove_extent(inode, first_block, - stop_block - first_block); - if (ret) { - up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; - } + ret = ext4_es_remove_extent(inode, first_block, + stop_block - first_block); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_remove_space(inode, first_block, - stop_block - 1); - else - ret = ext4_ind_remove_space(handle, inode, first_block, - stop_block); + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + ret = ext4_ext_remove_space(inode, first_block, + stop_block - 1); + else + ret = ext4_ind_remove_space(handle, inode, first_block, + stop_block); - up_write(&EXT4_I(inode)->i_data_sem); + up_write(&EXT4_I(inode)->i_data_sem); + } if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -4701,19 +4701,21 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, } } -static inline void ext4_iget_extra_inode(struct inode *inode, +static inline int ext4_iget_extra_inode(struct inode *inode, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <= EXT4_INODE_SIZE(inode->i_sb) && *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { ext4_set_inode_state(inode, EXT4_STATE_XATTR); - ext4_find_inline_data_nolock(inode); + return ext4_find_inline_data_nolock(inode); } else EXT4_I(inode)->i_inline_off = 0; + return 0; } int ext4_get_projid(struct inode *inode, kprojid_t *projid) @@ -4724,6 +4726,26 @@ int ext4_get_projid(struct inode *inode, kprojid_t *projid) return 0; } +/* + * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of + * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag + * set. + */ +static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) +{ + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + inode_set_iversion_raw(inode, val); + else + inode_set_iversion_queried(inode, val); +} +static inline u64 ext4_inode_peek_iversion(const struct inode *inode) +{ + if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) + return inode_peek_iversion_raw(inode); + else + return inode_peek_iversion(inode); +} + struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -4893,7 +4915,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { - ext4_iget_extra_inode(inode, raw_inode, ei); + ret = ext4_iget_extra_inode(inode, raw_inode, ei); + if (ret) + goto bad_inode; } } @@ -4910,7 +4934,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } - inode_set_iversion_queried(inode, ivers); + ext4_inode_set_iversion_queried(inode, ivers); } ret = 0; @@ -4945,6 +4969,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { + /* VFS does not allow setting these so must be corruption */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + EXT4_ERROR_INODE(inode, + "immutable or append flags not allowed on symlinks"); + ret = -EFSCORRUPTED; + goto bad_inode; + } if (ext4_encrypted_inode(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; ext4_set_aops(inode); @@ -5196,7 +5227,7 @@ static int ext4_do_update_inode(handle_t *handle, } if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { - u64 ivers = inode_peek_iversion(inode); + u64 ivers = ext4_inode_peek_iversion(inode); raw_inode->i_disk_version = cpu_to_le32(ivers); if (ei->i_extra_isize) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6884e81c1465..6eae2b91aafa 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -470,6 +470,8 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, "freeing block already freed " "(bit %u)", first + i); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -747,10 +749,8 @@ void ext4_mb_generate_buddy(struct super_block *sb, * corrupt and update bb_free using bitmap value */ grp->bb_free = free; - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - grp->bb_free); - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_set_largest_free_order(sb, grp); @@ -1454,12 +1454,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, "freeing already freed block " "(bit %u); block bitmap corrupt.", block); - if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)) - percpu_counter_sub(&sbi->s_freeclusters_counter, - e4b->bd_info->bb_free); - /* Mark the block group as corrupt. */ - set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, - &e4b->bd_info->bb_state); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); mb_regenerate_buddy(e4b); goto done; } @@ -1956,6 +1952,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, "%d free clusters as per " "group info. But bitmap says 0", free); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); break; } @@ -1966,6 +1964,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); + ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit @@ -2516,8 +2516,7 @@ static void ext4_groupinfo_destroy_slabs(void) int i; for (i = 0; i < NR_GRPINFO_CACHES; i++) { - if (ext4_groupinfo_caches[i]) - kmem_cache_destroy(ext4_groupinfo_caches[i]); + kmem_cache_destroy(ext4_groupinfo_caches[i]); ext4_groupinfo_caches[i] = NULL; } } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b6bec270a8e4..d792b7689d92 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1933,7 +1933,7 @@ retry: return 0; n_group = ext4_get_group_number(sb, n_blocks_count - 1); - if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { + if (n_group >= (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) { ext4_warning(sb, "resize would cause inodes_count overflow"); return -EINVAL; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index eb104e8476f0..00fe75a71c4b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -763,6 +763,36 @@ __acquires(bitlock) return; } +void ext4_mark_group_bitmap_corrupted(struct super_block *sb, + ext4_group_t group, + unsigned int flags) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + + if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) && + !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) { + percpu_counter_sub(&sbi->s_freeclusters_counter, + grp->bb_free); + set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, + &grp->bb_state); + } + + if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) && + !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { + if (gdp) { + int count; + + count = ext4_free_inodes_count(sb, gdp); + percpu_counter_sub(&sbi->s_freeinodes_counter, + count); + } + set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, + &grp->bb_state); + } +} + void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; @@ -1237,19 +1267,13 @@ static bool ext4_dummy_context(struct inode *inode) return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb)); } -static unsigned ext4_max_namelen(struct inode *inode) -{ - return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : - EXT4_NAME_LEN; -} - static const struct fscrypt_operations ext4_cryptops = { .key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, .dummy_context = ext4_dummy_context, .empty_dir = ext4_empty_dir, - .max_namelen = ext4_max_namelen, + .max_namelen = EXT4_NAME_LEN, }; #endif @@ -2116,12 +2140,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int read_only) { struct ext4_sb_info *sbi = EXT4_SB(sb); - int res = 0; + int err = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); - res = SB_RDONLY; + err = -EROFS; } if (read_only) goto done; @@ -2154,7 +2178,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (sbi->s_journal) ext4_set_feature_journal_needs_recovery(sb); - ext4_commit_super(sb, 1); + err = ext4_commit_super(sb, 1); done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " @@ -2166,7 +2190,7 @@ done: sbi->s_mount_opt, sbi->s_mount_opt2); cleancache_init_fs(sb); - return res; + return err; } int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) @@ -3732,8 +3756,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) " that may contain inline data"); sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; } - err = bdev_dax_supported(sb, blocksize); - if (err) { + if (!bdev_dax_supported(sb->s_bdev, blocksize)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device. Turning off DAX."); sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; @@ -4224,8 +4247,12 @@ no_journal: goto failed_mount4; } - if (ext4_setup_super(sb, es, sb_rdonly(sb))) + ret = ext4_setup_super(sb, es, sb_rdonly(sb)); + if (ret == -EROFS) { sb->s_flags |= SB_RDONLY; + ret = 0; + } else if (ret) + goto failed_mount4a; /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && @@ -4760,11 +4787,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) unlock_buffer(sbh); error = __sync_dirty_buffer(sbh, REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0)); - if (error) - return error; - - error = buffer_write_io_error(sbh); - if (error) { + if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " "superblock"); clear_buffer_write_io_error(sbh); @@ -5165,8 +5188,12 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); - if (!ext4_setup_super(sb, es, 0)) - sb->s_flags &= ~SB_RDONLY; + + err = ext4_setup_super(sb, es, 0); + if (err) + goto restore_opts; + + sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) { @@ -5190,8 +5217,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } ext4_setup_system_zone(sb); - if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) - ext4_commit_super(sb, 1); + if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { + err = ext4_commit_super(sb, 1); + if (err) + goto restore_opts; + } #ifdef CONFIG_QUOTA /* Release old quota file names */ @@ -5252,7 +5282,8 @@ static int ext4_statfs_project(struct super_block *sb, dquot->dq_dqb.dqb_bsoftlimit : dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; if (limit && buf->f_blocks > limit) { - curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + curblock = (dquot->dq_dqb.dqb_curspace + + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; buf->f_blocks = limit; buf->f_bfree = buf->f_bavail = (buf->f_blocks > curblock) ? diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 499cb4b1fbd2..fc4ced59c565 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1688,7 +1688,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, /* No failures allowed past this point. */ - if (!s->not_found && here->e_value_offs) { + if (!s->not_found && here->e_value_size && here->e_value_offs) { /* Remove the old value. */ void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(here->e_value_offs); diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 629001b28632..197a9d8a15ef 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -43,7 +43,7 @@ ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, err = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_SECURITY, xattr->name, xattr->value, - xattr->value_len, 0); + xattr->value_len, XATTR_CREATE); if (err < 0) break; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 42d564c5ccd0..970ae27f401c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1930,19 +1930,13 @@ static bool f2fs_dummy_context(struct inode *inode) return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); } -static unsigned f2fs_max_namelen(struct inode *inode) -{ - return S_ISLNK(inode->i_mode) ? - inode->i_sb->s_blocksize : F2FS_NAME_LEN; -} - static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, .dummy_context = f2fs_dummy_context, .empty_dir = f2fs_empty_dir, - .max_namelen = f2fs_max_namelen, + .max_namelen = F2FS_NAME_LEN, }; #endif diff --git a/fs/fcntl.c b/fs/fcntl.c index c42169459298..12273b6ea56d 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -23,7 +23,7 @@ #include <linux/rcupdate.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> -#include <linux/shmem_fs.h> +#include <linux/memfd.h> #include <linux/compat.h> #include <linux/poll.h> diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index ec85765502f1..5a48cee6d7d3 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -34,7 +34,7 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type) return ERR_PTR(-ENOMEM); size = fuse_getxattr(inode, name, value, PAGE_SIZE); if (size > 0) - acl = posix_acl_from_xattr(&init_user_ns, value, size); + acl = posix_acl_from_xattr(fc->user_ns, value, size); else if ((size == 0) || (size == -ENODATA) || (size == -EOPNOTSUPP && fc->no_getxattr)) acl = NULL; @@ -81,7 +81,7 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (!value) return -ENOMEM; - ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + ret = posix_acl_to_xattr(fc->user_ns, acl, value, size); if (ret < 0) { kfree(value); return ret; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index b9ea99c5b5b3..0b694655d988 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -35,7 +35,7 @@ static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { - fuse_abort_conn(fc); + fuse_abort_conn(fc, true); fuse_conn_put(fc); } return count; @@ -211,10 +211,11 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, if (!dentry) return NULL; - fc->ctl_dentry[fc->ctl_ndents++] = dentry; inode = new_inode(fuse_control_sb); - if (!inode) + if (!inode) { + dput(dentry); return NULL; + } inode->i_ino = get_next_ino(); inode->i_mode = mode; @@ -228,6 +229,9 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, set_nlink(inode, nlink); inode->i_private = fc; d_add(dentry, inode); + + fc->ctl_dentry[fc->ctl_ndents++] = dentry; + return dentry; } @@ -284,7 +288,10 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) for (i = fc->ctl_ndents - 1; i >= 0; i--) { struct dentry *dentry = fc->ctl_dentry[i]; d_inode(dentry)->i_private = NULL; - d_drop(dentry); + if (!i) { + /* Get rid of submounts: */ + d_invalidate(dentry); + } dput(dentry); } drop_nlink(d_inode(fuse_control_sb->s_root)); diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index e9e97803442a..8f68181256c0 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -48,6 +48,7 @@ #include <linux/stat.h> #include <linux/module.h> #include <linux/uio.h> +#include <linux/user_namespace.h> #include "fuse_i.h" @@ -406,7 +407,7 @@ err_unlock: err_region: unregister_chrdev_region(devt, 1); err: - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); goto out; } @@ -498,7 +499,11 @@ static int cuse_channel_open(struct inode *inode, struct file *file) if (!cc) return -ENOMEM; - fuse_conn_init(&cc->fc); + /* + * Limit the cuse channel to requests that can + * be represented in file->f_cred->user_ns. + */ + fuse_conn_init(&cc->fc, file->f_cred->user_ns); fud = fuse_dev_alloc(&cc->fc); if (!fud) { @@ -581,7 +586,7 @@ static ssize_t cuse_class_abort_store(struct device *dev, { struct cuse_conn *cc = dev_get_drvdata(dev); - fuse_abort_conn(&cc->fc); + fuse_abort_conn(&cc->fc, false); return count; } static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5d06384c2cae..e03ca14f40e9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -112,13 +112,6 @@ static void __fuse_put_request(struct fuse_req *req) refcount_dec(&req->count); } -static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req) -{ - req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); - req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); - req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); -} - void fuse_set_initialized(struct fuse_conn *fc) { /* Make sure stores before this are seen on another CPU */ @@ -163,11 +156,19 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, goto out; } - fuse_req_init_context(fc, req); + req->in.h.uid = from_kuid(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid(fc->user_ns, current_fsgid()); + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); + __set_bit(FR_WAITING, &req->flags); if (for_background) __set_bit(FR_BACKGROUND, &req->flags); + if (unlikely(req->in.h.uid == ((uid_t)-1) || + req->in.h.gid == ((gid_t)-1))) { + fuse_put_request(fc, req); + return ERR_PTR(-EOVERFLOW); + } return req; out: @@ -256,7 +257,10 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, if (!req) req = get_reserved_req(fc, file); - fuse_req_init_context(fc, req); + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); + __set_bit(FR_WAITING, &req->flags); __clear_bit(FR_BACKGROUND, &req->flags); return req; @@ -381,8 +385,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (!fc->blocked && waitqueue_active(&fc->blocked_waitq)) wake_up(&fc->blocked_waitq); - if (fc->num_background == fc->congestion_threshold && - fc->connected && fc->sb) { + if (fc->num_background == fc->congestion_threshold && fc->sb) { clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); } @@ -1234,9 +1237,10 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, if (err) goto err_unlock; - err = -ENODEV; - if (!fiq->connected) + if (!fiq->connected) { + err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; goto err_unlock; + } if (!list_empty(&fiq->interrupts)) { req = list_entry(fiq->interrupts.next, struct fuse_req, @@ -1260,12 +1264,6 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, in = &req->in; reqsize = in->h.len; - if (task_active_pid_ns(current) != fc->pid_ns) { - rcu_read_lock(); - in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns)); - rcu_read_unlock(); - } - /* If request is too large, reply with an error and restart the read */ if (nbytes < reqsize) { req->out.h.error = -EIO; @@ -1287,7 +1285,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); if (!fpq->connected) { - err = -ENODEV; + err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; goto out_end; } if (err) { @@ -2076,7 +2074,7 @@ static void end_polls(struct fuse_conn *fc) * is OK, the request will in that case be removed from the list before we touch * it. */ -void fuse_abort_conn(struct fuse_conn *fc) +void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) { struct fuse_iqueue *fiq = &fc->iq; @@ -2089,6 +2087,7 @@ void fuse_abort_conn(struct fuse_conn *fc) fc->connected = 0; fc->blocked = 0; + fc->aborted = is_abort; fuse_set_initialized(fc); list_for_each_entry(fud, &fc->devices, entry) { struct fuse_pqueue *fpq = &fud->pq; @@ -2151,7 +2150,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { WARN_ON(fc->iq.fasync != NULL); - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); } fuse_dev_free(fud); } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 24967382a7b1..56231b31f806 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -858,8 +858,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); stat->nlink = attr->nlink; - stat->uid = make_kuid(&init_user_ns, attr->uid); - stat->gid = make_kgid(&init_user_ns, attr->gid); + stat->uid = make_kuid(fc->user_ns, attr->uid); + stat->gid = make_kgid(fc->user_ns, attr->gid); stat->rdev = inode->i_rdev; stat->atime.tv_sec = attr->atime; stat->atime.tv_nsec = attr->atimensec; @@ -924,12 +924,20 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, } static int fuse_update_get_attr(struct inode *inode, struct file *file, - struct kstat *stat) + struct kstat *stat, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; + bool sync; - if (time_before64(fi->i_time, get_jiffies_64())) { + if (flags & AT_STATX_FORCE_SYNC) + sync = true; + else if (flags & AT_STATX_DONT_SYNC) + sync = false; + else + sync = time_before64(fi->i_time, get_jiffies_64()); + + if (sync) { forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else if (stat) { @@ -943,7 +951,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, int fuse_update_attributes(struct inode *inode, struct file *file) { - return fuse_update_get_attr(inode, file, NULL); + return fuse_update_get_attr(inode, file, NULL, 0); } int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, @@ -1030,7 +1038,7 @@ int fuse_allow_current_process(struct fuse_conn *fc) const struct cred *cred; if (fc->allow_other) - return 1; + return current_in_userns(fc->user_ns); cred = current_cred(); if (uid_eq(cred->euid, fc->user_id) && @@ -1475,17 +1483,17 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime) return true; } -static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, - bool trust_local_cmtime) +static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, + struct fuse_setattr_in *arg, bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; if (ivalid & ATTR_MODE) arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; if (ivalid & ATTR_UID) - arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid); + arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid); if (ivalid & ATTR_GID) - arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid); + arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid); if (ivalid & ATTR_SIZE) arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; if (ivalid & ATTR_ATIME) { @@ -1629,8 +1637,19 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, return err; if (attr->ia_valid & ATTR_OPEN) { - if (fc->atomic_o_trunc) + /* This is coming from open(..., ... | O_TRUNC); */ + WARN_ON(!(attr->ia_valid & ATTR_SIZE)); + WARN_ON(attr->ia_size != 0); + if (fc->atomic_o_trunc) { + /* + * No need to send request to userspace, since actual + * truncation has already been done by OPEN. But still + * need to truncate page cache. + */ + i_size_write(inode, 0); + truncate_pagecache(inode, 0); return 0; + } file = NULL; } @@ -1646,7 +1665,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(attr, &inarg, trust_local_cmtime); + iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -1783,7 +1802,7 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, if (!fuse_allow_current_process(fc)) return -EACCES; - return fuse_update_get_attr(inode, NULL, stat); + return fuse_update_get_attr(inode, NULL, stat, flags); } static const struct inode_operations fuse_dir_inode_operations = { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c4c093bbf456..5256ad333b05 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -26,6 +26,7 @@ #include <linux/xattr.h> #include <linux/pid_namespace.h> #include <linux/refcount.h> +#include <linux/user_namespace.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -466,6 +467,9 @@ struct fuse_conn { /** The pid namespace for this mount */ struct pid_namespace *pid_ns; + /** The user namespace for this mount */ + struct user_namespace *user_ns; + /** Maximum read size */ unsigned max_read; @@ -515,6 +519,9 @@ struct fuse_conn { abort and device release */ unsigned connected; + /** Connection aborted via sysfs */ + bool aborted; + /** Connection failed (version mismatch). Cannot race with setting other bitfields since it is only set once in INIT reply, before any other request, and never cleared */ @@ -526,6 +533,9 @@ struct fuse_conn { /** Do readpages asynchronously? Only set in INIT */ unsigned async_read:1; + /** Return an unique read error after abort. Only set in INIT */ + unsigned abort_err:1; + /** Do not send separate SETATTR request before open(O_TRUNC) */ unsigned atomic_o_trunc:1; @@ -851,7 +861,7 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ -void fuse_abort_conn(struct fuse_conn *fc); +void fuse_abort_conn(struct fuse_conn *fc, bool is_abort); /** * Invalidate inode attributes @@ -870,7 +880,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** * Initialize fuse_conn */ -void fuse_conn_init(struct fuse_conn *fc); +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns); /** * Release reference to fuse_conn @@ -975,6 +985,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); int fuse_removexattr(struct inode *inode, const char *name); extern const struct xattr_handler *fuse_xattr_handlers[]; extern const struct xattr_handler *fuse_acl_xattr_handlers[]; +extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; struct posix_acl *fuse_get_acl(struct inode *inode, int type); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ef309958e060..ffcaf98044b9 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -171,8 +171,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); set_nlink(inode, attr->nlink); - inode->i_uid = make_kuid(&init_user_ns, attr->uid); - inode->i_gid = make_kgid(&init_user_ns, attr->gid); + inode->i_uid = make_kuid(fc->user_ns, attr->uid); + inode->i_gid = make_kgid(fc->user_ns, attr->gid); inode->i_blocks = attr->blocks; inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; @@ -371,7 +371,7 @@ void fuse_unlock_inode(struct inode *inode) static void fuse_umount_begin(struct super_block *sb) { - fuse_abort_conn(get_fuse_conn_super(sb)); + fuse_abort_conn(get_fuse_conn_super(sb), false); } static void fuse_send_destroy(struct fuse_conn *fc) @@ -393,7 +393,7 @@ static void fuse_put_super(struct super_block *sb) fuse_send_destroy(fc); - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); mutex_lock(&fuse_mutex); list_del(&fc->entry); fuse_ctl_remove_conn(fc); @@ -477,7 +477,8 @@ static int fuse_match_uint(substring_t *s, unsigned int *res) return err; } -static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) +static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, + struct user_namespace *user_ns) { char *p; memset(d, 0, sizeof(struct fuse_mount_data)); @@ -513,7 +514,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) case OPT_USER_ID: if (fuse_match_uint(&args[0], &uv)) return 0; - d->user_id = make_kuid(current_user_ns(), uv); + d->user_id = make_kuid(user_ns, uv); if (!uid_valid(d->user_id)) return 0; d->user_id_present = 1; @@ -522,7 +523,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) case OPT_GROUP_ID: if (fuse_match_uint(&args[0], &uv)) return 0; - d->group_id = make_kgid(current_user_ns(), uv); + d->group_id = make_kgid(user_ns, uv); if (!gid_valid(d->group_id)) return 0; d->group_id_present = 1; @@ -565,8 +566,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) struct super_block *sb = root->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); - seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id)); - seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id)); + seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id)); + seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id)); if (fc->default_permissions) seq_puts(m, ",default_permissions"); if (fc->allow_other) @@ -597,7 +598,7 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) fpq->connected = 1; } -void fuse_conn_init(struct fuse_conn *fc) +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); @@ -621,6 +622,7 @@ void fuse_conn_init(struct fuse_conn *fc) fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); + fc->user_ns = get_user_ns(user_ns); } EXPORT_SYMBOL_GPL(fuse_conn_init); @@ -630,6 +632,7 @@ void fuse_conn_put(struct fuse_conn *fc) if (fc->destroy_req) fuse_request_free(fc->destroy_req); put_pid_ns(fc->pid_ns); + put_user_ns(fc->user_ns); fc->release(fc); } } @@ -918,6 +921,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->posix_acl = 1; fc->sb->s_xattr = fuse_acl_xattr_handlers; } + if (arg->flags & FUSE_ABORT_ERROR) + fc->abort_err = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -948,7 +953,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | - FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL; + FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | + FUSE_ABORT_ERROR; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -1061,7 +1067,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); - if (!parse_fuse_opt(data, &d, is_bdev)) + if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) goto err; if (is_bdev) { @@ -1089,16 +1095,27 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (!file) goto err; - if ((file->f_op != &fuse_dev_operations) || - (file->f_cred->user_ns != &init_user_ns)) + /* + * Require mount to happen from the same user namespace which + * opened /dev/fuse to prevent potential attacks. + */ + if (file->f_op != &fuse_dev_operations || + file->f_cred->user_ns != sb->s_user_ns) goto err_fput; + /* + * If we are not in the initial user namespace posix + * acls must be translated. + */ + if (sb->s_user_ns != &init_user_ns) + sb->s_xattr = fuse_no_acl_xattr_handlers; + fc = kmalloc(sizeof(*fc), GFP_KERNEL); err = -ENOMEM; if (!fc) goto err_fput; - fuse_conn_init(fc); + fuse_conn_init(fc, sb->s_user_ns); fc->release = fuse_free_conn; fud = fuse_dev_alloc(fc); @@ -1179,6 +1196,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fuse_dev_free(fud); err_put_conn: fuse_conn_put(fc); + sb->s_fs_info = NULL; err_fput: fput(file); err: @@ -1208,7 +1226,7 @@ static void fuse_kill_sb_anon(struct super_block *sb) static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", - .fs_flags = FS_HAS_SUBTYPE, + .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, .mount = fuse_mount, .kill_sb = fuse_kill_sb_anon, }; diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 3caac46b08b0..433717640f78 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -192,6 +192,26 @@ static int fuse_xattr_set(const struct xattr_handler *handler, return fuse_setxattr(inode, name, value, size, flags); } +static bool no_xattr_list(struct dentry *dentry) +{ + return false; +} + +static int no_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) +{ + return -EOPNOTSUPP; +} + +static int no_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *nodee, + const char *name, const void *value, + size_t size, int flags) +{ + return -EOPNOTSUPP; +} + static const struct xattr_handler fuse_xattr_handler = { .prefix = "", .get = fuse_xattr_get, @@ -209,3 +229,26 @@ const struct xattr_handler *fuse_acl_xattr_handlers[] = { &fuse_xattr_handler, NULL }; + +static const struct xattr_handler fuse_no_acl_access_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = no_xattr_list, + .get = no_xattr_get, + .set = no_xattr_set, +}; + +static const struct xattr_handler fuse_no_acl_default_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_DEFAULT, + .flags = ACL_TYPE_ACCESS, + .list = no_xattr_list, + .get = no_xattr_get, + .set = no_xattr_set, +}; + +const struct xattr_handler *fuse_no_acl_xattr_handlers[] = { + &fuse_no_acl_access_xattr_handler, + &fuse_no_acl_default_xattr_handler, + &fuse_xattr_handler, + NULL +}; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index a7b586e02693..ed6699705c13 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -767,10 +767,11 @@ static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap) sizeof(struct gfs2_dinode); iomap->offset = 0; iomap->length = i_size_read(inode); - iomap->type = IOMAP_MAPPED; - iomap->flags = IOMAP_F_DATA_INLINE; + iomap->type = IOMAP_INLINE; } +#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE + /** * gfs2_iomap_get - Map blocks from an inode to disk blocks * @inode: The inode @@ -846,7 +847,7 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, iomap->type = IOMAP_MAPPED; iomap->flags = IOMAP_F_MERGED; if (eob) - iomap->flags |= IOMAP_F_BOUNDARY; + iomap->flags |= IOMAP_F_GFS2_BOUNDARY; out: iomap->bdev = inode->i_sb->s_bdev; @@ -952,12 +953,12 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, if (iomap.length > bh_map->b_size) { iomap.length = bh_map->b_size; - iomap.flags &= ~IOMAP_F_BOUNDARY; + iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY; } if (iomap.addr != IOMAP_NULL_ADDR) map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits); bh_map->b_size = iomap.length; - if (iomap.flags & IOMAP_F_BOUNDARY) + if (iomap.flags & IOMAP_F_GFS2_BOUNDARY) set_buffer_boundary(bh_map); if (iomap.flags & IOMAP_F_NEW) set_buffer_new(bh_map); diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 2577ef1034ef..2a153aed4c19 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -26,8 +26,7 @@ #include "hpfs.h" #define EIOERROR EIO -#define EFSERROR EPERM -#define EMEMERROR ENOMEM +#define EFSERROR EUCLEAN #define ANODE_ALLOC_FWD 512 #define FNODE_ALLOC_FWD 0 diff --git a/fs/inode.c b/fs/inode.c index 3b55391072f3..0df41bb77e0f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1004,6 +1004,70 @@ void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) EXPORT_SYMBOL(unlock_two_nondirectories); /** + * inode_insert5 - obtain an inode from a mounted file system + * @inode: pre-allocated inode to use for insert to cache + * @hashval: hash value (usually inode number) to get + * @test: callback used for comparisons between inodes + * @set: callback used to initialize a new struct inode + * @data: opaque data pointer to pass to @test and @set + * + * Search for the inode specified by @hashval and @data in the inode cache, + * and if present it is return it with an increased reference count. This is + * a variant of iget5_locked() for callers that don't want to fail on memory + * allocation of inode. + * + * If the inode is not in cache, insert the pre-allocated inode to cache and + * return it locked, hashed, and with the I_NEW flag set. The file system gets + * to fill it in before unlocking it via unlock_new_inode(). + * + * Note both @test and @set are called with the inode_hash_lock held, so can't + * sleep. + */ +struct inode *inode_insert5(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), + int (*set)(struct inode *, void *), void *data) +{ + struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); + struct inode *old; + +again: + spin_lock(&inode_hash_lock); + old = find_inode(inode->i_sb, head, test, data); + if (unlikely(old)) { + /* + * Uhhuh, somebody else created the same inode under us. + * Use the old inode instead of the preallocated one. + */ + spin_unlock(&inode_hash_lock); + wait_on_inode(old); + if (unlikely(inode_unhashed(old))) { + iput(old); + goto again; + } + return old; + } + + if (set && unlikely(set(inode, data))) { + inode = NULL; + goto unlock; + } + + /* + * Return the locked inode with I_NEW set, the + * caller is responsible for filling in the contents + */ + spin_lock(&inode->i_lock); + inode->i_state |= I_NEW; + hlist_add_head(&inode->i_hash, head); + spin_unlock(&inode->i_lock); +unlock: + spin_unlock(&inode_hash_lock); + + return inode; +} +EXPORT_SYMBOL(inode_insert5); + +/** * iget5_locked - obtain an inode from a mounted file system * @sb: super block of file system * @hashval: hash value (usually inode number) to get @@ -1027,66 +1091,18 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { - struct hlist_head *head = inode_hashtable + hash(sb, hashval); - struct inode *inode; -again: - spin_lock(&inode_hash_lock); - inode = find_inode(sb, head, test, data); - spin_unlock(&inode_hash_lock); + struct inode *inode = ilookup5(sb, hashval, test, data); - if (inode) { - wait_on_inode(inode); - if (unlikely(inode_unhashed(inode))) { - iput(inode); - goto again; - } - return inode; - } + if (!inode) { + struct inode *new = new_inode(sb); - inode = alloc_inode(sb); - if (inode) { - struct inode *old; - - spin_lock(&inode_hash_lock); - /* We released the lock, so.. */ - old = find_inode(sb, head, test, data); - if (!old) { - if (set(inode, data)) - goto set_failed; - - spin_lock(&inode->i_lock); - inode->i_state = I_NEW; - hlist_add_head(&inode->i_hash, head); - spin_unlock(&inode->i_lock); - inode_sb_list_add(inode); - spin_unlock(&inode_hash_lock); - - /* Return the locked inode with I_NEW set, the - * caller is responsible for filling in the contents - */ - return inode; - } - - /* - * Uhhuh, somebody else created the same inode under - * us. Use the old inode instead of the one we just - * allocated. - */ - spin_unlock(&inode_hash_lock); - destroy_inode(inode); - inode = old; - wait_on_inode(inode); - if (unlikely(inode_unhashed(inode))) { - iput(inode); - goto again; + if (new) { + inode = inode_insert5(new, hashval, test, set, data); + if (unlikely(inode != new)) + iput(new); } } return inode; - -set_failed: - spin_unlock(&inode_hash_lock); - destroy_inode(inode); - return NULL; } EXPORT_SYMBOL(iget5_locked); @@ -1427,43 +1443,13 @@ EXPORT_SYMBOL(insert_inode_locked); int insert_inode_locked4(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct super_block *sb = inode->i_sb; - struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct inode *old = inode_insert5(inode, hashval, test, NULL, data); - while (1) { - struct inode *old = NULL; - - spin_lock(&inode_hash_lock); - hlist_for_each_entry(old, head, i_hash) { - if (old->i_sb != sb) - continue; - if (!test(old, data)) - continue; - spin_lock(&old->i_lock); - if (old->i_state & (I_FREEING|I_WILL_FREE)) { - spin_unlock(&old->i_lock); - continue; - } - break; - } - if (likely(!old)) { - spin_lock(&inode->i_lock); - inode->i_state |= I_NEW; - hlist_add_head(&inode->i_hash, head); - spin_unlock(&inode->i_lock); - spin_unlock(&inode_hash_lock); - return 0; - } - __iget(old); - spin_unlock(&old->i_lock); - spin_unlock(&inode_hash_lock); - wait_on_inode(old); - if (unlikely(!inode_unhashed(old))) { - iput(old); - return -EBUSY; - } + if (old != inode) { iput(old); + return -EBUSY; } + return 0; } EXPORT_SYMBOL(insert_inode_locked4); diff --git a/fs/iomap.c b/fs/iomap.c index afd163586aa0..7d1e9f45f098 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -20,6 +20,7 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/pagemap.h> +#include <linux/pagevec.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/backing-dev.h> @@ -27,6 +28,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/dax.h> #include <linux/sched/signal.h> +#include <linux/swap.h> #include "internal.h" @@ -95,6 +97,12 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, return written ? written : ret; } +static sector_t +iomap_sector(struct iomap *iomap, loff_t pos) +{ + return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; +} + static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { @@ -352,11 +360,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, struct iomap *iomap) { - sector_t sector = (iomap->addr + - (pos & PAGE_MASK) - iomap->offset) >> 9; - - return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector, - offset, bytes); + return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, + iomap_sector(iomap, pos & PAGE_MASK), offset, bytes); } static loff_t @@ -501,10 +506,13 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, case IOMAP_DELALLOC: flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; break; + case IOMAP_MAPPED: + break; case IOMAP_UNWRITTEN: flags |= FIEMAP_EXTENT_UNWRITTEN; break; - case IOMAP_MAPPED: + case IOMAP_INLINE: + flags |= FIEMAP_EXTENT_DATA_INLINE; break; } @@ -512,8 +520,6 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, flags |= FIEMAP_EXTENT_MERGED; if (iomap->flags & IOMAP_F_SHARED) flags |= FIEMAP_EXTENT_SHARED; - if (iomap->flags & IOMAP_F_DATA_INLINE) - flags |= FIEMAP_EXTENT_DATA_INLINE; return fiemap_fill_next_extent(fi, iomap->offset, iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0, @@ -587,6 +593,113 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, } EXPORT_SYMBOL_GPL(iomap_fiemap); +/* + * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. + * Returns true if found and updates @lastoff to the offset in file. + */ +static bool +page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff, + int whence) +{ + const struct address_space_operations *ops = inode->i_mapping->a_ops; + unsigned int bsize = i_blocksize(inode), off; + bool seek_data = whence == SEEK_DATA; + loff_t poff = page_offset(page); + + if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE)) + return false; + + if (*lastoff < poff) { + /* + * Last offset smaller than the start of the page means we found + * a hole: + */ + if (whence == SEEK_HOLE) + return true; + *lastoff = poff; + } + + /* + * Just check the page unless we can and should check block ranges: + */ + if (bsize == PAGE_SIZE || !ops->is_partially_uptodate) + return PageUptodate(page) == seek_data; + + lock_page(page); + if (unlikely(page->mapping != inode->i_mapping)) + goto out_unlock_not_found; + + for (off = 0; off < PAGE_SIZE; off += bsize) { + if ((*lastoff & ~PAGE_MASK) >= off + bsize) + continue; + if (ops->is_partially_uptodate(page, off, bsize) == seek_data) { + unlock_page(page); + return true; + } + *lastoff = poff + off + bsize; + } + +out_unlock_not_found: + unlock_page(page); + return false; +} + +/* + * Seek for SEEK_DATA / SEEK_HOLE in the page cache. + * + * Within unwritten extents, the page cache determines which parts are holes + * and which are data: uptodate buffer heads count as data; everything else + * counts as a hole. + * + * Returns the resulting offset on successs, and -ENOENT otherwise. + */ +static loff_t +page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, + int whence) +{ + pgoff_t index = offset >> PAGE_SHIFT; + pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); + loff_t lastoff = offset; + struct pagevec pvec; + + if (length <= 0) + return -ENOENT; + + pagevec_init(&pvec); + + do { + unsigned nr_pages, i; + + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, + end - 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page_seek_hole_data(inode, page, &lastoff, whence)) + goto check_range; + lastoff = page_offset(page) + PAGE_SIZE; + } + pagevec_release(&pvec); + } while (index < end); + + /* When no page at lastoff and we are not done, we found a hole. */ + if (whence != SEEK_HOLE) + goto not_found; + +check_range: + if (lastoff < offset + length) + goto out; +not_found: + lastoff = -ENOENT; +out: + pagevec_release(&pvec); + return lastoff; +} + + static loff_t iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, void *data, struct iomap *iomap) @@ -685,6 +798,8 @@ EXPORT_SYMBOL_GPL(iomap_seek_data); * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ +#define IOMAP_DIO_WRITE_FUA (1 << 28) +#define IOMAP_DIO_NEED_SYNC (1 << 29) #define IOMAP_DIO_WRITE (1 << 30) #define IOMAP_DIO_DIRTY (1 << 31) @@ -759,6 +874,13 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) dio_warn_stale_pagecache(iocb->ki_filp); } + /* + * If this is a DSYNC write, make sure we push it to stable storage now + * that we've written data. + */ + if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) + ret = generic_write_sync(iocb, ret); + inode_dio_end(file_inode(iocb->ki_filp)); kfree(dio); @@ -769,13 +891,8 @@ static void iomap_dio_complete_work(struct work_struct *work) { struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); struct kiocb *iocb = dio->iocb; - bool is_write = (dio->flags & IOMAP_DIO_WRITE); - ssize_t ret; - ret = iomap_dio_complete(dio); - if (is_write && ret > 0) - ret = generic_write_sync(iocb, ret); - iocb->ki_complete(iocb, ret, 0); + iocb->ki_complete(iocb, iomap_dio_complete(dio), 0); } /* @@ -833,14 +950,12 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, bio = bio_alloc(GFP_KERNEL, 1); bio_set_dev(bio, iomap->bdev); - bio->bi_iter.bi_sector = - (iomap->addr + pos - iomap->offset) >> 9; + bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; get_page(page); - if (bio_add_page(bio, page, len, 0) != len) - BUG(); + __bio_add_page(bio, page, len, 0); bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); atomic_inc(&dio->ref); @@ -858,6 +973,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, struct iov_iter iter; struct bio *bio; bool need_zeroout = false; + bool use_fua = false; int nr_pages, ret; size_t copied = 0; @@ -881,8 +997,20 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, case IOMAP_MAPPED: if (iomap->flags & IOMAP_F_SHARED) dio->flags |= IOMAP_DIO_COW; - if (iomap->flags & IOMAP_F_NEW) + if (iomap->flags & IOMAP_F_NEW) { need_zeroout = true; + } else { + /* + * Use a FUA write if we need datasync semantics, this + * is a pure data IO that doesn't require any metadata + * updates and the underlying device supports FUA. This + * allows us to avoid cache flushes on IO completion. + */ + if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && + (dio->flags & IOMAP_DIO_WRITE_FUA) && + blk_queue_fua(bdev_get_queue(iomap->bdev))) + use_fua = true; + } break; default: WARN_ON_ONCE(1); @@ -916,9 +1044,9 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, bio = bio_alloc(GFP_KERNEL, nr_pages); bio_set_dev(bio, iomap->bdev); - bio->bi_iter.bi_sector = - (iomap->addr + pos - iomap->offset) >> 9; + bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_write_hint = dio->iocb->ki_hint; + bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -930,10 +1058,14 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, n = bio->bi_iter.bi_size; if (dio->flags & IOMAP_DIO_WRITE) { - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + if (use_fua) + bio->bi_opf |= REQ_FUA; + else + dio->flags &= ~IOMAP_DIO_WRITE_FUA; task_io_account_write(n); } else { - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_opf = REQ_OP_READ; if (dio->flags & IOMAP_DIO_DIRTY) bio_set_pages_dirty(bio); } @@ -961,6 +1093,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, return copied; } +/* + * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO + * is being issued as AIO or not. This allows us to optimise pure data writes + * to use REQ_FUA rather than requiring generic_write_sync() to issue a + * REQ_FLUSH post write. This is slightly tricky because a single request here + * can be mapped into multiple disjoint IOs and only a subset of the IOs issued + * may be pure data writes. In that case, we still need to do a full data sync + * completion. + */ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, iomap_dio_end_io_t end_io) @@ -1005,8 +1146,21 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iter->type == ITER_IOVEC) dio->flags |= IOMAP_DIO_DIRTY; } else { - dio->flags |= IOMAP_DIO_WRITE; flags |= IOMAP_WRITE; + dio->flags |= IOMAP_DIO_WRITE; + + /* for data sync or sync, we need sync completion processing */ + if (iocb->ki_flags & IOCB_DSYNC) + dio->flags |= IOMAP_DIO_NEED_SYNC; + + /* + * For datasync only writes, we optimistically try using FUA for + * this IO. Any non-FUA write that occurs will clear this flag, + * hence we know before completion whether a cache flush is + * necessary. + */ + if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC) + dio->flags |= IOMAP_DIO_WRITE_FUA; } if (iocb->ki_flags & IOCB_NOWAIT) { @@ -1062,6 +1216,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret < 0) iomap_dio_set_error(dio, ret); + /* + * If all the writes we issued were FUA, we don't need to flush the + * cache on IO completion. Clear the sync flag for this case. + */ + if (dio->flags & IOMAP_DIO_WRITE_FUA) + dio->flags &= ~IOMAP_DIO_NEED_SYNC; + if (!atomic_dec_and_test(&dio->ref)) { if (!is_sync_kiocb(iocb)) return -EIOCBQUEUED; @@ -1089,3 +1250,203 @@ out_free_dio: return ret; } EXPORT_SYMBOL_GPL(iomap_dio_rw); + +/* Swapfile activation */ + +#ifdef CONFIG_SWAP +struct iomap_swapfile_info { + struct iomap iomap; /* accumulated iomap */ + struct swap_info_struct *sis; + uint64_t lowest_ppage; /* lowest physical addr seen (pages) */ + uint64_t highest_ppage; /* highest physical addr seen (pages) */ + unsigned long nr_pages; /* number of pages collected */ + int nr_extents; /* extent count */ +}; + +/* + * Collect physical extents for this swap file. Physical extents reported to + * the swap code must be trimmed to align to a page boundary. The logical + * offset within the file is irrelevant since the swapfile code maps logical + * page numbers of the swap device to the physical page-aligned extents. + */ +static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) +{ + struct iomap *iomap = &isi->iomap; + unsigned long nr_pages; + uint64_t first_ppage; + uint64_t first_ppage_reported; + uint64_t next_ppage; + int error; + + /* + * Round the start up and the end down so that the physical + * extent aligns to a page boundary. + */ + first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; + next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> + PAGE_SHIFT; + + /* Skip too-short physical extents. */ + if (first_ppage >= next_ppage) + return 0; + nr_pages = next_ppage - first_ppage; + + /* + * Calculate how much swap space we're adding; the first page contains + * the swap header and doesn't count. The mm still wants that first + * page fed to add_swap_extent, however. + */ + first_ppage_reported = first_ppage; + if (iomap->offset == 0) + first_ppage_reported++; + if (isi->lowest_ppage > first_ppage_reported) + isi->lowest_ppage = first_ppage_reported; + if (isi->highest_ppage < (next_ppage - 1)) + isi->highest_ppage = next_ppage - 1; + + /* Add extent, set up for the next call. */ + error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage); + if (error < 0) + return error; + isi->nr_extents += error; + isi->nr_pages += nr_pages; + return 0; +} + +/* + * Accumulate iomaps for this swap file. We have to accumulate iomaps because + * swap only cares about contiguous page-aligned physical extents and makes no + * distinction between written and unwritten extents. + */ +static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, + loff_t count, void *data, struct iomap *iomap) +{ + struct iomap_swapfile_info *isi = data; + int error; + + switch (iomap->type) { + case IOMAP_MAPPED: + case IOMAP_UNWRITTEN: + /* Only real or unwritten extents. */ + break; + case IOMAP_INLINE: + /* No inline data. */ + pr_err("swapon: file is inline\n"); + return -EINVAL; + default: + pr_err("swapon: file has unallocated extents\n"); + return -EINVAL; + } + + /* No uncommitted metadata or shared blocks. */ + if (iomap->flags & IOMAP_F_DIRTY) { + pr_err("swapon: file is not committed\n"); + return -EINVAL; + } + if (iomap->flags & IOMAP_F_SHARED) { + pr_err("swapon: file has shared extents\n"); + return -EINVAL; + } + + /* Only one bdev per swap file. */ + if (iomap->bdev != isi->sis->bdev) { + pr_err("swapon: file is on multiple devices\n"); + return -EINVAL; + } + + if (isi->iomap.length == 0) { + /* No accumulated extent, so just store it. */ + memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); + } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) { + /* Append this to the accumulated extent. */ + isi->iomap.length += iomap->length; + } else { + /* Otherwise, add the retained iomap and store this one. */ + error = iomap_swapfile_add_extent(isi); + if (error) + return error; + memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); + } + return count; +} + +/* + * Iterate a swap file's iomaps to construct physical extents that can be + * passed to the swapfile subsystem. + */ +int iomap_swapfile_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *pagespan, + const struct iomap_ops *ops) +{ + struct iomap_swapfile_info isi = { + .sis = sis, + .lowest_ppage = (sector_t)-1ULL, + }; + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; + loff_t pos = 0; + loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE); + loff_t ret; + + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + return ret; + + while (len > 0) { + ret = iomap_apply(inode, pos, len, IOMAP_REPORT, + ops, &isi, iomap_swapfile_activate_actor); + if (ret <= 0) + return ret; + + pos += ret; + len -= ret; + } + + if (isi.iomap.length) { + ret = iomap_swapfile_add_extent(&isi); + if (ret) + return ret; + } + + *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; + sis->max = isi.nr_pages; + sis->pages = isi.nr_pages - 1; + sis->highest_bit = isi.nr_pages - 1; + return isi.nr_extents; +} +EXPORT_SYMBOL_GPL(iomap_swapfile_activate); +#endif /* CONFIG_SWAP */ + +static loff_t +iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + sector_t *bno = data, addr; + + if (iomap->type == IOMAP_MAPPED) { + addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; + if (addr > INT_MAX) + WARN(1, "would truncate bmap result\n"); + else + *bno = addr; + } + return 0; +} + +/* legacy ->bmap interface. 0 is the error return (!) */ +sector_t +iomap_bmap(struct address_space *mapping, sector_t bno, + const struct iomap_ops *ops) +{ + struct inode *inode = mapping->host; + loff_t pos = bno >> inode->i_blkbits; + unsigned blocksize = i_blocksize(inode); + + if (filemap_write_and_wait(mapping)) + return 0; + + bno = 0; + iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor); + return bno; +} +EXPORT_SYMBOL_GPL(iomap_bmap); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index dfb057900e79..8ef6b6daaa7a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -114,7 +114,7 @@ void __jbd2_debug(int level, const char *file, const char *func, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf); + printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf); va_end(args); } EXPORT_SYMBOL(__jbd2_debug); @@ -2302,8 +2302,7 @@ static void jbd2_journal_destroy_slabs(void) int i; for (i = 0; i < JBD2_MAX_SLABS; i++) { - if (jbd2_slab[i]) - kmem_cache_destroy(jbd2_slab[i]); + kmem_cache_destroy(jbd2_slab[i]); jbd2_slab[i] = NULL; } } @@ -2404,10 +2403,8 @@ static int jbd2_journal_init_journal_head_cache(void) static void jbd2_journal_destroy_journal_head_cache(void) { - if (jbd2_journal_head_cache) { - kmem_cache_destroy(jbd2_journal_head_cache); - jbd2_journal_head_cache = NULL; - } + kmem_cache_destroy(jbd2_journal_head_cache); + jbd2_journal_head_cache = NULL; } /* @@ -2665,11 +2662,10 @@ static int __init jbd2_journal_init_handle_cache(void) static void jbd2_journal_destroy_handle_cache(void) { - if (jbd2_handle_cache) - kmem_cache_destroy(jbd2_handle_cache); - if (jbd2_inode_cache) - kmem_cache_destroy(jbd2_inode_cache); - + kmem_cache_destroy(jbd2_handle_cache); + jbd2_handle_cache = NULL; + kmem_cache_destroy(jbd2_inode_cache); + jbd2_inode_cache = NULL; } /* diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 696ef15ec942..240779e4689c 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -180,14 +180,10 @@ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, void jbd2_journal_destroy_revoke_caches(void) { - if (jbd2_revoke_record_cache) { - kmem_cache_destroy(jbd2_revoke_record_cache); - jbd2_revoke_record_cache = NULL; - } - if (jbd2_revoke_table_cache) { - kmem_cache_destroy(jbd2_revoke_table_cache); - jbd2_revoke_table_cache = NULL; - } + kmem_cache_destroy(jbd2_revoke_record_cache); + jbd2_revoke_record_cache = NULL; + kmem_cache_destroy(jbd2_revoke_table_cache); + jbd2_revoke_table_cache = NULL; } int __init jbd2_journal_init_revoke_caches(void) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 8aa453784402..51dd68e67b0f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -49,10 +49,8 @@ int __init jbd2_journal_init_transaction_cache(void) void jbd2_journal_destroy_transaction_cache(void) { - if (transaction_cache) { - kmem_cache_destroy(transaction_cache); - transaction_cache = NULL; - } + kmem_cache_destroy(transaction_cache); + transaction_cache = NULL; } void jbd2_journal_free_transaction(transaction_t *transaction) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index fd5ce883072e..2015d8c45e4a 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -348,11 +348,11 @@ static void kernfs_vma_open(struct vm_area_struct *vma) kernfs_put_active(of->kn); } -static int kernfs_vma_fault(struct vm_fault *vmf) +static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); - int ret; + vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; @@ -368,11 +368,11 @@ static int kernfs_vma_fault(struct vm_fault *vmf) return ret; } -static int kernfs_vma_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); - int ret; + vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 97a972efab83..68728de12864 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -788,35 +788,34 @@ static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, spin_unlock(&lockres->l_lock); } -static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, - struct ocfs2_lock_holder *oh) -{ - spin_lock(&lockres->l_lock); - list_del(&oh->oh_list); - spin_unlock(&lockres->l_lock); - - put_pid(oh->oh_owner_pid); -} - -static inline int ocfs2_is_locked_by_me(struct ocfs2_lock_res *lockres) +static struct ocfs2_lock_holder * +ocfs2_pid_holder(struct ocfs2_lock_res *lockres, + struct pid *pid) { struct ocfs2_lock_holder *oh; - struct pid *pid; - /* look in the list of holders for one with the current task as owner */ spin_lock(&lockres->l_lock); - pid = task_pid(current); list_for_each_entry(oh, &lockres->l_holders, oh_list) { if (oh->oh_owner_pid == pid) { spin_unlock(&lockres->l_lock); - return 1; + return oh; } } spin_unlock(&lockres->l_lock); + return NULL; +} - return 0; +static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, + struct ocfs2_lock_holder *oh) +{ + spin_lock(&lockres->l_lock); + list_del(&oh->oh_list); + spin_unlock(&lockres->l_lock); + + put_pid(oh->oh_owner_pid); } + static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, int level) { @@ -2610,34 +2609,93 @@ void ocfs2_inode_unlock(struct inode *inode, * * return < 0 on error, return == 0 if there's no lock holder on the stack * before this call, return == 1 if this call would be a recursive locking. + * return == -1 if this lock attempt will cause an upgrade which is forbidden. + * + * When taking lock levels into account,we face some different situations. + * + * 1. no lock is held + * In this case, just lock the inode as requested and return 0 + * + * 2. We are holding a lock + * For this situation, things diverges into several cases + * + * wanted holding what to do + * ex ex see 2.1 below + * ex pr see 2.2 below + * pr ex see 2.1 below + * pr pr see 2.1 below + * + * 2.1 lock level that is been held is compatible + * with the wanted level, so no lock action will be tacken. + * + * 2.2 Otherwise, an upgrade is needed, but it is forbidden. + * + * Reason why upgrade within a process is forbidden is that + * lock upgrade may cause dead lock. The following illustrates + * how it happens. + * + * thread on node1 thread on node2 + * ocfs2_inode_lock_tracker(ex=0) + * + * <====== ocfs2_inode_lock_tracker(ex=1) + * + * ocfs2_inode_lock_tracker(ex=1) */ int ocfs2_inode_lock_tracker(struct inode *inode, struct buffer_head **ret_bh, int ex, struct ocfs2_lock_holder *oh) { - int status; - int arg_flags = 0, has_locked; + int status = 0; struct ocfs2_lock_res *lockres; + struct ocfs2_lock_holder *tmp_oh; + struct pid *pid = task_pid(current); + lockres = &OCFS2_I(inode)->ip_inode_lockres; - has_locked = ocfs2_is_locked_by_me(lockres); - /* Just get buffer head if the cluster lock has been taken */ - if (has_locked) - arg_flags = OCFS2_META_LOCK_GETBH; + tmp_oh = ocfs2_pid_holder(lockres, pid); - if (likely(!has_locked || ret_bh)) { - status = ocfs2_inode_lock_full(inode, ret_bh, ex, arg_flags); + if (!tmp_oh) { + /* + * This corresponds to the case 1. + * We haven't got any lock before. + */ + status = ocfs2_inode_lock_full(inode, ret_bh, ex, 0); if (status < 0) { if (status != -ENOENT) mlog_errno(status); return status; } - } - if (!has_locked) + + oh->oh_ex = ex; ocfs2_add_holder(lockres, oh); + return 0; + } - return has_locked; + if (unlikely(ex && !tmp_oh->oh_ex)) { + /* + * case 2.2 upgrade may cause dead lock, forbid it. + */ + mlog(ML_ERROR, "Recursive locking is not permitted to " + "upgrade to EX level from PR level.\n"); + dump_stack(); + return -EINVAL; + } + + /* + * case 2.1 OCFS2_META_LOCK_GETBH flag make ocfs2_inode_lock_full. + * ignore the lock level and just update it. + */ + if (ret_bh) { + status = ocfs2_inode_lock_full(inode, ret_bh, ex, + OCFS2_META_LOCK_GETBH); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + } + return tmp_oh ? 1 : 0; } void ocfs2_inode_unlock_tracker(struct inode *inode, @@ -2649,12 +2707,13 @@ void ocfs2_inode_unlock_tracker(struct inode *inode, lockres = &OCFS2_I(inode)->ip_inode_lockres; /* had_lock means that the currect process already takes the cluster - * lock previously. If had_lock is 1, we have nothing to do here, and - * it will get unlocked where we got the lock. + * lock previously. + * If had_lock is 1, we have nothing to do here. + * If had_lock is 0, we will release the lock. */ if (!had_lock) { + ocfs2_inode_unlock(inode, oh->oh_ex); ocfs2_remove_holder(lockres, oh); - ocfs2_inode_unlock(inode, ex); } } diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 256e0a9067b8..4ec1c828f6e0 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -96,6 +96,7 @@ struct ocfs2_trim_fs_info { struct ocfs2_lock_holder { struct list_head oh_list; struct pid *oh_owner_pid; + int oh_ex; }; /* ocfs2_inode_lock_full() 'arg_flags' flags */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6ee94bc23f5b..a2a8603d27e0 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -563,8 +563,8 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, return ret; } -static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, - u32 clusters_to_add, int mark_unwritten) +static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) { int status = 0; int restart_func = 0; @@ -1035,8 +1035,8 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, clusters_to_add -= oi->ip_clusters; if (clusters_to_add) { - ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, - clusters_to_add, 0); + ret = ocfs2_extend_allocation(inode, oi->ip_clusters, + clusters_to_add, 0); if (ret) { mlog_errno(ret); goto out; @@ -1493,7 +1493,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode, goto next; } - ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); + ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 1fdc9839cd93..7eb7f03531f6 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -65,8 +65,6 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); -int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, - u32 clusters_to_add, int mark_unwritten); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index ab30c005cc4b..994726ada857 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -402,7 +402,7 @@ out_err: static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, unsigned int chunksize) { - int index; + u32 index; index = __ilog2_u32(chunksize); if (index >= OCFS2_INFO_MAX_HIST) diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index fb9a20e3d608..05220b365fb9 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -44,11 +44,11 @@ #include "ocfs2_trace.h" -static int ocfs2_fault(struct vm_fault *vmf) +static vm_fault_t ocfs2_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; sigset_t oldset; - int ret; + vm_fault_t ret; ocfs2_block_signals(&oldset); ret = filemap_fault(vmf); @@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_fault *vmf) return ret; } -static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, - struct page *page) +static vm_fault_t __ocfs2_page_mkwrite(struct file *file, + struct buffer_head *di_bh, struct page *page) { - int ret = VM_FAULT_NOPAGE; + int err; + vm_fault_t ret = VM_FAULT_NOPAGE; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; loff_t pos = page_offset(page); @@ -105,15 +106,12 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, if (page->index == last_index) len = ((size - 1) & ~PAGE_MASK) + 1; - ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, + err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, &locked_page, &fsdata, di_bh, page); - if (ret) { - if (ret != -ENOSPC) - mlog_errno(ret); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; + if (err) { + if (err != -ENOSPC) + mlog_errno(err); + ret = vmf_error(err); goto out; } @@ -121,20 +119,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, ret = VM_FAULT_NOPAGE; goto out; } - ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata); - BUG_ON(ret != len); + err = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata); + BUG_ON(err != len); ret = VM_FAULT_LOCKED; out: return ret; } -static int ocfs2_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct buffer_head *di_bh = NULL; sigset_t oldset; - int ret; + int err; + vm_fault_t ret; sb_start_pagefault(inode->i_sb); ocfs2_block_signals(&oldset); @@ -144,13 +143,10 @@ static int ocfs2_page_mkwrite(struct vm_fault *vmf) * node. Taking the data lock will also ensure that we don't * attempt page truncation as part of a downconvert. */ - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - mlog_errno(ret); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; + err = ocfs2_inode_lock(inode, &di_bh, 1); + if (err < 0) { + mlog_errno(err); + ret = vmf_error(err); goto out; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8dd6f703c819..b7ca84bc3df7 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2332,8 +2332,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh, bool dio) { - const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN; - char name[namelen + 1]; + char name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1]; struct ocfs2_dinode *orphan_fe; int status = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 5bb4a89f9045..7071ad0dec90 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -807,11 +807,11 @@ struct ocfs2_dir_block_trailer { * in this block. (unused) */ /*10*/ __u8 db_signature[8]; /* Signature for verification */ __le64 db_reserved2; - __le64 db_free_next; /* Next block in list (unused) */ -/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */ - __le64 db_parent_dinode; /* dinode which owns me, in +/*20*/ __le64 db_free_next; /* Next block in list (unused) */ + __le64 db_blkno; /* Offset on disk, in blocks */ +/*30*/ __le64 db_parent_dinode; /* dinode which owns me, in blocks */ -/*30*/ struct ocfs2_block_check db_check; /* Error checking */ + struct ocfs2_block_check db_check; /* Error checking */ /*40*/ }; diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index 66369ec90020..74b37cbbd5d4 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -281,14 +281,17 @@ restart: ret = copy_to_user(buf, &proto_ver, sizeof(__s32)); if (ret != 0) goto error; - ret = copy_to_user(buf+sizeof(__s32), &magic, sizeof(__s32)); + ret = copy_to_user(buf + sizeof(__s32), &magic, sizeof(__s32)); if (ret != 0) goto error; - ret = copy_to_user(buf+2 * sizeof(__s32), &cur_op->tag, sizeof(__u64)); + ret = copy_to_user(buf + 2 * sizeof(__s32), + &cur_op->tag, + sizeof(__u64)); if (ret != 0) goto error; - ret = copy_to_user(buf+2*sizeof(__s32)+sizeof(__u64), &cur_op->upcall, - sizeof(struct orangefs_upcall_s)); + ret = copy_to_user(buf + 2 * sizeof(__s32) + sizeof(__u64), + &cur_op->upcall, + sizeof(struct orangefs_upcall_s)); if (ret != 0) goto error; @@ -381,7 +384,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, (unsigned int) MAX_DEV_REQ_DOWNSIZE); return -EFAULT; } - + if (!copy_from_iter_full(&head, head_size, iter)) { gossip_err("%s: failed to copy head.\n", __func__); return -EFAULT; @@ -426,7 +429,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, goto wakeup; /* - * We've successfully peeled off the head and the downcall. + * We've successfully peeled off the head and the downcall. * Something has gone awry if total doesn't equal the * sum of head_size, downcall_size and trailer_size. */ @@ -477,7 +480,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, wakeup: /* * Return to vfs waitqueue, and back to service_operation - * through wait_for_matching_downcall. + * through wait_for_matching_downcall. */ spin_lock(&op->lock); if (unlikely(op_is_cancel(op))) { diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 26358efbf794..db0b52187cbc 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -162,7 +162,7 @@ populate_shared_memory: else ret = 0; break; - /* + /* * If the op was in progress when the interrupt * occurred, then the client-core was able to * trigger the write. @@ -544,7 +544,7 @@ static int orangefs_fault(struct vm_fault *vmf) return filemap_fault(vmf); } -const struct vm_operations_struct orangefs_file_vm_ops = { +static const struct vm_operations_struct orangefs_file_vm_ops = { .fault = orangefs_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 79c61da8b1bc..d6db252e6200 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -20,8 +20,8 @@ static int read_one_page(struct page *page) int max_block; ssize_t bytes_read = 0; struct inode *inode = page->mapping->host; - const __u32 blocksize = PAGE_SIZE; /* inode->i_blksize */ - const __u32 blockbits = PAGE_SHIFT; /* inode->i_blkbits */ + const __u32 blocksize = PAGE_SIZE; + const __u32 blockbits = PAGE_SHIFT; struct iov_iter to; struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE}; @@ -181,16 +181,15 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) new_op->upcall.req.truncate.refn = orangefs_inode->refn; new_op->upcall.req.truncate.size = (__s64) iattr->ia_size; - ret = service_operation(new_op, __func__, - get_interruptible_flag(inode)); + ret = service_operation(new_op, + __func__, + get_interruptible_flag(inode)); /* * the truncate has no downcall members to retrieve, but * the status value tells us if it went through ok or not */ - gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs: orangefs_truncate got return value of %d\n", - ret); + gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret); op_release(new_op); @@ -212,8 +211,9 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) struct inode *inode = dentry->d_inode; gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_setattr: called on %pd\n", - dentry); + "%s: called on %pd\n", + __func__, + dentry); ret = setattr_prepare(dentry, iattr); if (ret) @@ -230,15 +230,16 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) ret = orangefs_inode_setattr(inode, iattr); gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_setattr: inode_setattr returned %d\n", - ret); + "%s: orangefs_inode_setattr returned %d\n", + __func__, + ret); if (!ret && (iattr->ia_valid & ATTR_MODE)) /* change mod on a file that has ACLs */ ret = posix_acl_chmod(inode, inode->i_mode); out: - gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", ret); + gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret); return ret; } @@ -262,13 +263,19 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, /* override block size reported to stat */ orangefs_inode = ORANGEFS_I(inode); - stat->blksize = orangefs_inode->blksize; if (request_mask & STATX_SIZE) stat->result_mask = STATX_BASIC_STATS; else stat->result_mask = STATX_BASIC_STATS & ~STATX_SIZE; + + stat->attributes_mask = STATX_ATTR_IMMUTABLE | + STATX_ATTR_APPEND; + if (inode->i_flags & S_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (inode->i_flags & S_APPEND) + stat->attributes |= STATX_ATTR_APPEND; } return ret; } @@ -306,7 +313,7 @@ int orangefs_update_time(struct inode *inode, struct timespec *time, int flags) return orangefs_inode_setattr(inode, &iattr); } -/* ORANGEDS2 implementation of VFS inode operations for files */ +/* ORANGEFS2 implementation of VFS inode operations for files */ static const struct inode_operations orangefs_file_inode_operations = { .get_acl = orangefs_get_acl, .set_acl = orangefs_set_acl, @@ -325,7 +332,6 @@ static int orangefs_init_iops(struct inode *inode) case S_IFREG: inode->i_op = &orangefs_file_inode_operations; inode->i_fop = &orangefs_file_operations; - inode->i_blkbits = PAGE_SHIFT; break; case S_IFLNK: inode->i_op = &orangefs_symlink_inode_operations; @@ -345,8 +351,8 @@ static int orangefs_init_iops(struct inode *inode) } /* - * Given a ORANGEFS object identifier (fsid, handle), convert it into a ino_t type - * that will be used as a hash-index from where the handle will + * Given an ORANGEFS object identifier (fsid, handle), convert it into + * a ino_t type that will be used as a hash-index from where the handle will * be searched for in the VFS hash table of inodes. */ static inline ino_t orangefs_handle_hash(struct orangefs_object_kref *ref) @@ -376,8 +382,10 @@ static int orangefs_test_inode(struct inode *inode, void *data) struct orangefs_inode_s *orangefs_inode = NULL; orangefs_inode = ORANGEFS_I(inode); - return (!ORANGEFS_khandle_cmp(&(orangefs_inode->refn.khandle), &(ref->khandle)) - && orangefs_inode->refn.fs_id == ref->fs_id); + /* test handles and fs_ids... */ + return (!ORANGEFS_khandle_cmp(&(orangefs_inode->refn.khandle), + &(ref->khandle)) && + orangefs_inode->refn.fs_id == ref->fs_id); } /* @@ -385,16 +393,21 @@ static int orangefs_test_inode(struct inode *inode, void *data) * file handle. * * @sb: the file system super block instance. - * @ref: The ORANGEFS object for which we are trying to locate an inode structure. + * @ref: The ORANGEFS object for which we are trying to locate an inode. */ -struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref *ref) +struct inode *orangefs_iget(struct super_block *sb, + struct orangefs_object_kref *ref) { struct inode *inode = NULL; unsigned long hash; int error; hash = orangefs_handle_hash(ref); - inode = iget5_locked(sb, hash, orangefs_test_inode, orangefs_set_inode, ref); + inode = iget5_locked(sb, + hash, + orangefs_test_inode, + orangefs_set_inode, + ref); if (!inode || !(inode->i_state & I_NEW)) return inode; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 365cd73d9109..625b0580f9be 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -278,6 +278,13 @@ static int orangefs_symlink(struct inode *dir, ret = PTR_ERR(inode); goto out; } + /* + * This is necessary because orangefs_inode_getattr will not + * re-read symlink size as it is impossible for it to change. + * Invalidating the cache does not help. orangefs_new_inode + * does not set the correct size (it does not know symname). + */ + inode->i_size = strlen(symname); gossip_debug(GOSSIP_NAME_DEBUG, "Assigned symlink inode new number of %pU\n", diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 4f927023d095..c4e98c9c1621 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -138,7 +138,7 @@ static int get(struct slot_map *m) /* used to describe mapped buffers */ struct orangefs_bufmap_desc { - void *uaddr; /* user space address pointer */ + void __user *uaddr; /* user space address pointer */ struct page **page_array; /* array of mapped pages */ int array_count; /* size of above arrays */ struct list_head list_link; @@ -184,7 +184,7 @@ orangefs_bufmap_free(struct orangefs_bufmap *bufmap) } /* - * XXX: Can the size and shift change while the caller gives up the + * XXX: Can the size and shift change while the caller gives up the * XXX: lock between calling this and doing something useful? */ @@ -215,20 +215,6 @@ int orangefs_bufmap_shift_query(void) static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq); static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq); -/* - * orangefs_get_bufmap_init - * - * If bufmap_init is 1, then the shared memory system, including the - * buffer_index_array, is available. Otherwise, it is not. - * - * returns the value of bufmap_init - */ -int orangefs_get_bufmap_init(void) -{ - return __orangefs_bufmap ? 1 : 0; -} - - static struct orangefs_bufmap * orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc) { @@ -496,7 +482,7 @@ void orangefs_readdir_index_put(int buffer_index) } /* - * we've been handed an iovec, we need to copy it to + * we've been handed an iovec, we need to copy it to * the shared memory descriptor at "buffer_index". */ int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter, diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 6e35f2f3c897..0732cb08173e 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -114,7 +114,7 @@ static const struct seq_operations help_debug_ops = { .show = help_show, }; -const struct file_operations debug_help_fops = { +static const struct file_operations debug_help_fops = { .owner = THIS_MODULE, .open = orangefs_debug_help_open, .read = seq_read, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index c29bb0ebc6bb..004511617b6d 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -182,7 +182,6 @@ static inline void set_op_state_purged(struct orangefs_kernel_op_s *op) struct orangefs_inode_s { struct orangefs_object_kref refn; char link_target[ORANGEFS_NAME_MAX]; - __s64 blksize; /* * Reading/Writing Extended attributes need to acquire the appropriate * reader/writer semaphore on the orangefs_inode_s structure. diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 00fadaf0da8f..804c8a261e4b 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -183,9 +183,9 @@ static inline int copy_attributes_from_inode(struct inode *inode, attrs->mask |= ORANGEFS_ATTR_SYS_CTIME; /* - * ORANGEFS cannot set size with a setattr operation. Probably not likely - * to be requested through the VFS, but just in case, don't worry about - * ATTR_SIZE + * ORANGEFS cannot set size with a setattr operation. Probably not + * likely to be requested through the VFS, but just in case, don't + * worry about ATTR_SIZE */ if (iattr->ia_valid & ATTR_MODE) { @@ -200,14 +200,16 @@ static inline int copy_attributes_from_inode(struct inode *inode, tmp_mode -= S_ISVTX; } else { gossip_debug(GOSSIP_UTILS_DEBUG, - "User attempted to set sticky bit on non-root directory; returning EINVAL.\n"); + "%s: setting sticky bit not supported.\n", + __func__); return -EINVAL; } } if (tmp_mode & (S_ISUID)) { gossip_debug(GOSSIP_UTILS_DEBUG, - "Attempting to set setuid bit (not supported); returning EINVAL.\n"); + "%s: setting setuid bit not supported.\n", + __func__); return -EINVAL; } @@ -275,7 +277,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass, { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; - loff_t inode_size, rounded_up_size; + loff_t inode_size; int ret, type; gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__, @@ -330,22 +332,19 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass, if (request_mask & STATX_SIZE || new) { inode_size = (loff_t)new_op-> downcall.resp.getattr.attributes.size; - rounded_up_size = - (inode_size + (4096 - (inode_size % 4096))); inode->i_size = inode_size; - orangefs_inode->blksize = - new_op->downcall.resp.getattr.attributes.blksize; + inode->i_blkbits = ffs(new_op->downcall.resp.getattr. + attributes.blksize); spin_lock(&inode->i_lock); inode->i_bytes = inode_size; inode->i_blocks = - (unsigned long)(rounded_up_size / 512); + (inode_size + 512 - inode_size % 512)/512; spin_unlock(&inode->i_lock); } break; case S_IFDIR: if (request_mask & STATX_SIZE || new) { inode->i_size = PAGE_SIZE; - orangefs_inode->blksize = i_blocksize(inode); spin_lock(&inode->i_lock); inode_set_bytes(inode, inode->i_size); spin_unlock(&inode->i_lock); @@ -356,7 +355,6 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass, if (new) { inode->i_size = (loff_t)strlen(new_op-> downcall.resp.getattr.link_target); - orangefs_inode->blksize = i_blocksize(inode); ret = strscpy(orangefs_inode->link_target, new_op->downcall.resp.getattr.link_target, ORANGEFS_NAME_MAX); @@ -525,7 +523,9 @@ int orangefs_normalize_to_errno(__s32 error_code) error_code = -ETIMEDOUT; } else { /* assume a default error code */ - gossip_err("orangefs: warning: got error code without errno equivalent: %d.\n", error_code); + gossip_err("%s: bad error code :%d:.\n", + __func__, + error_code); error_code = -EINVAL; } @@ -542,7 +542,7 @@ int orangefs_normalize_to_errno(__s32 error_code) * there is a bug somewhere. */ } else { - gossip_err("orangefs: orangefs_normalize_to_errno: got error code which is not from ORANGEFS.\n"); + gossip_err("%s: unknown error code.\n", __func__); error_code = -EINVAL; } return error_code; diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h index 61ee8d64c842..d403cf29a99b 100644 --- a/fs/orangefs/protocol.h +++ b/fs/orangefs/protocol.h @@ -342,7 +342,7 @@ enum { * that may be 32 bit! */ struct ORANGEFS_dev_map_desc { - void *ptr; + void __user *ptr; __s32 total_size; __s32 size; __s32 count; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 10796d3fe27d..dfaee90d30bd 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -156,9 +156,10 @@ static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf) sb = dentry->d_sb; gossip_debug(GOSSIP_SUPER_DEBUG, - "orangefs_statfs: called on sb %p (fs_id is %d)\n", - sb, - (int)(ORANGEFS_SB(sb)->fs_id)); + "%s: called on sb %p (fs_id is %d)\n", + __func__, + sb, + (int)(ORANGEFS_SB(sb)->fs_id)); new_op = op_alloc(ORANGEFS_VFS_OP_STATFS); if (!new_op) @@ -198,7 +199,7 @@ static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf) out_op_release: op_release(new_op); - gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_statfs: returning %d\n", ret); + gossip_debug(GOSSIP_SUPER_DEBUG, "%s: returning %d\n", __func__, ret); return ret; } @@ -423,8 +424,8 @@ static int orangefs_fill_sb(struct super_block *sb, sb->s_op = &orangefs_s_ops; sb->s_d_op = &orangefs_dentry_operations; - sb->s_blocksize = orangefs_bufmap_size_query(); - sb->s_blocksize_bits = orangefs_bufmap_shift_query(); + sb->s_blocksize = PAGE_SIZE; + sb->s_blocksize_bits = PAGE_SHIFT; sb->s_maxbytes = MAX_LFS_FILESIZE; root_object.khandle = ORANGEFS_SB(sb)->root_khandle; diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c index 0577d6dba8c8..0729d2645d6a 100644 --- a/fs/orangefs/waitqueue.c +++ b/fs/orangefs/waitqueue.c @@ -17,8 +17,12 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" -static int wait_for_matching_downcall(struct orangefs_kernel_op_s *, long, bool); -static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *); +static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op, + long timeout, + bool interruptible) + __acquires(op->lock); +static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op) + __releases(op->lock); /* * What we do in this function is to walk the list of operations that are @@ -246,6 +250,7 @@ bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op) */ static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op) + __releases(op->lock) { /* * handle interrupted cases depending on what state we were in when @@ -313,8 +318,9 @@ static void * Returns with op->lock taken. */ static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op, - long timeout, - bool interruptible) + long timeout, + bool interruptible) + __acquires(op->lock) { long n; diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 17032631c5cf..9384164253ac 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -11,7 +11,7 @@ config OVERLAY_FS For more information see Documentation/filesystems/overlayfs.txt config OVERLAY_FS_REDIRECT_DIR - bool "Overlayfs: turn on redirect dir feature by default" + bool "Overlayfs: turn on redirect directory feature by default" depends on OVERLAY_FS help If this config option is enabled then overlay filesystems will use @@ -46,7 +46,7 @@ config OVERLAY_FS_INDEX depends on OVERLAY_FS help If this config option is enabled then overlay filesystems will use - the inodes index dir to map lower inodes to upper inodes by default. + the index directory to map lower inodes to upper inodes by default. In this case it is still possible to turn off index globally with the "index=off" module option or on a filesystem instance basis with the "index=off" mount option. @@ -66,7 +66,7 @@ config OVERLAY_FS_NFS_EXPORT depends on OVERLAY_FS_INDEX help If this config option is enabled then overlay filesystems will use - the inodes index dir to decode overlay NFS file handles by default. + the index directory to decode overlay NFS file handles by default. In this case, it is still possible to turn off NFS export support globally with the "nfs_export=off" module option or on a filesystem instance basis with the "nfs_export=off" mount option. diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 8bede0742619..ddaddb4ce4c3 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -365,17 +365,14 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin, if (err) return err; - temp = ovl_lookup_temp(indexdir); + temp = ovl_create_temp(indexdir, OVL_CATTR(S_IFDIR | 0)); + err = PTR_ERR(temp); if (IS_ERR(temp)) - goto temp_err; - - err = ovl_do_mkdir(dir, temp, S_IFDIR, true); - if (err) - goto out; + goto free_name; err = ovl_set_upper_fh(upper, temp); if (err) - goto out_cleanup; + goto out; index = lookup_one_len(name.name, indexdir, name.len); if (IS_ERR(index)) { @@ -384,23 +381,13 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin, err = ovl_do_rename(dir, temp, dir, index, 0); dput(index); } - - if (err) - goto out_cleanup; - out: + if (err) + ovl_cleanup(dir, temp); dput(temp); +free_name: kfree(name.name); return err; - -temp_err: - err = PTR_ERR(temp); - temp = NULL; - goto out; - -out_cleanup: - ovl_cleanup(dir, temp); - goto out; } struct ovl_copy_up_ctx { @@ -439,8 +426,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) c->dentry->d_name.len); err = PTR_ERR(upper); if (!IS_ERR(upper)) { - err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper, - true); + err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper); dput(upper); if (!err) { @@ -470,7 +456,7 @@ static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp, return PTR_ERR(upper); if (c->tmpfile) - err = ovl_do_link(temp, udir, upper, true); + err = ovl_do_link(temp, udir, upper); else err = ovl_do_rename(d_inode(c->workdir), temp, udir, upper, 0); @@ -481,13 +467,13 @@ static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp, return err; } -static int ovl_get_tmpfile(struct ovl_copy_up_ctx *c, struct dentry **tempp) +static struct dentry *ovl_get_tmpfile(struct ovl_copy_up_ctx *c) { int err; struct dentry *temp; const struct cred *old_creds = NULL; struct cred *new_creds = NULL; - struct cattr cattr = { + struct ovl_cattr cattr = { /* Can't properly set mode on creation because of the umask */ .mode = c->stat.mode & S_IFMT, .rdev = c->stat.rdev, @@ -495,41 +481,24 @@ static int ovl_get_tmpfile(struct ovl_copy_up_ctx *c, struct dentry **tempp) }; err = security_inode_copy_up(c->dentry, &new_creds); + temp = ERR_PTR(err); if (err < 0) goto out; if (new_creds) old_creds = override_creds(new_creds); - if (c->tmpfile) { + if (c->tmpfile) temp = ovl_do_tmpfile(c->workdir, c->stat.mode); - if (IS_ERR(temp)) - goto temp_err; - } else { - temp = ovl_lookup_temp(c->workdir); - if (IS_ERR(temp)) - goto temp_err; - - err = ovl_create_real(d_inode(c->workdir), temp, &cattr, - NULL, true); - if (err) { - dput(temp); - goto out; - } - } - err = 0; - *tempp = temp; + else + temp = ovl_create_temp(c->workdir, &cattr); out: if (new_creds) { revert_creds(old_creds); put_cred(new_creds); } - return err; - -temp_err: - err = PTR_ERR(temp); - goto out; + return temp; } static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) @@ -579,21 +548,21 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) struct inode *udir = c->destdir->d_inode; struct inode *inode; struct dentry *newdentry = NULL; - struct dentry *temp = NULL; + struct dentry *temp; int err; - err = ovl_get_tmpfile(c, &temp); - if (err) - goto out; + temp = ovl_get_tmpfile(c); + if (IS_ERR(temp)) + return PTR_ERR(temp); err = ovl_copy_up_inode(c, temp); if (err) - goto out_cleanup; + goto out; if (S_ISDIR(c->stat.mode) && c->indexed) { err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp); if (err) - goto out_cleanup; + goto out; } if (c->tmpfile) { @@ -604,7 +573,7 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) err = ovl_install_temp(c, temp, &newdentry); } if (err) - goto out_cleanup; + goto out; inode = d_inode(c->dentry); ovl_inode_update(inode, newdentry); @@ -612,13 +581,11 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) ovl_set_flag(OVL_WHITEOUTS, inode); out: + if (err && !c->tmpfile) + ovl_cleanup(d_inode(c->workdir), temp); dput(temp); return err; -out_cleanup: - if (!c->tmpfile) - ovl_cleanup(d_inode(c->workdir), temp); - goto out; } /* diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 839709c7803a..f480b1a2cd2e 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -43,7 +43,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry) return err; } -struct dentry *ovl_lookup_temp(struct dentry *workdir) +static struct dentry *ovl_lookup_temp(struct dentry *workdir) { struct dentry *temp; char name[20]; @@ -114,36 +114,72 @@ kill_whiteout: goto out; } -int ovl_create_real(struct inode *dir, struct dentry *newdentry, - struct cattr *attr, struct dentry *hardlink, bool debug) +static int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry, + umode_t mode) { int err; + struct dentry *d, *dentry = *newdentry; + err = ovl_do_mkdir(dir, dentry, mode); + if (err) + return err; + + if (likely(!d_unhashed(dentry))) + return 0; + + /* + * vfs_mkdir() may succeed and leave the dentry passed + * to it unhashed and negative. If that happens, try to + * lookup a new hashed and positive dentry. + */ + d = lookup_one_len(dentry->d_name.name, dentry->d_parent, + dentry->d_name.len); + if (IS_ERR(d)) { + pr_warn("overlayfs: failed lookup after mkdir (%pd2, err=%i).\n", + dentry, err); + return PTR_ERR(d); + } + dput(dentry); + *newdentry = d; + + return 0; +} + +struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct ovl_cattr *attr) +{ + int err; + + if (IS_ERR(newdentry)) + return newdentry; + + err = -ESTALE; if (newdentry->d_inode) - return -ESTALE; + goto out; - if (hardlink) { - err = ovl_do_link(hardlink, dir, newdentry, debug); + if (attr->hardlink) { + err = ovl_do_link(attr->hardlink, dir, newdentry); } else { switch (attr->mode & S_IFMT) { case S_IFREG: - err = ovl_do_create(dir, newdentry, attr->mode, debug); + err = ovl_do_create(dir, newdentry, attr->mode); break; case S_IFDIR: - err = ovl_do_mkdir(dir, newdentry, attr->mode, debug); + /* mkdir is special... */ + err = ovl_mkdir_real(dir, &newdentry, attr->mode); break; case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: - err = ovl_do_mknod(dir, newdentry, - attr->mode, attr->rdev, debug); + err = ovl_do_mknod(dir, newdentry, attr->mode, + attr->rdev); break; case S_IFLNK: - err = ovl_do_symlink(dir, newdentry, attr->link, debug); + err = ovl_do_symlink(dir, newdentry, attr->link); break; default: @@ -155,9 +191,20 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry, * Not quite sure if non-instantiated dentry is legal or not. * VFS doesn't seem to care so check and warn here. */ - err = -ENOENT; + err = -EIO; } - return err; +out: + if (err) { + dput(newdentry); + return ERR_PTR(err); + } + return newdentry; +} + +struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr) +{ + return ovl_create_real(d_inode(workdir), ovl_lookup_temp(workdir), + attr); } static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, @@ -182,24 +229,54 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) return ovl_set_opaque_xerr(dentry, upperdentry, -EIO); } -/* Common operations required to be done after creation of file on upper */ -static void ovl_instantiate(struct dentry *dentry, struct inode *inode, - struct dentry *newdentry, bool hardlink) +/* + * Common operations required to be done after creation of file on upper. + * If @hardlink is false, then @inode is a pre-allocated inode, we may or + * may not use to instantiate the new dentry. + */ +static int ovl_instantiate(struct dentry *dentry, struct inode *inode, + struct dentry *newdentry, bool hardlink) { + struct ovl_inode_params oip = { + .upperdentry = newdentry, + .newinode = inode, + }; + ovl_dentry_version_inc(dentry->d_parent, false); ovl_dentry_set_upper_alias(dentry); if (!hardlink) { - ovl_inode_update(inode, newdentry); - ovl_copyattr(newdentry->d_inode, inode); + /* + * ovl_obtain_alias() can be called after ovl_create_real() + * and before we get here, so we may get an inode from cache + * with the same real upperdentry that is not the inode we + * pre-allocated. In this case we will use the cached inode + * to instantiate the new dentry. + * + * XXX: if we ever use ovl_obtain_alias() to decode directory + * file handles, need to use ovl_get_inode_locked() and + * d_instantiate_new() here to prevent from creating two + * hashed directory inode aliases. + */ + inode = ovl_get_inode(dentry->d_sb, &oip); + if (WARN_ON(IS_ERR(inode))) + return PTR_ERR(inode); } else { WARN_ON(ovl_inode_real(inode) != d_inode(newdentry)); dput(newdentry); inc_nlink(inode); } + d_instantiate(dentry, inode); + if (inode != oip.newinode) { + pr_warn_ratelimited("overlayfs: newly created inode found in cache (%pd2)\n", + dentry); + } + /* Force lookup of new upper hardlink to find its lower */ if (hardlink) d_drop(dentry); + + return 0; } static bool ovl_type_merge(struct dentry *dentry) @@ -213,38 +290,42 @@ static bool ovl_type_origin(struct dentry *dentry) } static int ovl_create_upper(struct dentry *dentry, struct inode *inode, - struct cattr *attr, struct dentry *hardlink) + struct ovl_cattr *attr) { struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); struct inode *udir = upperdir->d_inode; struct dentry *newdentry; int err; - if (!hardlink && !IS_POSIXACL(udir)) + if (!attr->hardlink && !IS_POSIXACL(udir)) attr->mode &= ~current_umask(); inode_lock_nested(udir, I_MUTEX_PARENT); - newdentry = lookup_one_len(dentry->d_name.name, upperdir, - dentry->d_name.len); + newdentry = ovl_create_real(udir, + lookup_one_len(dentry->d_name.name, + upperdir, + dentry->d_name.len), + attr); err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) goto out_unlock; - err = ovl_create_real(udir, newdentry, attr, hardlink, false); - if (err) - goto out_dput; if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry)) { /* Setting opaque here is just an optimization, allow to fail */ ovl_set_opaque(dentry, newdentry); } - ovl_instantiate(dentry, inode, newdentry, !!hardlink); - newdentry = NULL; -out_dput: - dput(newdentry); + err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink); + if (err) + goto out_cleanup; out_unlock: inode_unlock(udir); return err; + +out_cleanup: + ovl_cleanup(udir, newdentry); + dput(newdentry); + goto out_unlock; } static struct dentry *ovl_clear_empty(struct dentry *dentry, @@ -280,16 +361,11 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (upper->d_parent->d_inode != udir) goto out_unlock; - opaquedir = ovl_lookup_temp(workdir); + opaquedir = ovl_create_temp(workdir, OVL_CATTR(stat.mode)); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) goto out_unlock; - err = ovl_create_real(wdir, opaquedir, - &(struct cattr){.mode = stat.mode}, NULL, true); - if (err) - goto out_dput; - err = ovl_copy_xattr(upper, opaquedir); if (err) goto out_cleanup; @@ -319,7 +395,6 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, out_cleanup: ovl_cleanup(wdir, opaquedir); -out_dput: dput(opaquedir); out_unlock: unlock_rename(workdir, upperdir); @@ -354,8 +429,7 @@ out_free: } static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, - struct cattr *cattr, - struct dentry *hardlink) + struct ovl_cattr *cattr) { struct dentry *workdir = ovl_workdir(dentry); struct inode *wdir = workdir->d_inode; @@ -365,6 +439,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; struct posix_acl *acl, *default_acl; + bool hardlink = !!cattr->hardlink; if (WARN_ON(!workdir)) return -EROFS; @@ -380,20 +455,16 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out; - newdentry = ovl_lookup_temp(workdir); - err = PTR_ERR(newdentry); - if (IS_ERR(newdentry)) - goto out_unlock; - upper = lookup_one_len(dentry->d_name.name, upperdir, dentry->d_name.len); err = PTR_ERR(upper); if (IS_ERR(upper)) - goto out_dput; + goto out_unlock; - err = ovl_create_real(wdir, newdentry, cattr, hardlink, true); - if (err) - goto out_dput2; + newdentry = ovl_create_temp(workdir, cattr); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_dput; /* * mode could have been mutilated due to umask (e.g. sgid directory) @@ -439,12 +510,11 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (err) goto out_cleanup; } - ovl_instantiate(dentry, inode, newdentry, !!hardlink); - newdentry = NULL; -out_dput2: - dput(upper); + err = ovl_instantiate(dentry, inode, newdentry, hardlink); + if (err) + goto out_cleanup; out_dput: - dput(newdentry); + dput(upper); out_unlock: unlock_rename(workdir, upperdir); out: @@ -456,12 +526,12 @@ out: out_cleanup: ovl_cleanup(wdir, newdentry); - goto out_dput2; + dput(newdentry); + goto out_dput; } static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, - struct cattr *attr, struct dentry *hardlink, - bool origin) + struct ovl_cattr *attr, bool origin) { int err; const struct cred *old_cred; @@ -489,7 +559,7 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, if (override_cred) { override_cred->fsuid = inode->i_uid; override_cred->fsgid = inode->i_gid; - if (!hardlink) { + if (!attr->hardlink) { err = security_dentry_create_files_as(dentry, attr->mode, &dentry->d_name, old_cred, override_cred); @@ -502,21 +572,12 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, put_cred(override_cred); if (!ovl_dentry_is_whiteout(dentry)) - err = ovl_create_upper(dentry, inode, attr, - hardlink); + err = ovl_create_upper(dentry, inode, attr); else - err = ovl_create_over_whiteout(dentry, inode, attr, - hardlink); + err = ovl_create_over_whiteout(dentry, inode, attr); } out_revert_creds: revert_creds(old_cred); - if (!err) { - struct inode *realinode = d_inode(ovl_dentry_upper(dentry)); - - WARN_ON(inode->i_mode != realinode->i_mode); - WARN_ON(!uid_eq(inode->i_uid, realinode->i_uid)); - WARN_ON(!gid_eq(inode->i_gid, realinode->i_gid)); - } return err; } @@ -525,7 +586,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, { int err; struct inode *inode; - struct cattr attr = { + struct ovl_cattr attr = { .rdev = rdev, .link = link, }; @@ -534,6 +595,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, if (err) goto out; + /* Preallocate inode to be used by ovl_get_inode() */ err = -ENOMEM; inode = ovl_new_inode(dentry->d_sb, mode, rdev); if (!inode) @@ -542,8 +604,9 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, inode_init_owner(inode, dentry->d_parent->d_inode, mode); attr.mode = inode->i_mode; - err = ovl_create_or_link(dentry, inode, &attr, NULL, false); - if (err) + err = ovl_create_or_link(dentry, inode, &attr, false); + /* Did we end up using the preallocated inode? */ + if (inode != d_inode(dentry)) iput(inode); out_drop_write: @@ -601,8 +664,9 @@ static int ovl_link(struct dentry *old, struct inode *newdir, inode = d_inode(old); ihold(inode); - err = ovl_create_or_link(new, inode, NULL, ovl_dentry_upper(old), - ovl_type_origin(old)); + err = ovl_create_or_link(new, inode, + &(struct ovl_cattr) {.hardlink = ovl_dentry_upper(old)}, + ovl_type_origin(old)); if (err) iput(inode); diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 425a94672300..9941ece61a14 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -300,12 +300,18 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb, struct dentry *dentry; struct inode *inode; struct ovl_entry *oe; + struct ovl_inode_params oip = { + .lowerpath = lowerpath, + .index = index, + .numlower = !!lower + }; /* We get overlay directory dentries with ovl_lookup_real() */ if (d_is_dir(upper ?: lower)) return ERR_PTR(-EIO); - inode = ovl_get_inode(sb, dget(upper), lowerpath, index, !!lower); + oip.upperdentry = dget(upper); + inode = ovl_get_inode(sb, &oip); if (IS_ERR(inode)) { dput(upper); return ERR_CAST(inode); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 6e3815fb006b..1db5b3b458a1 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -749,15 +749,26 @@ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, return true; } -struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, - struct ovl_path *lowerpath, struct dentry *index, - unsigned int numlower) +static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode, + struct inode *key) { + return newinode ? inode_insert5(newinode, (unsigned long) key, + ovl_inode_test, ovl_inode_set, key) : + iget5_locked(sb, (unsigned long) key, + ovl_inode_test, ovl_inode_set, key); +} + +struct inode *ovl_get_inode(struct super_block *sb, + struct ovl_inode_params *oip) +{ + struct dentry *upperdentry = oip->upperdentry; + struct ovl_path *lowerpath = oip->lowerpath; struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; struct inode *inode; struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; - bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index); - int fsid = bylower ? lowerpath->layer->fsid : 0; + bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, + oip->index); + int fsid = bylower ? oip->lowerpath->layer->fsid : 0; bool is_dir; unsigned long ino = 0; @@ -774,8 +785,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, upperdentry); unsigned int nlink = is_dir ? 1 : realinode->i_nlink; - inode = iget5_locked(sb, (unsigned long) key, - ovl_inode_test, ovl_inode_set, key); + inode = ovl_iget5(sb, oip->newinode, key); if (!inode) goto out_nomem; if (!(inode->i_state & I_NEW)) { @@ -811,12 +821,12 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, if (upperdentry && ovl_is_impuredir(upperdentry)) ovl_set_flag(OVL_IMPURE, inode); - if (index) + if (oip->index) ovl_set_flag(OVL_INDEX, inode); /* Check for non-merge dir that may have whiteouts */ if (is_dir) { - if (((upperdentry && lowerdentry) || numlower > 1) || + if (((upperdentry && lowerdentry) || oip->numlower > 1) || ovl_check_origin_xattr(upperdentry ?: lowerdentry)) { ovl_set_flag(OVL_WHITEOUTS, inode); } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 2dba29eadde6..08801b45df00 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -1004,8 +1004,14 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, upperdentry = dget(index); if (upperdentry || ctr) { - inode = ovl_get_inode(dentry->d_sb, upperdentry, stack, index, - ctr); + struct ovl_inode_params oip = { + .upperdentry = upperdentry, + .lowerpath = stack, + .index = index, + .numlower = ctr, + }; + + inode = ovl_get_inode(dentry->d_sb, &oip); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free_oe; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e0b7de799f6b..3c5e9f18b0d9 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -86,6 +86,7 @@ struct ovl_fh { static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) { int err = vfs_rmdir(dir, dentry); + pr_debug("rmdir(%pd2) = %i\n", dentry, err); return err; } @@ -93,56 +94,52 @@ static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) { int err = vfs_unlink(dir, dentry, NULL); + pr_debug("unlink(%pd2) = %i\n", dentry, err); return err; } static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry, bool debug) + struct dentry *new_dentry) { int err = vfs_link(old_dentry, dir, new_dentry, NULL); - if (debug) { - pr_debug("link(%pd2, %pd2) = %i\n", - old_dentry, new_dentry, err); - } + + pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err); return err; } static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, - umode_t mode, bool debug) + umode_t mode) { int err = vfs_create(dir, dentry, mode, true); - if (debug) - pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); + + pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, - umode_t mode, bool debug) + umode_t mode) { int err = vfs_mkdir(dir, dentry, mode); - if (debug) - pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t dev, bool debug) + umode_t mode, dev_t dev) { int err = vfs_mknod(dir, dentry, mode, dev); - if (debug) { - pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", - dentry, mode, dev, err); - } + + pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); return err; } static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry, - const char *oldname, bool debug) + const char *oldname) { int err = vfs_symlink(dir, dentry, oldname); - if (debug) - pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); + + pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); return err; } @@ -168,11 +165,8 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, { int err; - pr_debug("rename(%pd2, %pd2, 0x%x)\n", - olddentry, newdentry, flags); - + pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags); - if (err) { pr_debug("...rename(%pd2, %pd2, ...) = %i\n", olddentry, newdentry, err); @@ -334,12 +328,18 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); int ovl_update_time(struct inode *inode, struct timespec *ts, int flags); bool ovl_is_private_xattr(const char *name); +struct ovl_inode_params { + struct inode *newinode; + struct dentry *upperdentry; + struct ovl_path *lowerpath; + struct dentry *index; + unsigned int numlower; +}; struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, bool is_upper); -struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, - struct ovl_path *lowerpath, struct dentry *index, - unsigned int numlower); +struct inode *ovl_get_inode(struct super_block *sb, + struct ovl_inode_params *oip); static inline void ovl_copyattr(struct inode *from, struct inode *to) { to->i_uid = from->i_uid; @@ -352,18 +352,21 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to) /* dir.c */ extern const struct inode_operations ovl_dir_inode_operations; -struct dentry *ovl_lookup_temp(struct dentry *workdir); int ovl_cleanup_and_whiteout(struct dentry *workdir, struct inode *dir, struct dentry *dentry); -struct cattr { +struct ovl_cattr { dev_t rdev; umode_t mode; const char *link; + struct dentry *hardlink; }; -int ovl_create_real(struct inode *dir, struct dentry *newdentry, - struct cattr *attr, - struct dentry *hardlink, bool debug); + +#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) }) + +struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct ovl_cattr *attr); int ovl_cleanup(struct inode *dir, struct dentry *dentry); +struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index e8551c97de51..704b37311467 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -611,11 +611,10 @@ retry: goto retry; } - err = ovl_create_real(dir, work, - &(struct cattr){.mode = S_IFDIR | 0}, - NULL, true); - if (err) - goto out_dput; + work = ovl_create_real(dir, work, OVL_CATTR(attr.ia_mode)); + err = PTR_ERR(work); + if (IS_ERR(work)) + goto out_err; /* * Try to remove POSIX ACL xattrs from workdir. We are good if: diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 1ade1206bb89..0eaeb41453f5 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -43,6 +43,21 @@ config PROC_VMCORE help Exports the dump image of crashed kernel in ELF format. +config PROC_VMCORE_DEVICE_DUMP + bool "Device Hardware/Firmware Log Collection" + depends on PROC_VMCORE + default n + help + After kernel panic, device drivers can collect the device + specific snapshot of their hardware or firmware before the + underlying devices are initialized in crash recovery kernel. + Note that the device driver must be present in the crash + recovery kernel's initramfs to collect its underlying device + snapshot. + + If you say Y here, the collected device dumps will be added + as ELF notes to /proc/vmcore. + config PROC_SYSCTL bool "Sysctl support (/proc/sys)" if EXPERT depends on PROC_FS diff --git a/fs/proc/array.c b/fs/proc/array.c index e6d7f41b6684..0ceb3b6b37e7 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -96,22 +96,29 @@ #include <asm/processor.h> #include "internal.h" -static inline void task_name(struct seq_file *m, struct task_struct *p) +void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) { char *buf; size_t size; - char tcomm[sizeof(p->comm)]; + char tcomm[64]; int ret; - get_task_comm(tcomm, p); - - seq_puts(m, "Name:\t"); + if (p->flags & PF_WQ_WORKER) + wq_worker_comm(tcomm, sizeof(tcomm), p); + else + __get_task_comm(tcomm, sizeof(tcomm), p); size = seq_get_buf(m, &buf); - ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); - seq_commit(m, ret < size ? ret : -1); + if (escape) { + ret = string_escape_str(tcomm, buf, size, + ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + if (ret >= size) + ret = -1; + } else { + ret = strscpy(buf, tcomm, size); + } - seq_putc(m, '\n'); + seq_commit(m, ret); } /* @@ -261,7 +268,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) unsigned long flags; sigset_t pending, shpending, blocked, ignored, caught; int num_threads = 0; - unsigned long qsize = 0; + unsigned int qsize = 0; unsigned long qlim = 0; sigemptyset(&pending); @@ -390,7 +397,10 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, { struct mm_struct *mm = get_task_mm(task); - task_name(m, task); + seq_puts(m, "Name:\t"); + proc_task_name(m, task, true); + seq_putc(m, '\n'); + task_state(m, ns, pid, task); if (mm) { @@ -425,7 +435,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, u64 cutime, cstime, utime, stime; u64 cgtime, gtime; unsigned long rsslim = 0; - char tcomm[sizeof(task->comm)]; unsigned long flags; state = *get_task_state(task); @@ -452,8 +461,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, } } - get_task_comm(tcomm, task); - sigemptyset(&sigign); sigemptyset(&sigcatch); cutime = cstime = utime = stime = 0; @@ -520,7 +527,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns)); seq_puts(m, " ("); - seq_puts(m, tcomm); + proc_task_name(m, task, false); seq_puts(m, ") "); seq_putc(m, state); seq_put_decimal_ll(m, " ", ppid); diff --git a/fs/proc/base.c b/fs/proc/base.c index 33ed1746927a..4aa9ce5df02f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -205,171 +205,129 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, - size_t _count, loff_t *pos) +static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf, + size_t count, loff_t *ppos) { - struct task_struct *tsk; - struct mm_struct *mm; - char *page; - unsigned long count = _count; unsigned long arg_start, arg_end, env_start, env_end; - unsigned long len1, len2, len; - unsigned long p; - char c; - ssize_t rv; - - BUG_ON(*pos < 0); + unsigned long pos, len; + char *page; - tsk = get_proc_task(file_inode(file)); - if (!tsk) - return -ESRCH; - mm = get_task_mm(tsk); - put_task_struct(tsk); - if (!mm) - return 0; /* Check if process spawned far enough to have cmdline. */ - if (!mm->env_end) { - rv = 0; - goto out_mmput; - } - - page = (char *)__get_free_page(GFP_KERNEL); - if (!page) { - rv = -ENOMEM; - goto out_mmput; - } + if (!mm->env_end) + return 0; - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); - - BUG_ON(arg_start > arg_end); - BUG_ON(env_start > env_end); + spin_unlock(&mm->arg_lock); - len1 = arg_end - arg_start; - len2 = env_end - env_start; + if (arg_start >= arg_end) + return 0; - /* Empty ARGV. */ - if (len1 == 0) { - rv = 0; - goto out_free_page; - } /* - * Inherently racy -- command line shares address space - * with code and data. + * We have traditionally allowed the user to re-write + * the argument strings and overflow the end result + * into the environment section. But only do that if + * the environment area is contiguous to the arguments. */ - rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON); - if (rv <= 0) - goto out_free_page; - - rv = 0; - - if (c == '\0') { - /* Command line (set of strings) occupies whole ARGV. */ - if (len1 <= *pos) - goto out_free_page; - - p = arg_start + *pos; - len = len1 - *pos; - while (count > 0 && len > 0) { - unsigned int _count; - int nr_read; - - _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); - if (nr_read < 0) - rv = nr_read; - if (nr_read <= 0) - goto out_free_page; - - if (copy_to_user(buf, page, nr_read)) { - rv = -EFAULT; - goto out_free_page; - } + if (env_start != arg_end || env_start >= env_end) + env_start = env_end = arg_end; - p += nr_read; - len -= nr_read; - buf += nr_read; - count -= nr_read; - rv += nr_read; - } - } else { - /* - * Command line (1 string) occupies ARGV and - * extends into ENVP. - */ - struct { - unsigned long p; - unsigned long len; - } cmdline[2] = { - { .p = arg_start, .len = len1 }, - { .p = env_start, .len = len2 }, - }; - loff_t pos1 = *pos; - unsigned int i; + /* We're not going to care if "*ppos" has high bits set */ + pos = arg_start + *ppos; + + /* .. but we do check the result is in the proper range */ + if (pos < arg_start || pos >= env_end) + return 0; + + /* .. and we never go past env_end */ + if (env_end - pos < count) + count = env_end - pos; + + page = (char *)__get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + len = 0; + while (count) { + int got; + size_t size = min_t(size_t, PAGE_SIZE, count); + + got = access_remote_vm(mm, pos, page, size, FOLL_ANON); + if (got <= 0) + break; + + /* Don't walk past a NUL character once you hit arg_end */ + if (pos + got >= arg_end) { + int n = 0; - i = 0; - while (i < 2 && pos1 >= cmdline[i].len) { - pos1 -= cmdline[i].len; - i++; + /* + * If we started before 'arg_end' but ended up + * at or after it, we start the NUL character + * check at arg_end-1 (where we expect the normal + * EOF to be). + * + * NOTE! This is smaller than 'got', because + * pos + got >= arg_end + */ + if (pos < arg_end) + n = arg_end - pos - 1; + + /* Cut off at first NUL after 'n' */ + got = n + strnlen(page+n, got-n); + if (!got) + break; } - while (i < 2) { - p = cmdline[i].p + pos1; - len = cmdline[i].len - pos1; - while (count > 0 && len > 0) { - unsigned int _count, l; - int nr_read; - bool final; - - _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); - if (nr_read < 0) - rv = nr_read; - if (nr_read <= 0) - goto out_free_page; - - /* - * Command line can be shorter than whole ARGV - * even if last "marker" byte says it is not. - */ - final = false; - l = strnlen(page, nr_read); - if (l < nr_read) { - nr_read = l; - final = true; - } - - if (copy_to_user(buf, page, nr_read)) { - rv = -EFAULT; - goto out_free_page; - } - - p += nr_read; - len -= nr_read; - buf += nr_read; - count -= nr_read; - rv += nr_read; - - if (final) - goto out_free_page; - } - /* Only first chunk can be read partially. */ - pos1 = 0; - i++; + got -= copy_to_user(buf, page, got); + if (unlikely(!got)) { + if (!len) + len = -EFAULT; + break; } + pos += got; + buf += got; + len += got; + count -= got; } -out_free_page: free_page((unsigned long)page); -out_mmput: + return len; +} + +static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf, + size_t count, loff_t *pos) +{ + struct mm_struct *mm; + ssize_t ret; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + ret = get_mm_cmdline(mm, buf, count, pos); mmput(mm); - if (rv > 0) - *pos += rv; - return rv; + return ret; +} + +static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct task_struct *tsk; + ssize_t ret; + + BUG_ON(*pos < 0); + + tsk = get_proc_task(file_inode(file)); + if (!tsk) + return -ESRCH; + ret = get_task_cmdline(tsk, buf, count, pos); + put_task_struct(tsk); + if (ret > 0) + *pos += ret; + return ret; } static const struct file_operations proc_pid_cmdline_ops = { @@ -430,7 +388,6 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, struct stack_trace trace; unsigned long *entries; int err; - int i; entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); if (!entries) @@ -443,6 +400,8 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, err = lock_trace(task); if (!err) { + unsigned int i; + save_stack_trace_tsk(task, &trace); for (i = 0; i < trace.nr_entries; i++) { @@ -927,10 +886,10 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!mmget_not_zero(mm)) goto free; - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); while (count > 0) { size_t this_len, max_len; @@ -1563,9 +1522,8 @@ static int comm_show(struct seq_file *m, void *v) if (!p) return -ESRCH; - task_lock(p); - seq_printf(m, "%s\n", p->comm); - task_unlock(p); + proc_task_name(m, p, false); + seq_putc(m, '\n'); put_task_struct(p); @@ -1785,9 +1743,9 @@ int pid_getattr(const struct path *path, struct kstat *stat, generic_fillattr(inode, stat); - rcu_read_lock(); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; + rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) { @@ -1876,7 +1834,7 @@ const struct dentry_operations pid_dentry_operations = * by stat. */ bool proc_fill_cache(struct file *file, struct dir_context *ctx, - const char *name, int len, + const char *name, unsigned int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = file->f_path.dentry; @@ -1895,19 +1853,19 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, struct dentry *res; res = instantiate(child, task, ptr); d_lookup_done(child); - if (IS_ERR(res)) - goto end_instantiate; if (unlikely(res)) { dput(child); child = res; + if (IS_ERR(child)) + goto end_instantiate; } } } inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; -end_instantiate: dput(child); +end_instantiate: return dir_emit(ctx, name, len, ino, type); } @@ -3252,7 +3210,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { char name[10 + 1]; - int len; + unsigned int len; cond_resched(); if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE)) @@ -3579,7 +3537,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) task; task = next_tid(task), ctx->pos++) { char name[10 + 1]; - int len; + unsigned int len; tid = task_pid_nr_ns(task, ns); len = snprintf(name, sizeof(name), "%u", tid); if (!proc_fill_cache(file, ctx, name, len, diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 05b9893e9a22..81882a13212d 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -248,7 +248,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, struct file *f; struct fd_data data; char name[10 + 1]; - int len; + unsigned int len; f = fcheck_files(files, fd); if (!f) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 43c70c9e6b62..50cb22a08c2f 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -136,6 +136,8 @@ unsigned name_to_int(const struct qstr *qstr); */ extern const struct file_operations proc_tid_children_operations; +extern void proc_task_name(struct seq_file *m, struct task_struct *p, + bool escape); extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, @@ -161,7 +163,7 @@ extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ typedef struct dentry *instantiate_t(struct dentry *, struct task_struct *, const void *); -extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int, +bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, instantiate_t, struct task_struct *, const void *); /* diff --git a/fs/proc/page.c b/fs/proc/page.c index 1491918a33c3..792c78a49174 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -154,6 +154,8 @@ u64 stable_page_flags(struct page *page) if (PageBalloon(page)) u |= 1 << KPF_BALLOON; + if (PageTable(page)) + u |= 1 << KPF_PGTABLE; if (page_is_idle(page)) u |= 1 << KPF_IDLE; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a20c6e495bb2..597969db9e90 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -18,6 +18,7 @@ #include <linux/page_idle.h> #include <linux/shmem_fs.h> #include <linux/uaccess.h> +#include <linux/pkeys.h> #include <asm/elf.h> #include <asm/tlb.h> @@ -673,13 +674,16 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MERGEABLE)] = "mg", [ilog2(VM_UFFD_MISSING)]= "um", [ilog2(VM_UFFD_WP)] = "uw", -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +#ifdef CONFIG_ARCH_HAS_PKEYS /* These come out via ProtectionKey: */ [ilog2(VM_PKEY_BIT0)] = "", [ilog2(VM_PKEY_BIT1)] = "", [ilog2(VM_PKEY_BIT2)] = "", [ilog2(VM_PKEY_BIT3)] = "", +#if VM_PKEY_BIT4 + [ilog2(VM_PKEY_BIT4)] = "", #endif +#endif /* CONFIG_ARCH_HAS_PKEYS */ }; size_t i; @@ -727,10 +731,6 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, } #endif /* HUGETLB_PAGE */ -void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) -{ -} - #define SEQ_PUT_DEC(str, val) \ seq_put_decimal_ull_width(m, str, (val) >> 10, 8) static int show_smap(struct seq_file *m, void *v, int is_pid) @@ -835,7 +835,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) seq_puts(m, " kB\n"); } if (!rollup_mode) { - arch_show_smap(m, vma); + if (arch_pkeys_enabled()) + seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); show_smap_vma_flags(m, vma); } m_cache_vma(m, vma); @@ -1258,8 +1259,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; entry = pte_to_swp_entry(pte); - frame = swp_type(entry) | - (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + if (pm->show_pfn) + frame = swp_type(entry) | + (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; if (is_migration_entry(entry)) page = migration_entry_to_page(entry); @@ -1310,11 +1312,14 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION else if (is_swap_pmd(pmd)) { swp_entry_t entry = pmd_to_swp_entry(pmd); - unsigned long offset = swp_offset(entry); + unsigned long offset; - offset += (addr & ~PMD_MASK) >> PAGE_SHIFT; - frame = swp_type(entry) | - (offset << MAX_SWAPFILES_SHIFT); + if (pm->show_pfn) { + offset = swp_offset(entry) + + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + frame = swp_type(entry) | + (offset << MAX_SWAPFILES_SHIFT); + } flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; @@ -1332,10 +1337,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, err = add_to_pagemap(addr, &pme, pm); if (err) break; - if (pm->show_pfn && (flags & PM_PRESENT)) - frame++; - else if (flags & PM_SWAP) - frame += (1 << MAX_SWAPFILES_SHIFT); + if (pm->show_pfn) { + if (flags & PM_PRESENT) + frame++; + else if (flags & PM_SWAP) + frame += (1 << MAX_SWAPFILES_SHIFT); + } } spin_unlock(ptl); return err; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a45f0af22a60..cfb6674331fd 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -20,6 +20,7 @@ #include <linux/init.h> #include <linux/crash_dump.h> #include <linux/list.h> +#include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/uaccess.h> @@ -38,12 +39,23 @@ static size_t elfcorebuf_sz_orig; static char *elfnotes_buf; static size_t elfnotes_sz; +/* Size of all notes minus the device dump notes */ +static size_t elfnotes_orig_sz; /* Total size of vmcore file. */ static u64 vmcore_size; static struct proc_dir_entry *proc_vmcore; +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +/* Device Dump list and mutex to synchronize access to list */ +static LIST_HEAD(vmcoredd_list); +static DEFINE_MUTEX(vmcoredd_mutex); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + +/* Device Dump Size */ +static size_t vmcoredd_orig_sz; + /* * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error * The called function has to take care of module refcounting. @@ -178,6 +190,77 @@ static int copy_to(void *target, void *src, size_t size, int userbuf) return 0; } +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf) +{ + struct vmcoredd_node *dump; + u64 offset = 0; + int ret = 0; + size_t tsz; + char *buf; + + mutex_lock(&vmcoredd_mutex); + list_for_each_entry(dump, &vmcoredd_list, list) { + if (start < offset + dump->size) { + tsz = min(offset + (u64)dump->size - start, (u64)size); + buf = dump->buf + start - offset; + if (copy_to(dst, buf, tsz, userbuf)) { + ret = -EFAULT; + goto out_unlock; + } + + size -= tsz; + start += tsz; + dst += tsz; + + /* Leave now if buffer filled already */ + if (!size) + goto out_unlock; + } + offset += dump->size; + } + +out_unlock: + mutex_unlock(&vmcoredd_mutex); + return ret; +} + +static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, + u64 start, size_t size) +{ + struct vmcoredd_node *dump; + u64 offset = 0; + int ret = 0; + size_t tsz; + char *buf; + + mutex_lock(&vmcoredd_mutex); + list_for_each_entry(dump, &vmcoredd_list, list) { + if (start < offset + dump->size) { + tsz = min(offset + (u64)dump->size - start, (u64)size); + buf = dump->buf + start - offset; + if (remap_vmalloc_range_partial(vma, dst, buf, tsz)) { + ret = -EFAULT; + goto out_unlock; + } + + size -= tsz; + start += tsz; + dst += tsz; + + /* Leave now if buffer filled already */ + if (!size) + goto out_unlock; + } + offset += dump->size; + } + +out_unlock: + mutex_unlock(&vmcoredd_mutex); + return ret; +} +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. */ @@ -215,10 +298,41 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, if (*fpos < elfcorebuf_sz + elfnotes_sz) { void *kaddr; + /* We add device dumps before other elf notes because the + * other elf notes may not fill the elf notes buffer + * completely and we will end up with zero-filled data + * between the elf notes and the device dumps. Tools will + * then try to decode this zero-filled data as valid notes + * and we don't want that. Hence, adding device dumps before + * the other elf notes ensure that zero-filled data can be + * avoided. + */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + /* Read device dumps */ + if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) { + tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - + (size_t)*fpos, buflen); + start = *fpos - elfcorebuf_sz; + if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf)) + return -EFAULT; + + buflen -= tsz; + *fpos += tsz; + buffer += tsz; + acc += tsz; + + /* leave now if filled buffer already */ + if (!buflen) + return acc; + } +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + + /* Read remaining elf notes */ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); - kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; + kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz; if (copy_to(buffer, kaddr, tsz, userbuf)) return -EFAULT; + buflen -= tsz; *fpos += tsz; buffer += tsz; @@ -302,10 +416,8 @@ static const struct vm_operations_struct vmcore_mmap_ops = { }; /** - * alloc_elfnotes_buf - allocate buffer for ELF note segment in - * vmalloc memory - * - * @notes_sz: size of buffer + * vmcore_alloc_buf - allocate buffer in vmalloc memory + * @sizez: size of buffer * * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap * the buffer to user-space by means of remap_vmalloc_range(). @@ -313,12 +425,12 @@ static const struct vm_operations_struct vmcore_mmap_ops = { * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is * disabled and there's no need to allow users to mmap the buffer. */ -static inline char *alloc_elfnotes_buf(size_t notes_sz) +static inline char *vmcore_alloc_buf(size_t size) { #ifdef CONFIG_MMU - return vmalloc_user(notes_sz); + return vmalloc_user(size); #else - return vzalloc(notes_sz); + return vzalloc(size); #endif } @@ -446,11 +558,46 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) if (start < elfcorebuf_sz + elfnotes_sz) { void *kaddr; + /* We add device dumps before other elf notes because the + * other elf notes may not fill the elf notes buffer + * completely and we will end up with zero-filled data + * between the elf notes and the device dumps. Tools will + * then try to decode this zero-filled data as valid notes + * and we don't want that. Hence, adding device dumps before + * the other elf notes ensure that zero-filled data can be + * avoided. This also ensures that the device dumps and + * other elf notes can be properly mmaped at page aligned + * address. + */ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + /* Read device dumps */ + if (start < elfcorebuf_sz + vmcoredd_orig_sz) { + u64 start_off; + + tsz = min(elfcorebuf_sz + vmcoredd_orig_sz - + (size_t)start, size); + start_off = start - elfcorebuf_sz; + if (vmcoredd_mmap_dumps(vma, vma->vm_start + len, + start_off, tsz)) + goto fail; + + size -= tsz; + start += tsz; + len += tsz; + + /* leave now if filled buffer already */ + if (!size) + return 0; + } +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + + /* Read remaining elf notes */ tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size); - kaddr = elfnotes_buf + start - elfcorebuf_sz; + kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz; if (remap_vmalloc_range_partial(vma, vma->vm_start + len, kaddr, tsz)) goto fail; + size -= tsz; start += tsz; len += tsz; @@ -502,8 +649,8 @@ static struct vmcore* __init get_new_element(void) return kzalloc(sizeof(struct vmcore), GFP_KERNEL); } -static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz, - struct list_head *vc_list) +static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz, + struct list_head *vc_list) { u64 size; struct vmcore *m; @@ -665,7 +812,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, return rc; *notes_sz = roundup(phdr_sz, PAGE_SIZE); - *notes_buf = alloc_elfnotes_buf(*notes_sz); + *notes_buf = vmcore_alloc_buf(*notes_sz); if (!*notes_buf) return -ENOMEM; @@ -698,6 +845,11 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz, /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + /* Store the size of all notes. We need this to update the note + * header when the device dumps will be added. + */ + elfnotes_orig_sz = phdr.p_memsz; + return 0; } @@ -851,7 +1003,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, return rc; *notes_sz = roundup(phdr_sz, PAGE_SIZE); - *notes_buf = alloc_elfnotes_buf(*notes_sz); + *notes_buf = vmcore_alloc_buf(*notes_sz); if (!*notes_buf) return -ENOMEM; @@ -884,6 +1036,11 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz, /* Modify e_phnum to reflect merged headers. */ ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1; + /* Store the size of all notes. We need this to update the note + * header when the device dumps will be added. + */ + elfnotes_orig_sz = phdr.p_memsz; + return 0; } @@ -976,8 +1133,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr, } /* Sets offset fields of vmcore elements. */ -static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, - struct list_head *vc_list) +static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz, + struct list_head *vc_list) { loff_t vmcore_off; struct vmcore *m; @@ -1145,6 +1302,202 @@ static int __init parse_crash_elf_headers(void) return 0; } +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP +/** + * vmcoredd_write_header - Write vmcore device dump header at the + * beginning of the dump's buffer. + * @buf: Output buffer where the note is written + * @data: Dump info + * @size: Size of the dump + * + * Fills beginning of the dump's buffer with vmcore device dump header. + */ +static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data, + u32 size) +{ + struct vmcoredd_header *vdd_hdr = (struct vmcoredd_header *)buf; + + vdd_hdr->n_namesz = sizeof(vdd_hdr->name); + vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name); + vdd_hdr->n_type = NT_VMCOREDD; + + strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME, + sizeof(vdd_hdr->name)); + memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name)); +} + +/** + * vmcoredd_update_program_headers - Update all Elf program headers + * @elfptr: Pointer to elf header + * @elfnotesz: Size of elf notes aligned to page size + * @vmcoreddsz: Size of device dumps to be added to elf note header + * + * Determine type of Elf header (Elf64 or Elf32) and update the elf note size. + * Also update the offsets of all the program headers after the elf note header. + */ +static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz, + size_t vmcoreddsz) +{ + unsigned char *e_ident = (unsigned char *)elfptr; + u64 start, end, size; + loff_t vmcore_off; + u32 i; + + vmcore_off = elfcorebuf_sz + elfnotesz; + + if (e_ident[EI_CLASS] == ELFCLASS64) { + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfptr; + Elf64_Phdr *phdr = (Elf64_Phdr *)(elfptr + sizeof(Elf64_Ehdr)); + + /* Update all program headers */ + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type == PT_NOTE) { + /* Update note size */ + phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz; + phdr->p_filesz = phdr->p_memsz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, + PAGE_SIZE); + size = end - start; + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off += size; + } + } else { + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfptr; + Elf32_Phdr *phdr = (Elf32_Phdr *)(elfptr + sizeof(Elf32_Ehdr)); + + /* Update all program headers */ + for (i = 0; i < ehdr->e_phnum; i++, phdr++) { + if (phdr->p_type == PT_NOTE) { + /* Update note size */ + phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz; + phdr->p_filesz = phdr->p_memsz; + continue; + } + + start = rounddown(phdr->p_offset, PAGE_SIZE); + end = roundup(phdr->p_offset + phdr->p_memsz, + PAGE_SIZE); + size = end - start; + phdr->p_offset = vmcore_off + (phdr->p_offset - start); + vmcore_off += size; + } + } +} + +/** + * vmcoredd_update_size - Update the total size of the device dumps and update + * Elf header + * @dump_size: Size of the current device dump to be added to total size + * + * Update the total size of all the device dumps and update the Elf program + * headers. Calculate the new offsets for the vmcore list and update the + * total vmcore size. + */ +static void vmcoredd_update_size(size_t dump_size) +{ + vmcoredd_orig_sz += dump_size; + elfnotes_sz = roundup(elfnotes_orig_sz, PAGE_SIZE) + vmcoredd_orig_sz; + vmcoredd_update_program_headers(elfcorebuf, elfnotes_sz, + vmcoredd_orig_sz); + + /* Update vmcore list offsets */ + set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list); + + vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz, + &vmcore_list); + proc_vmcore->size = vmcore_size; +} + +/** + * vmcore_add_device_dump - Add a buffer containing device dump to vmcore + * @data: dump info. + * + * Allocate a buffer and invoke the calling driver's dump collect routine. + * Write Elf note at the beginning of the buffer to indicate vmcore device + * dump and add the dump to global list. + */ +int vmcore_add_device_dump(struct vmcoredd_data *data) +{ + struct vmcoredd_node *dump; + void *buf = NULL; + size_t data_size; + int ret; + + if (!data || !strlen(data->dump_name) || + !data->vmcoredd_callback || !data->size) + return -EINVAL; + + dump = vzalloc(sizeof(*dump)); + if (!dump) { + ret = -ENOMEM; + goto out_err; + } + + /* Keep size of the buffer page aligned so that it can be mmaped */ + data_size = roundup(sizeof(struct vmcoredd_header) + data->size, + PAGE_SIZE); + + /* Allocate buffer for driver's to write their dumps */ + buf = vmcore_alloc_buf(data_size); + if (!buf) { + ret = -ENOMEM; + goto out_err; + } + + vmcoredd_write_header(buf, data, data_size - + sizeof(struct vmcoredd_header)); + + /* Invoke the driver's dump collection routing */ + ret = data->vmcoredd_callback(data, buf + + sizeof(struct vmcoredd_header)); + if (ret) + goto out_err; + + dump->buf = buf; + dump->size = data_size; + + /* Add the dump to driver sysfs list */ + mutex_lock(&vmcoredd_mutex); + list_add_tail(&dump->list, &vmcoredd_list); + mutex_unlock(&vmcoredd_mutex); + + vmcoredd_update_size(data_size); + return 0; + +out_err: + if (buf) + vfree(buf); + + if (dump) + vfree(dump); + + return ret; +} +EXPORT_SYMBOL(vmcore_add_device_dump); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ + +/* Free all dumps in vmcore device dump list */ +static void vmcore_free_device_dumps(void) +{ +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP + mutex_lock(&vmcoredd_mutex); + while (!list_empty(&vmcoredd_list)) { + struct vmcoredd_node *dump; + + dump = list_first_entry(&vmcoredd_list, struct vmcoredd_node, + list); + list_del(&dump->list); + vfree(dump->buf); + vfree(dump); + } + mutex_unlock(&vmcoredd_mutex); +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ +} + /* Init function for vmcore module. */ static int __init vmcore_init(void) { @@ -1192,4 +1545,7 @@ void vmcore_cleanup(void) kfree(m); } free_elfcorebuf(); + + /* clear vmcore device dump list */ + vmcore_free_device_dumps(); } diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 616a688f5d8f..55c508fe8131 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -24,14 +24,6 @@ static bool ubifs_crypt_empty_dir(struct inode *inode) return ubifs_check_dir_empty(inode) == 0; } -static unsigned int ubifs_crypt_max_namelen(struct inode *inode) -{ - if (S_ISLNK(inode->i_mode)) - return UBIFS_MAX_INO_DATA; - else - return UBIFS_MAX_NLEN; -} - int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, unsigned int in_len, unsigned int *out_len, int block) { @@ -89,5 +81,5 @@ const struct fscrypt_operations ubifs_crypt_operations = { .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, .empty_dir = ubifs_crypt_empty_dir, - .max_namelen = ubifs_crypt_max_namelen, + .max_namelen = UBIFS_MAX_NLEN, }; diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig index c6e17a744c3b..aa415054ad0a 100644 --- a/fs/udf/Kconfig +++ b/fs/udf/Kconfig @@ -1,6 +1,7 @@ config UDF_FS tristate "UDF file system support" select CRC_ITU_T + select NLS help This is a file system used on some CD-ROMs and DVDs. Since the file system is supported by multiple operating systems and is more @@ -13,8 +14,3 @@ config UDF_FS module will be called udf. If unsure, say N. - -config UDF_NLS - bool - default y - depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y) diff --git a/fs/udf/super.c b/fs/udf/super.c index 7949c338efa5..0d27d41f5c6e 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -572,7 +572,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt, case Opt_utf8: uopt->flags |= (1 << UDF_FLAG_UTF8); break; -#ifdef CONFIG_UDF_NLS case Opt_iocharset: if (!remount) { if (uopt->nls_map) @@ -581,7 +580,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt, uopt->flags |= (1 << UDF_FLAG_NLS_MAP); } break; -#endif case Opt_uforget: uopt->flags |= (1 << UDF_FLAG_UID_FORGET); break; @@ -892,14 +890,14 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) #endif } - ret = udf_dstrCS0toUTF8(outstr, 31, pvoldesc->volIdent, 32); + ret = udf_dstrCS0toChar(sb, outstr, 31, pvoldesc->volIdent, 32); if (ret < 0) goto out_bh; strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret); udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident); - ret = udf_dstrCS0toUTF8(outstr, 127, pvoldesc->volSetIdent, 128); + ret = udf_dstrCS0toChar(sb, outstr, 127, pvoldesc->volSetIdent, 128); if (ret < 0) goto out_bh; @@ -2117,7 +2115,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) udf_err(sb, "utf8 cannot be combined with iocharset\n"); goto parse_options_failure; } -#ifdef CONFIG_UDF_NLS if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) { uopt.nls_map = load_nls_default(); if (!uopt.nls_map) @@ -2125,7 +2122,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) else udf_debug("Using default NLS map\n"); } -#endif if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP))) uopt.flags |= (1 << UDF_FLAG_UTF8); @@ -2279,10 +2275,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) error_out: iput(sbi->s_vat_inode); parse_options_failure: -#ifdef CONFIG_UDF_NLS if (uopt.nls_map) unload_nls(uopt.nls_map); -#endif if (lvid_open) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); @@ -2332,10 +2326,8 @@ static void udf_put_super(struct super_block *sb) sbi = UDF_SB(sb); iput(sbi->s_vat_inode); -#ifdef CONFIG_UDF_NLS if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) unload_nls(sbi->s_nls_map); -#endif if (!sb_rdonly(sb)) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 68e8a64d22e0..fc8d1b3384d2 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -220,7 +220,8 @@ extern int udf_get_filename(struct super_block *, const uint8_t *, int, uint8_t *, int); extern int udf_put_filename(struct super_block *, const uint8_t *, int, uint8_t *, int); -extern int udf_dstrCS0toUTF8(uint8_t *, int, const uint8_t *, int); +extern int udf_dstrCS0toChar(struct super_block *, uint8_t *, int, + const uint8_t *, int); /* ialloc.c */ extern void udf_free_inode(struct inode *); diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 16a8ad21b77e..45234791fec2 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -28,101 +28,64 @@ #include "udf_sb.h" +#define PLANE_SIZE 0x10000 +#define UNICODE_MAX 0x10ffff #define SURROGATE_MASK 0xfffff800 #define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_CHAR_BITS 10 +#define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1) -static int udf_uni2char_utf8(wchar_t uni, - unsigned char *out, - int boundlen) -{ - int u_len = 0; - - if (boundlen <= 0) - return -ENAMETOOLONG; +#define ILLEGAL_CHAR_MARK '_' +#define EXT_MARK '.' +#define CRC_MARK '#' +#define EXT_SIZE 5 +/* Number of chars we need to store generated CRC to make filename unique */ +#define CRC_LEN 5 - if ((uni & SURROGATE_MASK) == SURROGATE_PAIR) - return -EINVAL; +static unicode_t get_utf16_char(const uint8_t *str_i, int str_i_max_len, + int str_i_idx, int u_ch, unicode_t *ret) +{ + unicode_t c; + int start_idx = str_i_idx; + + /* Expand OSTA compressed Unicode to Unicode */ + c = str_i[str_i_idx++]; + if (u_ch > 1) + c = (c << 8) | str_i[str_i_idx++]; + if ((c & SURROGATE_MASK) == SURROGATE_PAIR) { + unicode_t next; + + /* Trailing surrogate char */ + if (str_i_idx >= str_i_max_len) { + c = UNICODE_MAX + 1; + goto out; + } - if (uni < 0x80) { - out[u_len++] = (unsigned char)uni; - } else if (uni < 0x800) { - if (boundlen < 2) - return -ENAMETOOLONG; - out[u_len++] = (unsigned char)(0xc0 | (uni >> 6)); - out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f)); - } else { - if (boundlen < 3) - return -ENAMETOOLONG; - out[u_len++] = (unsigned char)(0xe0 | (uni >> 12)); - out[u_len++] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f)); - out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f)); - } - return u_len; -} + /* Low surrogate must follow the high one... */ + if (c & SURROGATE_LOW) { + c = UNICODE_MAX + 1; + goto out; + } -static int udf_char2uni_utf8(const unsigned char *in, - int boundlen, - wchar_t *uni) -{ - unsigned int utf_char; - unsigned char c; - int utf_cnt, u_len; - - utf_char = 0; - utf_cnt = 0; - for (u_len = 0; u_len < boundlen;) { - c = in[u_len++]; - - /* Complete a multi-byte UTF-8 character */ - if (utf_cnt) { - utf_char = (utf_char << 6) | (c & 0x3f); - if (--utf_cnt) - continue; - } else { - /* Check for a multi-byte UTF-8 character */ - if (c & 0x80) { - /* Start a multi-byte UTF-8 character */ - if ((c & 0xe0) == 0xc0) { - utf_char = c & 0x1f; - utf_cnt = 1; - } else if ((c & 0xf0) == 0xe0) { - utf_char = c & 0x0f; - utf_cnt = 2; - } else if ((c & 0xf8) == 0xf0) { - utf_char = c & 0x07; - utf_cnt = 3; - } else if ((c & 0xfc) == 0xf8) { - utf_char = c & 0x03; - utf_cnt = 4; - } else if ((c & 0xfe) == 0xfc) { - utf_char = c & 0x01; - utf_cnt = 5; - } else { - utf_cnt = -1; - break; - } - continue; - } else { - /* Single byte UTF-8 character (most common) */ - utf_char = c; - } + WARN_ON_ONCE(u_ch != 2); + next = str_i[str_i_idx++] << 8; + next |= str_i[str_i_idx++]; + if ((next & SURROGATE_MASK) != SURROGATE_PAIR || + !(next & SURROGATE_LOW)) { + c = UNICODE_MAX + 1; + goto out; } - *uni = utf_char; - break; - } - if (utf_cnt) { - *uni = '?'; - return -EINVAL; + + c = PLANE_SIZE + + ((c & SURROGATE_CHAR_MASK) << SURROGATE_CHAR_BITS) + + (next & SURROGATE_CHAR_MASK); } - return u_len; +out: + *ret = c; + return str_i_idx - start_idx; } -#define ILLEGAL_CHAR_MARK '_' -#define EXT_MARK '.' -#define CRC_MARK '#' -#define EXT_SIZE 5 -/* Number of chars we need to store generated CRC to make filename unique */ -#define CRC_LEN 5 static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, int *str_o_idx, @@ -132,27 +95,29 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, int (*conv_f)(wchar_t, unsigned char *, int), int translate) { - uint32_t c; + unicode_t c; int illChar = 0; int len, gotch = 0; - for (; (!gotch) && (*str_i_idx < str_i_max_len); *str_i_idx += u_ch) { + while (!gotch && *str_i_idx < str_i_max_len) { if (*str_o_idx >= str_o_max_len) { *needsCRC = 1; return gotch; } - /* Expand OSTA compressed Unicode to Unicode */ - c = str_i[*str_i_idx]; - if (u_ch > 1) - c = (c << 8) | str_i[*str_i_idx + 1]; - - if (translate && (c == '/' || c == 0)) + len = get_utf16_char(str_i, str_i_max_len, *str_i_idx, u_ch, + &c); + /* These chars cannot be converted. Replace them. */ + if (c == 0 || c > UNICODE_MAX || (conv_f && c > MAX_WCHAR_T) || + (translate && c == '/')) { illChar = 1; - else if (illChar) + if (!translate) + gotch = 1; + } else if (illChar) break; else gotch = 1; + *str_i_idx += len; } if (illChar) { *needsCRC = 1; @@ -160,7 +125,15 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, gotch = 1; } if (gotch) { - len = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx); + if (conv_f) { + len = conv_f(c, &str_o[*str_o_idx], + str_o_max_len - *str_o_idx); + } else { + len = utf32_to_utf8(c, &str_o[*str_o_idx], + str_o_max_len - *str_o_idx); + if (len < 0) + len = -ENAMETOOLONG; + } /* Valid character? */ if (len >= 0) *str_o_idx += len; @@ -168,16 +141,16 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, *needsCRC = 1; gotch = 0; } else { - str_o[(*str_o_idx)++] = '?'; + str_o[(*str_o_idx)++] = ILLEGAL_CHAR_MARK; *needsCRC = 1; } } return gotch; } -static int udf_name_from_CS0(uint8_t *str_o, int str_max_len, +static int udf_name_from_CS0(struct super_block *sb, + uint8_t *str_o, int str_max_len, const uint8_t *ocu, int ocu_len, - int (*conv_f)(wchar_t, unsigned char *, int), int translate) { uint32_t c; @@ -194,6 +167,7 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len, unsigned short valueCRC; uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1]; uint8_t crc[CRC_LEN]; + int (*conv_f)(wchar_t, unsigned char *, int); if (str_max_len <= 0) return 0; @@ -203,6 +177,11 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len, return 0; } + if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + conv_f = UDF_SB(sb)->s_nls_map->uni2char; + else + conv_f = NULL; + cmp_id = ocu[0]; if (cmp_id != 8 && cmp_id != 16) { memset(str_o, 0, str_max_len); @@ -293,18 +272,24 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len, return str_o_len; } -static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len, - const uint8_t *str_i, int str_len, - int (*conv_f)(const unsigned char *, int, wchar_t *)) +static int udf_name_to_CS0(struct super_block *sb, + uint8_t *ocu, int ocu_max_len, + const uint8_t *str_i, int str_len) { int i, len; unsigned int max_val; - wchar_t uni_char; int u_len, u_ch; + unicode_t uni_char; + int (*conv_f)(const unsigned char *, int, wchar_t *); if (ocu_max_len <= 0) return 0; + if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + conv_f = UDF_SB(sb)->s_nls_map->char2uni; + else + conv_f = NULL; + memset(ocu, 0, ocu_max_len); ocu[0] = 8; max_val = 0xff; @@ -312,36 +297,61 @@ static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len, try_again: u_len = 1; - for (i = 0; i < str_len; i++) { + for (i = 0; i < str_len; i += len) { /* Name didn't fit? */ if (u_len + u_ch > ocu_max_len) return 0; - len = conv_f(&str_i[i], str_len - i, &uni_char); - if (!len) - continue; + if (conv_f) { + wchar_t wchar; + + len = conv_f(&str_i[i], str_len - i, &wchar); + if (len > 0) + uni_char = wchar; + } else { + len = utf8_to_utf32(&str_i[i], str_len - i, + &uni_char); + } /* Invalid character, deal with it */ - if (len < 0) { + if (len <= 0 || uni_char > UNICODE_MAX) { len = 1; uni_char = '?'; } if (uni_char > max_val) { - max_val = 0xffff; - ocu[0] = 0x10; - u_ch = 2; - goto try_again; + unicode_t c; + + if (max_val == 0xff) { + max_val = 0xffff; + ocu[0] = 0x10; + u_ch = 2; + goto try_again; + } + /* + * Use UTF-16 encoding for chars outside we + * cannot encode directly. + */ + if (u_len + 2 * u_ch > ocu_max_len) + return 0; + + uni_char -= PLANE_SIZE; + c = SURROGATE_PAIR | + ((uni_char >> SURROGATE_CHAR_BITS) & + SURROGATE_CHAR_MASK); + ocu[u_len++] = (uint8_t)(c >> 8); + ocu[u_len++] = (uint8_t)(c & 0xff); + uni_char = SURROGATE_PAIR | SURROGATE_LOW | + (uni_char & SURROGATE_CHAR_MASK); } if (max_val == 0xffff) ocu[u_len++] = (uint8_t)(uni_char >> 8); ocu[u_len++] = (uint8_t)(uni_char & 0xff); - i += len - 1; } return u_len; } -int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len, +int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len, const uint8_t *ocu_i, int i_len) { int s_len = 0; @@ -355,14 +365,12 @@ int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len, } } - return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len, - udf_uni2char_utf8, 0); + return udf_name_from_CS0(sb, utf_o, o_len, ocu_i, s_len, 0); } int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen, uint8_t *dname, int dlen) { - int (*conv_f)(wchar_t, unsigned char *, int); int ret; if (!slen) @@ -371,14 +379,7 @@ int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen, if (dlen <= 0) return 0; - if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { - conv_f = udf_uni2char_utf8; - } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { - conv_f = UDF_SB(sb)->s_nls_map->uni2char; - } else - BUG(); - - ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1); + ret = udf_name_from_CS0(sb, dname, dlen, sname, slen, 1); /* Zero length filename isn't valid... */ if (ret == 0) ret = -EINVAL; @@ -388,15 +389,6 @@ int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen, int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen, uint8_t *dname, int dlen) { - int (*conv_f)(const unsigned char *, int, wchar_t *); - - if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { - conv_f = udf_char2uni_utf8; - } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { - conv_f = UDF_SB(sb)->s_nls_map->char2uni; - } else - BUG(); - - return udf_name_to_CS0(dname, dlen, sname, slen, conv_f); + return udf_name_to_CS0(sb, dname, dlen, sname, slen); } diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cec550c8468f..123bf7d516fc 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -62,6 +62,8 @@ struct userfaultfd_ctx { enum userfaultfd_state state; /* released */ bool released; + /* memory mappings are changing because of non-cooperative event */ + bool mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ struct mm_struct *mm; }; @@ -641,6 +643,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, * already released. */ out: + WRITE_ONCE(ctx->mmap_changing, false); userfaultfd_ctx_put(ctx); } @@ -686,10 +689,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; ctx->released = false; + ctx->mmap_changing = false; ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); + WRITE_ONCE(octx->mmap_changing, true); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs); @@ -732,6 +737,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); } } @@ -772,6 +778,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma, return true; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); up_read(&mm->mmap_sem); msg_init(&ewq.msg); @@ -815,6 +822,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, return -ENOMEM; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end; @@ -1653,6 +1661,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, user_uffdio_copy = (struct uffdio_copy __user *) arg; + ret = -EAGAIN; + if (READ_ONCE(ctx->mmap_changing)) + goto out; + ret = -EFAULT; if (copy_from_user(&uffdio_copy, user_uffdio_copy, /* don't copy "copy" last field */ @@ -1674,7 +1686,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out; if (mmget_not_zero(ctx->mm)) { ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len); + uffdio_copy.len, &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; @@ -1705,6 +1717,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; + ret = -EAGAIN; + if (READ_ONCE(ctx->mmap_changing)) + goto out; + ret = -EFAULT; if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, /* don't copy "zeropage" last field */ @@ -1721,7 +1737,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, if (mmget_not_zero(ctx->mm)) { ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len); + uffdio_zeropage.range.len, + &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; @@ -1900,6 +1917,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) ctx->features = 0; ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; + ctx->mmap_changing = false; ctx->mm = current->mm; /* prevent the mm struct to be freed */ mmgrab(ctx->mm); diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 46bcf0e649f5..457ac9f97377 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -85,6 +85,24 @@ config XFS_ONLINE_SCRUB If unsure, say N. +config XFS_ONLINE_REPAIR + bool "XFS online metadata repair support" + default n + depends on XFS_FS && XFS_ONLINE_SCRUB + help + If you say Y here you will be able to repair metadata on a + mounted XFS filesystem. This feature is intended to reduce + filesystem downtime by fixing minor problems before they cause the + filesystem to go down. However, it requires that the filesystem be + formatted with secondary metadata, such as reverse mappings and inode + parent pointers. + + This feature is considered EXPERIMENTAL. Use with caution! + + See the xfs_scrub man page in section 8 for additional information. + + If unsure, say N. + config XFS_WARN bool "XFS Verbose Warnings" depends on XFS_FS && !XFS_DEBUG diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7ceb41a9786a..e8d67a443bd7 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -28,6 +28,7 @@ xfs-y += xfs_trace.o # build the libxfs code first xfs-y += $(addprefix libxfs/, \ + xfs_ag.o \ xfs_alloc.o \ xfs_alloc_btree.o \ xfs_attr.o \ @@ -163,4 +164,12 @@ xfs-y += $(addprefix scrub/, \ xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o + +# online repair +ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) +xfs-y += $(addprefix scrub/, \ + agheader_repair.o \ + repair.o \ + ) +endif endif diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c new file mode 100644 index 000000000000..9345802c99f7 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag.c @@ -0,0 +1,464 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2018 Red Hat, Inc. + * All rights reserved. + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" + +static struct xfs_buf * +xfs_get_aghdr_buf( + struct xfs_mount *mp, + xfs_daddr_t blkno, + size_t numblks, + int flags, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + + bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); + if (!bp) + return NULL; + + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + bp->b_bn = blkno; + bp->b_maps[0].bm_bn = blkno; + bp->b_ops = ops; + + return bp; +} + +/* + * Generic btree root block init function + */ +static void +xfs_btroot_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno, 0); +} + +/* + * Alloc btree root block init functions + */ +static void +xfs_bnoroot_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_alloc_rec *arec; + + xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); + arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); + arec->ar_blockcount = cpu_to_be32(id->agsize - + be32_to_cpu(arec->ar_startblock)); +} + +static void +xfs_cntroot_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_alloc_rec *arec; + + xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); + arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); + arec->ar_blockcount = cpu_to_be32(id->agsize - + be32_to_cpu(arec->ar_startblock)); +} + +/* + * Reverse map root block init + */ +static void +xfs_rmaproot_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_rmap_rec *rrec; + + xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno, 0); + + /* + * mark the AG header regions as static metadata The BNO + * btree block is the first block after the headers, so + * it's location defines the size of region the static + * metadata consumes. + * + * Note: unlike mkfs, we never have to account for log + * space when growing the data regions + */ + rrec = XFS_RMAP_REC_ADDR(block, 1); + rrec->rm_startblock = 0; + rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp)); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS); + rrec->rm_offset = 0; + + /* account freespace btree root blocks */ + rrec = XFS_RMAP_REC_ADDR(block, 2); + rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(2); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); + rrec->rm_offset = 0; + + /* account inode btree root blocks */ + rrec = XFS_RMAP_REC_ADDR(block, 3); + rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) - + XFS_IBT_BLOCK(mp)); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT); + rrec->rm_offset = 0; + + /* account for rmap btree root */ + rrec = XFS_RMAP_REC_ADDR(block, 4); + rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(1); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); + rrec->rm_offset = 0; + + /* account for refc btree root */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + rrec = XFS_RMAP_REC_ADDR(block, 5); + rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp)); + rrec->rm_blockcount = cpu_to_be32(1); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + } +} + +/* + * Initialise new secondary superblocks with the pre-grow geometry, but mark + * them as "in progress" so we know they haven't yet been activated. This will + * get cleared when the update with the new geometry information is done after + * changes to the primary are committed. This isn't strictly necessary, but we + * get it for free with the delayed buffer write lists and it means we can tell + * if a grow operation didn't complete properly after the fact. + */ +static void +xfs_sbblock_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + + xfs_sb_to_disk(dsb, &mp->m_sb); + dsb->sb_inprogress = 1; +} + +static void +xfs_agfblock_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + xfs_extlen_t tmpsize; + + agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); + agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); + agf->agf_seqno = cpu_to_be32(id->agno); + agf->agf_length = cpu_to_be32(id->agsize); + agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp)); + agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); + agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); + agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + agf->agf_roots[XFS_BTNUM_RMAPi] = + cpu_to_be32(XFS_RMAP_BLOCK(mp)); + agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); + agf->agf_rmap_blocks = cpu_to_be32(1); + } + + agf->agf_flfirst = cpu_to_be32(1); + agf->agf_fllast = 0; + agf->agf_flcount = 0; + tmpsize = id->agsize - mp->m_ag_prealloc_blocks; + agf->agf_freeblks = cpu_to_be32(tmpsize); + agf->agf_longest = cpu_to_be32(tmpsize); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + agf->agf_refcount_root = cpu_to_be32( + xfs_refc_block(mp)); + agf->agf_refcount_level = cpu_to_be32(1); + agf->agf_refcount_blocks = cpu_to_be32(1); + } +} + +static void +xfs_agflblock_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + __be32 *agfl_bno; + int bucket; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); + agfl->agfl_seqno = cpu_to_be32(id->agno); + uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); + } + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); + for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++) + agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); +} + +static void +xfs_agiblock_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + int bucket; + + agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); + agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); + agi->agi_seqno = cpu_to_be32(id->agno); + agi->agi_length = cpu_to_be32(id->agsize); + agi->agi_count = 0; + agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp)); + agi->agi_level = cpu_to_be32(1); + agi->agi_freecount = 0; + agi->agi_newino = cpu_to_be32(NULLAGINO); + agi->agi_dirino = cpu_to_be32(NULLAGINO); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); + agi->agi_free_level = cpu_to_be32(1); + } + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) + agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); +} + +typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp, + struct aghdr_init_data *id); +static int +xfs_ag_init_hdr( + struct xfs_mount *mp, + struct aghdr_init_data *id, + aghdr_init_work_f work, + const struct xfs_buf_ops *ops) + +{ + struct xfs_buf *bp; + + bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops); + if (!bp) + return -ENOMEM; + + (*work)(mp, bp, id); + + xfs_buf_delwri_queue(bp, &id->buffer_list); + xfs_buf_relse(bp); + return 0; +} + +struct xfs_aghdr_grow_data { + xfs_daddr_t daddr; + size_t numblks; + const struct xfs_buf_ops *ops; + aghdr_init_work_f work; + xfs_btnum_t type; + bool need_init; +}; + +/* + * Prepare new AG headers to be written to disk. We use uncached buffers here, + * as it is assumed these new AG headers are currently beyond the currently + * valid filesystem address space. Using cached buffers would trip over EOFS + * corruption detection alogrithms in the buffer cache lookup routines. + * + * This is a non-transactional function, but the prepared buffers are added to a + * delayed write buffer list supplied by the caller so they can submit them to + * disk and wait on them as required. + */ +int +xfs_ag_init_headers( + struct xfs_mount *mp, + struct aghdr_init_data *id) + +{ + struct xfs_aghdr_grow_data aghdr_data[] = { + { /* SB */ + .daddr = XFS_AG_DADDR(mp, id->agno, XFS_SB_DADDR), + .numblks = XFS_FSS_TO_BB(mp, 1), + .ops = &xfs_sb_buf_ops, + .work = &xfs_sbblock_init, + .need_init = true + }, + { /* AGF */ + .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp)), + .numblks = XFS_FSS_TO_BB(mp, 1), + .ops = &xfs_agf_buf_ops, + .work = &xfs_agfblock_init, + .need_init = true + }, + { /* AGFL */ + .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp)), + .numblks = XFS_FSS_TO_BB(mp, 1), + .ops = &xfs_agfl_buf_ops, + .work = &xfs_agflblock_init, + .need_init = true + }, + { /* AGI */ + .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp)), + .numblks = XFS_FSS_TO_BB(mp, 1), + .ops = &xfs_agi_buf_ops, + .work = &xfs_agiblock_init, + .need_init = true + }, + { /* BNO root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_allocbt_buf_ops, + .work = &xfs_bnoroot_init, + .need_init = true + }, + { /* CNT root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_allocbt_buf_ops, + .work = &xfs_cntroot_init, + .need_init = true + }, + { /* INO root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_inobt_buf_ops, + .work = &xfs_btroot_init, + .type = XFS_BTNUM_INO, + .need_init = true + }, + { /* FINO root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_inobt_buf_ops, + .work = &xfs_btroot_init, + .type = XFS_BTNUM_FINO, + .need_init = xfs_sb_version_hasfinobt(&mp->m_sb) + }, + { /* RMAP root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_rmapbt_buf_ops, + .work = &xfs_rmaproot_init, + .need_init = xfs_sb_version_hasrmapbt(&mp->m_sb) + }, + { /* REFC root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_refcountbt_buf_ops, + .work = &xfs_btroot_init, + .type = XFS_BTNUM_REFC, + .need_init = xfs_sb_version_hasreflink(&mp->m_sb) + }, + { /* NULL terminating block */ + .daddr = XFS_BUF_DADDR_NULL, + } + }; + struct xfs_aghdr_grow_data *dp; + int error = 0; + + /* Account for AG free space in new AG */ + id->nfree += id->agsize - mp->m_ag_prealloc_blocks; + for (dp = &aghdr_data[0]; dp->daddr != XFS_BUF_DADDR_NULL; dp++) { + if (!dp->need_init) + continue; + + id->daddr = dp->daddr; + id->numblks = dp->numblks; + id->type = dp->type; + error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops); + if (error) + break; + } + return error; +} + +/* + * Extent the AG indicated by the @id by the length passed in + */ +int +xfs_ag_extend_space( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct aghdr_init_data *id, + xfs_extlen_t len) +{ + struct xfs_owner_info oinfo; + struct xfs_buf *bp; + struct xfs_agi *agi; + struct xfs_agf *agf; + int error; + + /* + * Change the agi length. + */ + error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp); + if (error) + return error; + + agi = XFS_BUF_TO_AGI(bp); + be32_add_cpu(&agi->agi_length, len); + ASSERT(id->agno == mp->m_sb.sb_agcount - 1 || + be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); + xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); + + /* + * Change agf length. + */ + error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp); + if (error) + return error; + + agf = XFS_BUF_TO_AGF(bp); + be32_add_cpu(&agf->agf_length, len); + ASSERT(agf->agf_length == agi->agi_length); + xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); + + /* + * Free the new space. + * + * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that + * this doesn't actually exist in the rmap btree. + */ + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); + error = xfs_rmap_free(tp, bp, id->agno, + be32_to_cpu(agf->agf_length) - len, + len, &oinfo); + if (error) + return error; + + return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno, + be32_to_cpu(agf->agf_length) - len), + len, &oinfo, XFS_AG_RESV_NONE); +} diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h new file mode 100644 index 000000000000..412702e23f61 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2018 Red Hat, Inc. + * All rights reserved. + */ + +#ifndef __LIBXFS_AG_H +#define __LIBXFS_AG_H 1 + +struct xfs_mount; +struct xfs_trans; + +struct aghdr_init_data { + /* per ag data */ + xfs_agblock_t agno; /* ag to init */ + xfs_extlen_t agsize; /* new AG size */ + struct list_head buffer_list; /* buffer writeback list */ + xfs_rfsblock_t nfree; /* cumulative new free space */ + + /* per header data */ + xfs_daddr_t daddr; /* header location */ + size_t numblks; /* size of header */ + xfs_btnum_t type; /* type of btree root block */ +}; + +int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); +int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp, + struct aghdr_init_data *id, xfs_extlen_t len); + +#endif /* __LIBXFS_AG_H */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 4bcc095fe44a..dc9dd3805d97 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -39,6 +39,9 @@ #include "xfs_buf_item.h" #include "xfs_log.h" #include "xfs_ag_resv.h" +#include "xfs_bmap.h" + +extern kmem_zone_t *xfs_bmap_free_item_zone; struct workqueue_struct *xfs_alloc_wq; @@ -2060,6 +2063,30 @@ xfs_alloc_space_available( return true; } +int +xfs_free_agfl_block( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + struct xfs_buf *agbp, + struct xfs_owner_info *oinfo) +{ + int error; + struct xfs_buf *bp; + + error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo, + XFS_AG_RESV_AGFL); + if (error) + return error; + + bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno, 0); + if (!bp) + return -EFSCORRUPTED; + xfs_trans_binval(tp, bp); + + return 0; +} + /* * Check the agfl fields of the agf for inconsistency or corruption. The purpose * is to detect an agfl header padding mismatch between current and early v5 @@ -2148,6 +2175,40 @@ xfs_agfl_reset( } /* + * Defer an AGFL block free. This is effectively equivalent to + * xfs_bmap_add_free() with some special handling particular to AGFL blocks. + * + * Deferring AGFL frees helps prevent log reservation overruns due to too many + * allocation operations in a transaction. AGFL frees are prone to this problem + * because for one they are always freed one at a time. Further, an immediate + * AGFL block free can cause a btree join and require another block free before + * the real allocation can proceed. Deferring the free disconnects freeing up + * the AGFL slot from freeing the block. + */ +STATIC void +xfs_defer_agfl_block( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_agnumber_t agno, + xfs_fsblock_t agbno, + struct xfs_owner_info *oinfo) +{ + struct xfs_extent_free_item *new; /* new element */ + + ASSERT(xfs_bmap_free_item_zone != NULL); + ASSERT(oinfo != NULL); + + new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); + new->xefi_blockcount = 1; + new->xefi_oinfo = *oinfo; + + trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); + + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); +} + +/* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ @@ -2247,21 +2308,20 @@ xfs_alloc_fix_freelist( else xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG); while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { - struct xfs_buf *bp; - error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); if (error) goto out_agbp_relse; - error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, - &targs.oinfo, XFS_AG_RESV_AGFL); - if (error) - goto out_agbp_relse; - bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); - if (!bp) { - error = -EFSCORRUPTED; - goto out_agbp_relse; + + /* defer agfl frees if dfops is provided */ + if (tp->t_agfl_dfops) { + xfs_defer_agfl_block(mp, tp->t_agfl_dfops, args->agno, + bno, &targs.oinfo); + } else { + error = xfs_free_agfl_block(tp, args->agno, bno, agbp, + &targs.oinfo); + if (error) + goto out_agbp_relse; } - xfs_trans_binval(tp, bp); } targs.tp = tp; @@ -2949,18 +3009,20 @@ out: * after fixing up the freelist. */ int /* error */ -xfs_free_extent( +__xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ struct xfs_owner_info *oinfo, /* extent owner */ - enum xfs_ag_resv_type type) /* block reservation type */ + enum xfs_ag_resv_type type, /* block reservation type */ + bool skip_discard) { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *agbp; xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); int error; + unsigned int busy_flags = 0; ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); @@ -2984,7 +3046,9 @@ xfs_free_extent( if (error) goto err; - xfs_extent_busy_insert(tp, agno, agbno, len, 0); + if (skip_discard) + busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; + xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags); return 0; err: @@ -3116,3 +3180,40 @@ xfs_alloc_has_record( return xfs_btree_has_record(cur, &low, &high, exists); } + +/* + * Walk all the blocks in the AGFL. The @walk_fn can return any negative + * error code or XFS_BTREE_QUERY_RANGE_ABORT. + */ +int +xfs_agfl_walk( + struct xfs_mount *mp, + struct xfs_agf *agf, + struct xfs_buf *agflbp, + xfs_agfl_walk_fn walk_fn, + void *priv) +{ + __be32 *agfl_bno; + unsigned int i; + int error; + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + i = be32_to_cpu(agf->agf_flfirst); + + /* Nothing to walk in an empty AGFL. */ + if (agf->agf_flcount == cpu_to_be32(0)) + return 0; + + /* Otherwise, walk from first to last, wrapping as needed. */ + for (;;) { + error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv); + if (error) + return error; + if (i == be32_to_cpu(agf->agf_fllast)) + break; + if (++i == xfs_agfl_size(mp)) + i = 0; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index cbf789ea5a4e..0747adcd57d6 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -191,12 +191,24 @@ xfs_alloc_vextent( * Free an extent. */ int /* error */ -xfs_free_extent( +__xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ struct xfs_owner_info *oinfo, /* extent owner */ - enum xfs_ag_resv_type type); /* block reservation type */ + enum xfs_ag_resv_type type, /* block reservation type */ + bool skip_discard); + +static inline int +xfs_free_extent( + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) +{ + return __xfs_free_extent(tp, bno, len, oinfo, type, false); +} int /* error */ xfs_alloc_lookup_le( @@ -223,6 +235,8 @@ int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); +int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, + struct xfs_buf *, struct xfs_owner_info *); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **agbp); @@ -248,4 +262,9 @@ bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); int xfs_alloc_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exist); +typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno, + void *priv); +int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf, + struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index b451649ba176..18aec7a0e599 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -547,3 +547,12 @@ xfs_allocbt_maxrecs( return blocklen / sizeof(xfs_alloc_rec_t); return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t)); } + +/* Calculate the freespace btree size for some records. */ +xfs_extlen_t +xfs_allocbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_alloc_mnr, len); +} diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index 45e189e7e81c..2fd54728871c 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -61,5 +61,7 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, xfs_btnum_t); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); +extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, + unsigned long long len); #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 35a124400d60..c3d02a66d39d 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -236,7 +236,7 @@ xfs_attr_set( args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; args.total = xfs_attr_calc_size(&args, &local); - error = xfs_qm_dqattach(dp, 0); + error = xfs_qm_dqattach(dp); if (error) return error; @@ -427,7 +427,7 @@ xfs_attr_remove( */ args.op_flags = XFS_DA_OP_OKNOENT; - error = xfs_qm_dqattach(dp, 0); + error = xfs_qm_dqattach(dp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 21be186067a2..83a6d3c7f872 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -620,7 +620,7 @@ xfs_attr_rmtval_remove( /* * If the "remote" value is in the cache, remove it. */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); + bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); if (bp) { xfs_buf_stale(bp); xfs_buf_relse(bp); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 040eeda8426f..7b0e2b551e23 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -246,7 +246,7 @@ xfs_bmap_get_bp( struct xfs_btree_cur *cur, xfs_fsblock_t bno) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; int i; if (!cur) @@ -260,9 +260,9 @@ xfs_bmap_get_bp( } /* Chase down all the log items to see if the bp is there */ - list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { - struct xfs_buf_log_item *bip; - bip = (struct xfs_buf_log_item *)lidp->lid_item; + list_for_each_entry(lip, &cur->bc_tp->t_items, li_trans) { + struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip; + if (bip->bli_item.li_type == XFS_LI_BUF && XFS_BUF_ADDR(bip->bli_buf) == bno) return bip->bli_buf; @@ -312,8 +312,9 @@ xfs_check_block( xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", __func__, j, i, (unsigned long long)be64_to_cpu(*thispa)); - panic("%s: ptrs are equal in node\n", + xfs_err(mp, "%s: ptrs are equal in node\n", __func__); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } } } @@ -483,7 +484,8 @@ error0: error_norelse: xfs_warn(mp, "%s: BAD after btree leaves for %d extents", __func__, i); - panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); + xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return; } @@ -542,12 +544,13 @@ xfs_bmap_validate_ret( * The list is maintained sorted (by block number). */ void -xfs_bmap_add_free( +__xfs_bmap_add_free( struct xfs_mount *mp, struct xfs_defer_ops *dfops, xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_owner_info *oinfo) + struct xfs_owner_info *oinfo, + bool skip_discard) { struct xfs_extent_free_item *new; /* new element */ #ifdef DEBUG @@ -574,6 +577,7 @@ xfs_bmap_add_free( new->xefi_oinfo = *oinfo; else xfs_rmap_skip_owner_update(&new->xefi_oinfo); + new->xefi_skip_discard = skip_discard; trace_xfs_bmap_free_defer(mp, XFS_FSB_TO_AGNO(mp, bno), 0, XFS_FSB_TO_AGBNO(mp, bno), len); xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); @@ -2001,10 +2005,13 @@ xfs_bmap_add_extent_delay_real( ASSERT(0); } - /* add reverse mapping */ - error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new); - if (error) - goto done; + /* add reverse mapping unless caller opted out */ + if (!(bma->flags & XFS_BMAPI_NORMAP)) { + error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, + whichfork, new); + if (error) + goto done; + } /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(bma->ip, whichfork)) { @@ -2668,7 +2675,8 @@ xfs_bmap_add_extent_hole_real( struct xfs_bmbt_irec *new, xfs_fsblock_t *first, struct xfs_defer_ops *dfops, - int *logflagsp) + int *logflagsp, + int flags) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_mount *mp = ip->i_mount; @@ -2845,10 +2853,12 @@ xfs_bmap_add_extent_hole_real( break; } - /* add reverse mapping */ - error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new); - if (error) - goto done; + /* add reverse mapping unless caller opted out */ + if (!(flags & XFS_BMAPI_NORMAP)) { + error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new); + if (error) + goto done; + } /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(ip, whichfork)) { @@ -4123,7 +4133,8 @@ xfs_bmapi_allocate( else error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, whichfork, &bma->icur, &bma->cur, &bma->got, - bma->firstblock, bma->dfops, &bma->logflags); + bma->firstblock, bma->dfops, &bma->logflags, + bma->flags); bma->logflags |= tmp_logflags; if (error) @@ -4509,30 +4520,37 @@ error0: return error; } -static int +int xfs_bmapi_remap( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - struct xfs_defer_ops *dfops) + struct xfs_defer_ops *dfops, + int flags) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp; struct xfs_btree_cur *cur = NULL; xfs_fsblock_t firstblock = NULLFSBLOCK; struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; + int whichfork = xfs_bmapi_whichfork(flags); int logflags = 0, error; + ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(len > 0); ASSERT(len <= (xfs_filblks_t)MAXEXTLEN); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC | + XFS_BMAPI_NORMAP))); + ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) != + (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; @@ -4542,7 +4560,7 @@ xfs_bmapi_remap( return -EIO; if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; } @@ -4557,7 +4575,7 @@ xfs_bmapi_remap( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (ifp->if_flags & XFS_IFBROOT) { - cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = firstblock; cur->bc_private.b.dfops = dfops; cur->bc_private.b.flags = 0; @@ -4566,18 +4584,21 @@ xfs_bmapi_remap( got.br_startoff = bno; got.br_startblock = startblock; got.br_blockcount = len; - got.br_state = XFS_EXT_NORM; + if (flags & XFS_BMAPI_PREALLOC) + got.br_state = XFS_EXT_UNWRITTEN; + else + got.br_state = XFS_EXT_NORM; - error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &icur, - &cur, &got, &firstblock, dfops, &logflags); + error = xfs_bmap_add_extent_hole_real(tp, ip, whichfork, &icur, + &cur, &got, &firstblock, dfops, &logflags, flags); if (error) goto error0; - if (xfs_bmap_wants_extents(ip, XFS_DATA_FORK)) { + if (xfs_bmap_wants_extents(ip, whichfork)) { int tmp_logflags = 0; error = xfs_bmap_btree_to_extents(tp, ip, cur, - &tmp_logflags, XFS_DATA_FORK); + &tmp_logflags, whichfork); logflags |= tmp_logflags; } @@ -5104,9 +5125,12 @@ xfs_bmap_del_extent_real( error = xfs_refcount_decrease_extent(mp, dfops, del); if (error) goto done; - } else - xfs_bmap_add_free(mp, dfops, del->br_startblock, - del->br_blockcount, NULL); + } else { + __xfs_bmap_add_free(mp, dfops, del->br_startblock, + del->br_blockcount, NULL, + (bflags & XFS_BMAPI_NODISCARD) || + del->br_state == XFS_EXT_UNWRITTEN); + } } /* @@ -6148,7 +6172,7 @@ xfs_bmap_finish_one( switch (type) { case XFS_BMAP_MAP: error = xfs_bmapi_remap(tp, ip, startoff, *blockcount, - startblock, dfops); + startblock, dfops, 0); *blockcount = 0; break; case XFS_BMAP_UNMAP: diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 2b766b37096d..2c233f9f1a26 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -68,6 +68,7 @@ struct xfs_extent_free_item xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ struct list_head xefi_list; struct xfs_owner_info xefi_oinfo; /* extent owner */ + bool xefi_skip_discard; }; #define XFS_BMAP_MAX_NMAP 4 @@ -116,6 +117,12 @@ struct xfs_extent_free_item /* Only convert unwritten extents, don't allocate new blocks */ #define XFS_BMAPI_CONVERT_ONLY 0x800 +/* Skip online discard of freed extents */ +#define XFS_BMAPI_NODISCARD 0x1000 + +/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ +#define XFS_BMAPI_NORMAP 0x2000 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -128,7 +135,9 @@ struct xfs_extent_free_item { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ { XFS_BMAPI_DELALLOC, "DELALLOC" }, \ - { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" } + { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \ + { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ + { XFS_BMAPI_NORMAP, "NORMAP" } static inline int xfs_bmapi_aflag(int w) @@ -192,9 +201,9 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); -void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, +void __xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_owner_info *oinfo); + struct xfs_owner_info *oinfo, bool skip_discard); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); @@ -240,6 +249,17 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); +static inline void +xfs_bmap_add_free( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_fsblock_t bno, + xfs_filblks_t len, + struct xfs_owner_info *oinfo) +{ + __xfs_bmap_add_free(mp, dfops, bno, len, oinfo, false); +} + enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, XFS_BMAP_UNMAP, @@ -277,4 +297,8 @@ static inline int xfs_bmap_fork_to_state(int whichfork) xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); +int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, + struct xfs_defer_ops *dfops, int flags); + #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index d89d06bea6e3..ac9d4aeedb09 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -660,3 +660,12 @@ xfs_bmbt_change_owner( xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); return error; } + +/* Calculate the bmap btree size for some records. */ +unsigned long long +xfs_bmbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_bmap_dmnr, len); +} diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index e4505746ccaa..fb3cd2d9e0f8 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -118,4 +118,7 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp, + unsigned long long len); + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index ac7d66427e42..c825c8182b30 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4836,14 +4836,14 @@ xfs_btree_query_all( * Calculate the number of blocks needed to store a given number of records * in a short-format (per-AG metadata) btree. */ -xfs_extlen_t +unsigned long long xfs_btree_calc_size( uint *limits, unsigned long long len) { int level; int maxrecs; - xfs_extlen_t rval; + unsigned long long rval; maxrecs = limits[0]; for (level = 0, rval = 0; len > 1; level++) { @@ -4919,3 +4919,24 @@ xfs_btree_has_record( *exists = false; return error; } + +/* Are there more records in this btree? */ +bool +xfs_btree_has_more_records( + struct xfs_btree_cur *cur) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cur, 0, &bp); + + /* There are still records in this block. */ + if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block)) + return true; + + /* There are more record blocks. */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); + else + return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 9227159a751e..d7911efee6dc 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -482,7 +482,7 @@ xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, unsigned int max_recs); uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); -xfs_extlen_t xfs_btree_calc_size(uint *limits, unsigned long long len); +unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); /* return codes */ #define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ @@ -528,5 +528,6 @@ union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low, union xfs_btree_irec *high, bool *exists); +bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 087fea02c389..3daf175e2535 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -220,7 +220,7 @@ xfs_defer_trans_abort( { struct xfs_defer_pending *dfp; - trace_xfs_defer_trans_abort(tp->t_mountp, dop); + trace_xfs_defer_trans_abort(tp->t_mountp, dop, _RET_IP_); /* Abort intent items that don't have a done item. */ list_for_each_entry(dfp, &dop->dop_pending, dfp_list) { @@ -253,7 +253,7 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) xfs_trans_dirty_buf(*tp, dop->dop_bufs[i]); - trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); + trace_xfs_defer_trans_roll((*tp)->t_mountp, dop, _RET_IP_); /* Roll the transaction. */ error = xfs_trans_roll(tp); @@ -352,10 +352,21 @@ xfs_defer_finish( void *state; int error = 0; void (*cleanup_fn)(struct xfs_trans *, void *, int); + struct xfs_defer_ops *orig_dop; ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - trace_xfs_defer_finish((*tp)->t_mountp, dop); + trace_xfs_defer_finish((*tp)->t_mountp, dop, _RET_IP_); + + /* + * Attach dfops to the transaction during deferred ops processing. This + * explicitly causes calls into the allocator to defer AGFL block frees. + * Note that this code can go away once all dfops users attach to the + * associated tp. + */ + ASSERT(!(*tp)->t_agfl_dfops || ((*tp)->t_agfl_dfops == dop)); + orig_dop = (*tp)->t_agfl_dfops; + (*tp)->t_agfl_dfops = dop; /* Until we run out of pending work to finish... */ while (xfs_defer_has_unfinished_work(dop)) { @@ -428,10 +439,11 @@ xfs_defer_finish( } out: + (*tp)->t_agfl_dfops = orig_dop; if (error) trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error); else - trace_xfs_defer_finish_done((*tp)->t_mountp, dop); + trace_xfs_defer_finish_done((*tp)->t_mountp, dop, _RET_IP_); return error; } @@ -447,7 +459,7 @@ xfs_defer_cancel( struct list_head *pwi; struct list_head *n; - trace_xfs_defer_cancel(NULL, dop); + trace_xfs_defer_cancel(NULL, dop, _RET_IP_); /* * Free the pending items. Caller should already have arranged @@ -532,5 +544,5 @@ xfs_defer_init( *fbp = NULLFSBLOCK; INIT_LIST_HEAD(&dop->dop_intake); INIT_LIST_HEAD(&dop->dop_pending); - trace_xfs_defer_init(NULL, dop); + trace_xfs_defer_init(NULL, dop, _RET_IP_); } diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 045beacdd37d..e70725ba1f5f 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -55,6 +55,7 @@ enum xfs_defer_ops_type { XFS_DEFER_OPS_TYPE_REFCOUNT, XFS_DEFER_OPS_TYPE_RMAP, XFS_DEFER_OPS_TYPE_FREE, + XFS_DEFER_OPS_TYPE_AGFL_FREE, XFS_DEFER_OPS_TYPE_MAX, }; diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 8b7a6c3cb599..cce520becee4 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -41,14 +41,18 @@ xfs_calc_dquots_per_chunk( /* * Do some primitive error checking on ondisk dquot data structures. + * + * The xfs_dqblk structure /contains/ the xfs_disk_dquot structure; + * we verify them separately because at some points we have only the + * smaller xfs_disk_dquot structure available. */ + xfs_failaddr_t xfs_dquot_verify( struct xfs_mount *mp, xfs_disk_dquot_t *ddq, xfs_dqid_t id, - uint type, /* used only when IO_dorepair is true */ - uint flags) + uint type) /* used only during quotacheck */ { /* * We can encounter an uninitialized dquot buffer for 2 reasons: @@ -70,6 +74,8 @@ xfs_dquot_verify( if (ddq->d_version != XFS_DQUOT_VERSION) return __this_address; + if (type && ddq->d_flags != type) + return __this_address; if (ddq->d_flags != XFS_DQ_USER && ddq->d_flags != XFS_DQ_PROJ && ddq->d_flags != XFS_DQ_GROUP) @@ -99,33 +105,44 @@ xfs_dquot_verify( return NULL; } +xfs_failaddr_t +xfs_dqblk_verify( + struct xfs_mount *mp, + struct xfs_dqblk *dqb, + xfs_dqid_t id, + uint type) /* used only during quotacheck */ +{ + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + + return xfs_dquot_verify(mp, &dqb->dd_diskdq, id, type); +} + /* * Do some primitive error checking on ondisk dquot data structures. */ int -xfs_dquot_repair( +xfs_dqblk_repair( struct xfs_mount *mp, - struct xfs_disk_dquot *ddq, + struct xfs_dqblk *dqb, xfs_dqid_t id, uint type) { - struct xfs_dqblk *d = (struct xfs_dqblk *)ddq; - - /* * Typically, a repair is only requested by quotacheck. */ ASSERT(id != -1); - memset(d, 0, sizeof(xfs_dqblk_t)); + memset(dqb, 0, sizeof(xfs_dqblk_t)); - d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); - d->dd_diskdq.d_version = XFS_DQUOT_VERSION; - d->dd_diskdq.d_flags = type; - d->dd_diskdq.d_id = cpu_to_be32(id); + dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION; + dqb->dd_diskdq.d_flags = type; + dqb->dd_diskdq.d_id = cpu_to_be32(id); if (xfs_sb_version_hascrc(&mp->m_sb)) { - uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); - xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + uuid_copy(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid); + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } @@ -135,7 +152,8 @@ xfs_dquot_repair( STATIC bool xfs_dquot_buf_verify_crc( struct xfs_mount *mp, - struct xfs_buf *bp) + struct xfs_buf *bp, + bool readahead) { struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; int ndquots; @@ -156,10 +174,12 @@ xfs_dquot_buf_verify_crc( for (i = 0; i < ndquots; i++, d++) { if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), - XFS_DQUOT_CRC_OFF)) - return false; - if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_meta_uuid)) + XFS_DQUOT_CRC_OFF)) { + if (!readahead) + xfs_buf_verifier_error(bp, -EFSBADCRC, __func__, + d, sizeof(*d), __this_address); return false; + } } return true; } @@ -167,9 +187,10 @@ xfs_dquot_buf_verify_crc( STATIC xfs_failaddr_t xfs_dquot_buf_verify( struct xfs_mount *mp, - struct xfs_buf *bp) + struct xfs_buf *bp, + bool readahead) { - struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + struct xfs_dqblk *dqb = bp->b_addr; xfs_failaddr_t fa; xfs_dqid_t id = 0; int ndquots; @@ -195,14 +216,19 @@ xfs_dquot_buf_verify( for (i = 0; i < ndquots; i++) { struct xfs_disk_dquot *ddq; - ddq = &d[i].dd_diskdq; + ddq = &dqb[i].dd_diskdq; if (i == 0) id = be32_to_cpu(ddq->d_id); - fa = xfs_dquot_verify(mp, ddq, id + i, 0, 0); - if (fa) + fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0); + if (fa) { + if (!readahead) + xfs_buf_verifier_error(bp, -EFSCORRUPTED, + __func__, &dqb[i], + sizeof(struct xfs_dqblk), fa); return fa; + } } return NULL; @@ -214,7 +240,7 @@ xfs_dquot_buf_verify_struct( { struct xfs_mount *mp = bp->b_target->bt_mount; - return xfs_dquot_buf_verify(mp, bp); + return xfs_dquot_buf_verify(mp, bp, false); } static void @@ -222,15 +248,10 @@ xfs_dquot_buf_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_failaddr_t fa; - if (!xfs_dquot_buf_verify_crc(mp, bp)) - xfs_verifier_error(bp, -EFSBADCRC, __this_address); - else { - fa = xfs_dquot_buf_verify(mp, bp); - if (fa) - xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); - } + if (!xfs_dquot_buf_verify_crc(mp, bp, false)) + return; + xfs_dquot_buf_verify(mp, bp, false); } /* @@ -245,8 +266,8 @@ xfs_dquot_buf_readahead_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if (!xfs_dquot_buf_verify_crc(mp, bp) || - xfs_dquot_buf_verify(mp, bp) != NULL) { + if (!xfs_dquot_buf_verify_crc(mp, bp, true) || + xfs_dquot_buf_verify(mp, bp, true) != NULL) { xfs_buf_ioerror(bp, -EIO); bp->b_flags &= ~XBF_DONE; } @@ -262,11 +283,8 @@ xfs_dquot_buf_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_failaddr_t fa; - fa = xfs_dquot_buf_verify(mp, bp); - if (fa) - xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); + xfs_dquot_buf_verify(mp, bp, false); } const struct xfs_buf_ops xfs_dquot_buf_ops = { diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index bc1789d95152..d47b91625945 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -65,7 +65,8 @@ #define XFS_ERRTAG_LOG_BAD_CRC 29 #define XFS_ERRTAG_LOG_ITEM_PIN 30 #define XFS_ERRTAG_BUF_LRU_REF 31 -#define XFS_ERRTAG_MAX 32 +#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32 +#define XFS_ERRTAG_MAX 33 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -102,5 +103,6 @@ #define XFS_RANDOM_LOG_BAD_CRC 1 #define XFS_RANDOM_LOG_ITEM_PIN 1 #define XFS_RANDOM_BUF_LRU_REF 2 +#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 42956d8d95ed..c1cb29a5f4f6 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -98,6 +98,9 @@ struct xfs_ifork; XFS_SB_VERSION2_PROJID32BIT | \ XFS_SB_VERSION2_FTYPE) +/* Maximum size of the xfs filesystem label, no terminating NULL */ +#define XFSLABEL_MAX 12 + /* * Superblock - in core version. Must match the ondisk version below. * Must be padded to 64 bit alignment. @@ -122,7 +125,7 @@ typedef struct xfs_sb { uint16_t sb_sectsize; /* volume sector size, bytes */ uint16_t sb_inodesize; /* inode size, bytes */ uint16_t sb_inopblock; /* inodes per block */ - char sb_fname[12]; /* file system name */ + char sb_fname[XFSLABEL_MAX]; /* file system name */ uint8_t sb_blocklog; /* log2 of sb_blocksize */ uint8_t sb_sectlog; /* log2 of sb_sectsize */ uint8_t sb_inodelog; /* log2 of sb_inodesize */ @@ -213,7 +216,7 @@ typedef struct xfs_dsb { __be16 sb_sectsize; /* volume sector size, bytes */ __be16 sb_inodesize; /* inode size, bytes */ __be16 sb_inopblock; /* inodes per block */ - char sb_fname[12]; /* file system name */ + char sb_fname[XFSLABEL_MAX]; /* file system name */ __u8 sb_blocklog; /* log2 of sb_blocksize */ __u8 sb_sectlog; /* log2 of sb_sectsize */ __u8 sb_inodelog; /* log2 of sb_inodesize */ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index faf1a4edd618..dddc75e4f1f6 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -542,13 +542,20 @@ struct xfs_scrub_metadata { /* o: Metadata object looked funny but isn't corrupt. */ #define XFS_SCRUB_OFLAG_WARNING (1 << 6) +/* + * o: IFLAG_REPAIR was set but metadata object did not need fixing or + * optimization and has therefore not been altered. + */ +#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7) + #define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR) #define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \ XFS_SCRUB_OFLAG_PREEN | \ XFS_SCRUB_OFLAG_XFAIL | \ XFS_SCRUB_OFLAG_XCORRUPT | \ XFS_SCRUB_OFLAG_INCOMPLETE | \ - XFS_SCRUB_OFLAG_WARNING) + XFS_SCRUB_OFLAG_WARNING | \ + XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED) #define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT) /* diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index de627fa19168..4ca4ff7a757d 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -148,7 +148,7 @@ xfs_inobt_get_rec( /* * Insert a single inobt record. Cursor must already point to desired location. */ -STATIC int +int xfs_inobt_insert_rec( struct xfs_btree_cur *cur, uint16_t holemask, diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index c5402bb4ce0c..77fffced8bac 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -176,6 +176,9 @@ int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low, xfs_agino_t high, bool *exists); int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count, xfs_agino_t *freecount); +int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask, + uint8_t count, int32_t freecount, xfs_inofree_t free, + int *stat); int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 367e9a0726e6..b04c55512159 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -296,7 +296,7 @@ xfs_inobt_verify( case cpu_to_be32(XFS_FIBT_MAGIC): break; default: - return NULL; + return __this_address; } /* level verification */ @@ -608,3 +608,12 @@ xfs_finobt_calc_reserves( *used += tree_len; return 0; } + +/* Calculate the inobt btree size for some records. */ +xfs_extlen_t +xfs_iallocbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_inobt_mnr, len); +} diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index aa81e2e63f3f..4acdd5458d59 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -74,5 +74,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); +extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp, + unsigned long long len); #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index bb1b13a9b5f4..d4af2804b178 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -107,14 +107,12 @@ typedef uint16_t xfs_qwarncnt_t; * to a single function. None of these XFS_QMOPT_* flags are meant to have * persistent values (ie. their values can and will change between versions) */ -#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ #define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ #define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ #define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ -#define XFS_QMOPT_DQNEXT 0x0008000 /* return next dquot >= this ID */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be @@ -152,10 +150,11 @@ typedef uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, - struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type, - uint flags); + struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type); +extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, + struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); -extern int xfs_dquot_repair(struct xfs_mount *mp, struct xfs_disk_dquot *ddq, +extern int xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 560e28473024..418d53295893 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -88,8 +88,25 @@ xfs_refcount_lookup_ge( return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); } +/* + * Look up the first record equal to [bno, len] in the btree + * given by cur. + */ +int +xfs_refcount_lookup_eq( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + int *stat) +{ + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + XFS_LOOKUP_LE); + cur->bc_rec.rc.rc_startblock = bno; + cur->bc_rec.rc.rc_blockcount = 0; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + /* Convert on-disk record to in-core format. */ -static inline void +void xfs_refcount_btrec_to_irec( union xfs_btree_rec *rec, struct xfs_refcount_irec *irec) @@ -149,7 +166,7 @@ xfs_refcount_update( * by [bno, len, refcount]. * This either works (return 0) or gets an EFSCORRUPTED error. */ -STATIC int +int xfs_refcount_insert( struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, @@ -162,7 +179,10 @@ xfs_refcount_insert( cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; cur->bc_rec.rc.rc_refcount = irec->rc_refcount; error = xfs_btree_insert(cur, i); + if (error) + goto out_error; XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); + out_error: if (error) trace_xfs_refcount_insert_error(cur->bc_mp, diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 2a731ac68fe4..a92ad9078bc1 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -24,6 +24,8 @@ extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, xfs_agblock_t bno, int *stat); +extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur, + xfs_agblock_t bno, int *stat); extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); @@ -85,5 +87,10 @@ static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); +union xfs_btree_rec; +extern void xfs_refcount_btrec_to_irec(union xfs_btree_rec *rec, + struct xfs_refcount_irec *irec); +extern int xfs_refcount_insert(struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, int *stat); #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index fba8d2718017..c0644f1be8a8 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -1374,6 +1374,8 @@ xfs_rmap_convert_shared( */ error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, &PREV, &i); + if (error) + goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); ASSERT(PREV.rm_offset <= offset); @@ -2030,6 +2032,34 @@ out_error: return error; } +/* Insert a raw rmap into the rmapbt. */ +int +xfs_rmap_map_raw( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rmap) +{ + struct xfs_owner_info oinfo; + + oinfo.oi_owner = rmap->rm_owner; + oinfo.oi_offset = rmap->rm_offset; + oinfo.oi_flags = 0; + if (rmap->rm_flags & XFS_RMAP_ATTR_FORK) + oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK) + oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; + + if (rmap->rm_flags || XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return xfs_rmap_map(cur, rmap->rm_startblock, + rmap->rm_blockcount, + rmap->rm_flags & XFS_RMAP_UNWRITTEN, + &oinfo); + + return xfs_rmap_map_shared(cur, rmap->rm_startblock, + rmap->rm_blockcount, + rmap->rm_flags & XFS_RMAP_UNWRITTEN, + &oinfo); +} + struct xfs_rmap_query_range_info { xfs_rmap_query_range_fn fn; void *priv; @@ -2453,3 +2483,56 @@ xfs_rmap_record_exists( irec.rm_startblock + irec.rm_blockcount >= bno + len); return 0; } + +struct xfs_rmap_key_state { + uint64_t owner; + uint64_t offset; + unsigned int flags; + bool has_rmap; +}; + +/* For each rmap given, figure out if it doesn't match the key we want. */ +STATIC int +xfs_rmap_has_other_keys_helper( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_rmap_key_state *rks = priv; + + if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset && + ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) + return 0; + rks->has_rmap = true; + return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* + * Given an extent and some owner info, can we find records overlapping + * the extent whose owner info does not match the given owner? + */ +int +xfs_rmap_has_other_keys( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo, + bool *has_rmap) +{ + struct xfs_rmap_irec low = {0}; + struct xfs_rmap_irec high; + struct xfs_rmap_key_state rks; + int error; + + xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags); + rks.has_rmap = false; + + low.rm_startblock = bno; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = bno + len - 1; + + error = xfs_rmap_query_range(cur, &low, &high, + xfs_rmap_has_other_keys_helper, &rks); + *has_rmap = rks.has_rmap; + return error; +} diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 380e53be98d5..43e506f67680 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -238,5 +238,9 @@ int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, struct xfs_owner_info *oinfo, bool *has_rmap); +int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, struct xfs_owner_info *oinfo, + bool *has_rmap); +int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap); #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 106be2d0bb88..369eeb7a52ec 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -90,6 +90,9 @@ xfs_rtbuf_get( if (error) return error; + if (nmap == 0 || !xfs_bmap_is_real_extent(&map)) + return -EFSCORRUPTED; + ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), @@ -1033,14 +1036,17 @@ xfs_rtalloc_query_range( int is_free; int error = 0; - if (low_rec->ar_startblock > high_rec->ar_startblock) + if (low_rec->ar_startext > high_rec->ar_startext) return -EINVAL; - else if (low_rec->ar_startblock == high_rec->ar_startblock) + if (low_rec->ar_startext >= mp->m_sb.sb_rextents || + low_rec->ar_startext == high_rec->ar_startext) return 0; + if (high_rec->ar_startext >= mp->m_sb.sb_rextents) + high_rec->ar_startext = mp->m_sb.sb_rextents - 1; /* Iterate the bitmap, looking for discrepancies. */ - rtstart = low_rec->ar_startblock; - rem = high_rec->ar_startblock - rtstart; + rtstart = low_rec->ar_startext; + rem = high_rec->ar_startext - rtstart; while (rem) { /* Is the first block free? */ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, @@ -1050,13 +1056,13 @@ xfs_rtalloc_query_range( /* How long does the extent go for? */ error = xfs_rtfind_forw(mp, tp, rtstart, - high_rec->ar_startblock - 1, &rtend); + high_rec->ar_startext - 1, &rtend); if (error) break; if (is_free) { - rec.ar_startblock = rtstart; - rec.ar_blockcount = rtend - rtstart + 1; + rec.ar_startext = rtstart; + rec.ar_extcount = rtend - rtstart + 1; error = fn(tp, &rec, priv); if (error) @@ -1079,9 +1085,9 @@ xfs_rtalloc_query_all( { struct xfs_rtalloc_rec keys[2]; - keys[0].ar_startblock = 0; - keys[1].ar_startblock = tp->t_mountp->m_sb.sb_rblocks; - keys[0].ar_blockcount = keys[1].ar_blockcount = 0; + keys[0].ar_startext = 0; + keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1; + keys[0].ar_extcount = keys[1].ar_extcount = 0; return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index d9b94bd5f689..d485e14313c6 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -888,6 +888,109 @@ xfs_sync_sb( return xfs_trans_commit(tp); } +/* + * Update all the secondary superblocks to match the new state of the primary. + * Because we are completely overwriting all the existing fields in the + * secondary superblock buffers, there is no need to read them in from disk. + * Just get a new buffer, stamp it and write it. + * + * The sb buffers need to be cached here so that we serialise against other + * operations that access the secondary superblocks, but we don't want to keep + * them in memory once it is written so we mark it as a one-shot buffer. + */ +int +xfs_update_secondary_sbs( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + int saved_error = 0; + int error = 0; + LIST_HEAD (buffer_list); + + /* update secondary superblocks. */ + for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) { + struct xfs_buf *bp; + + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_SB_DADDR), + XFS_FSS_TO_BB(mp, 1), 0); + /* + * If we get an error reading or writing alternate superblocks, + * continue. xfs_repair chooses the "best" superblock based + * on most matches; if we break early, we'll leave more + * superblocks un-updated than updated, and xfs_repair may + * pick them over the properly-updated primary. + */ + if (!bp) { + xfs_warn(mp, + "error allocating secondary superblock for ag %d", + agno); + if (!saved_error) + saved_error = -ENOMEM; + continue; + } + + bp->b_ops = &xfs_sb_buf_ops; + xfs_buf_oneshot(bp); + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_buf_delwri_queue(bp, &buffer_list); + xfs_buf_relse(bp); + + /* don't hold too many buffers at once */ + if (agno % 16) + continue; + + error = xfs_buf_delwri_submit(&buffer_list); + if (error) { + xfs_warn(mp, + "write error %d updating a secondary superblock near ag %d", + error, agno); + if (!saved_error) + saved_error = error; + continue; + } + } + error = xfs_buf_delwri_submit(&buffer_list); + if (error) { + xfs_warn(mp, + "write error %d updating a secondary superblock near ag %d", + error, agno); + } + + return saved_error ? saved_error : error; +} + +/* + * Same behavior as xfs_sync_sb, except that it is always synchronous and it + * also writes the superblock buffer to disk sector 0 immediately. + */ +int +xfs_sync_sb_buf( + struct xfs_mount *mp) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp); + if (error) + return error; + + xfs_log_sb(tp); + xfs_trans_bhold(tp, mp->m_sb_bp); + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp); + if (error) + goto out; + /* + * write out the sb buffer to get the changes to disk + */ + error = xfs_bwrite(mp->m_sb_bp); +out: + xfs_buf_relse(mp->m_sb_bp); + return error; +} + int xfs_fs_geometry( struct xfs_sb *sbp, @@ -972,3 +1075,47 @@ xfs_fs_geometry( return 0; } + +/* Read a secondary superblock. */ +int +xfs_sb_read_secondary( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf **bpp) +{ + struct xfs_buf *bp; + int error; + + ASSERT(agno != 0 && agno != NULLAGNUMBER); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); + if (error) + return error; + xfs_buf_set_ref(bp, XFS_SSB_REF); + *bpp = bp; + return 0; +} + +/* Get an uninitialised secondary superblock buffer. */ +int +xfs_sb_get_secondary( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf **bpp) +{ + struct xfs_buf *bp; + + ASSERT(agno != 0 && agno != NULLAGNUMBER); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (!bp) + return -ENOMEM; + bp->b_ops = &xfs_sb_buf_ops; + xfs_buf_oneshot(bp); + *bpp = bp; + return 0; +} diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 63dcd2a1a657..244e0162c49e 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -18,6 +18,13 @@ #ifndef __XFS_SB_H__ #define __XFS_SB_H__ +struct xfs_mount; +struct xfs_sb; +struct xfs_dsb; +struct xfs_trans; +struct xfs_fsop_geom; +struct xfs_perag; + /* * perag get/put wrappers for ref counting */ @@ -29,13 +36,22 @@ extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); extern void xfs_log_sb(struct xfs_trans *tp); extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); +extern int xfs_sync_sb_buf(struct xfs_mount *mp); extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); +extern int xfs_update_secondary_sbs(struct xfs_mount *mp); + #define XFS_FS_GEOM_MAX_STRUCT_VER (4) extern int xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo, int struct_version); +extern int xfs_sb_read_secondary(struct xfs_mount *mp, + struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_buf **bpp); +extern int xfs_sb_get_secondary(struct xfs_mount *mp, + struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_buf **bpp); #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index d0b84da0cb1e..ae99c260adb1 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -57,21 +57,6 @@ extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; extern const struct xfs_buf_ops xfs_rtbuf_ops; -/* - * This structure is used to track log items associated with - * a transaction. It points to the log item and keeps some - * flags to track the state of the log item. It also tracks - * the amount of space needed to log the item it describes - * once we get to commit processing (see xfs_trans_commit()). - */ -struct xfs_log_item_desc { - struct xfs_log_item *lid_item; - struct list_head lid_trans; - unsigned char lid_flags; -}; - -#define XFS_LID_DIRTY 0x1 - /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); @@ -127,6 +112,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_ATTR_BTREE_REF 1 #define XFS_DQUOT_REF 1 #define XFS_REFC_BTREE_REF 1 +#define XFS_SSB_REF 0 /* * Flags for xfs_trans_ichgtime(). diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 3c560695c546..ea18449bd732 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -30,7 +30,7 @@ typedef int64_t xfs_fsize_t; /* bytes in a file */ typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */ -typedef int32_t xfs_rtword_t; /* word type for bitmap manipulations */ +typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */ typedef int64_t xfs_lsn_t; /* log sequence number */ typedef int32_t xfs_tid_t; /* transaction identifier */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 018aabbd9394..1f71793f7db4 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -38,68 +38,6 @@ #include "scrub/common.h" #include "scrub/trace.h" -/* - * Walk all the blocks in the AGFL. The fn function can return any negative - * error code or XFS_BTREE_QUERY_RANGE_ABORT. - */ -int -xfs_scrub_walk_agfl( - struct xfs_scrub_context *sc, - int (*fn)(struct xfs_scrub_context *, - xfs_agblock_t bno, void *), - void *priv) -{ - struct xfs_agf *agf; - __be32 *agfl_bno; - struct xfs_mount *mp = sc->mp; - unsigned int flfirst; - unsigned int fllast; - int i; - int error; - - agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, sc->sa.agfl_bp); - flfirst = be32_to_cpu(agf->agf_flfirst); - fllast = be32_to_cpu(agf->agf_fllast); - - /* Nothing to walk in an empty AGFL. */ - if (agf->agf_flcount == cpu_to_be32(0)) - return 0; - - /* first to last is a consecutive list. */ - if (fllast >= flfirst) { - for (i = flfirst; i <= fllast; i++) { - error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); - if (error) - return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; - } - - return 0; - } - - /* first to the end */ - for (i = flfirst; i < xfs_agfl_size(mp); i++) { - error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); - if (error) - return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; - } - - /* the start to last. */ - for (i = 0; i <= fllast; i++) { - error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); - if (error) - return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; - } - - return 0; -} - /* Superblock */ /* Cross-reference with the other btrees. */ @@ -157,9 +95,7 @@ xfs_scrub_superblock( if (agno == 0) return 0; - error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); + error = xfs_sb_read_secondary(mp, sc->tp, agno, &bp); /* * The superblock verifier can return several different error codes * if it thinks the superblock doesn't look right. For a mount these @@ -680,6 +616,7 @@ struct xfs_scrub_agfl_info { unsigned int sz_entries; unsigned int nr_entries; xfs_agblock_t *entries; + struct xfs_scrub_context *sc; }; /* Cross-reference with the other btrees. */ @@ -701,12 +638,12 @@ xfs_scrub_agfl_block_xref( /* Scrub an AGFL block. */ STATIC int xfs_scrub_agfl_block( - struct xfs_scrub_context *sc, + struct xfs_mount *mp, xfs_agblock_t agbno, void *priv) { - struct xfs_mount *mp = sc->mp; struct xfs_scrub_agfl_info *sai = priv; + struct xfs_scrub_context *sc = sai->sc; xfs_agnumber_t agno = sc->sa.agno; if (xfs_verify_agbno(mp, agno, agbno) && @@ -717,6 +654,9 @@ xfs_scrub_agfl_block( xfs_scrub_agfl_block_xref(sc, agbno, priv); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return XFS_BTREE_QUERY_RANGE_ABORT; + return 0; } @@ -796,8 +736,10 @@ xfs_scrub_agfl( goto out; } memset(&sai, 0, sizeof(sai)); + sai.sc = sc; sai.sz_entries = agflcount; - sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS); + sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, + KM_MAYFAIL); if (!sai.entries) { error = -ENOMEM; goto out; @@ -805,7 +747,12 @@ xfs_scrub_agfl( /* Check the blocks in the AGFL. */ xfs_rmap_ag_owner(&sai.oinfo, XFS_RMAP_OWN_AG); - error = xfs_scrub_walk_agfl(sc, xfs_scrub_agfl_block, &sai); + error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), + sc->sa.agfl_bp, xfs_scrub_agfl_block, &sai); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + error = 0; + goto out_free; + } if (error) goto out_free; diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c new file mode 100644 index 000000000000..8b91e9ebe1e7 --- /dev/null +++ b/fs/xfs/scrub/agheader_repair.c @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2018 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* Superblock */ + +/* Repair the superblock. */ +int +xfs_repair_superblock( + struct xfs_scrub_context *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + xfs_agnumber_t agno; + int error; + + /* Don't try to repair AG 0's sb; let xfs_repair deal with it. */ + agno = sc->sm->sm_agno; + if (agno == 0) + return -EOPNOTSUPP; + + error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp); + if (error) + return error; + + /* Copy AG 0's superblock to this one. */ + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + + /* Write this to disk. */ + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); + xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); + return error; +} diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 517c079d3f68..941a0a55224e 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -70,7 +70,7 @@ xfs_scrub_allocbt_xref_other( pcur = &sc->sa.cnt_cur; else pcur = &sc->sa.bno_cur; - if (!*pcur) + if (!*pcur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_alloc_lookup_le(*pcur, agbno, len, &has_otherrec); @@ -172,7 +172,7 @@ xfs_scrub_xref_is_used_space( bool is_freesp; int error; - if (!sc->sa.bno_cur) + if (!sc->sa.bno_cur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp); diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 127575f0abfb..84b6d6b66578 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -126,8 +126,9 @@ xfs_scrub_xattr_listent( if (args.valuelen != valuelen) xfs_scrub_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - fail_xref: + if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + context->seen_enough = 1; return; } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 639d14b51e90..eeadb33a701c 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -51,7 +51,6 @@ xfs_scrub_setup_inode_bmap( struct xfs_scrub_context *sc, struct xfs_inode *ip) { - struct xfs_mount *mp = sc->mp; int error; error = xfs_scrub_get_inode(sc, ip); @@ -75,7 +74,7 @@ xfs_scrub_setup_inode_bmap( } /* Got the inode, lock it and we're ready to go. */ - error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + error = xfs_scrub_trans_alloc(sc, 0); if (error) goto out; sc->ilock_flags |= XFS_ILOCK_EXCL; @@ -175,7 +174,7 @@ xfs_scrub_bmap_xref_rmap( unsigned long long rmap_end; uint64_t owner; - if (!info->sc->sa.rmap_cur) + if (!info->sc->sa.rmap_cur || xfs_scrub_skip_xref(info->sc->sm)) return; if (info->whichfork == XFS_COW_FORK) @@ -684,7 +683,8 @@ xfs_scrub_bmap( info.lastoff = 0; ifp = XFS_IFORK_PTR(ip, whichfork); for_each_xfs_iext(ifp, &icur, &irec) { - if (xfs_scrub_should_terminate(sc, &error)) + if (xfs_scrub_should_terminate(sc, &error) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) break; if (isnullstartblock(irec.br_startblock)) continue; diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 54218168c8f9..2d29dceaa00e 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -442,7 +442,7 @@ xfs_scrub_btree_check_owner( */ if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { co = kmem_alloc(sizeof(struct check_owner), - KM_MAYFAIL | KM_NOFS); + KM_MAYFAIL); if (!co) return -ENOMEM; co->level = level; @@ -455,6 +455,44 @@ xfs_scrub_btree_check_owner( } /* + * Check that this btree block has at least minrecs records or is one of the + * special blocks that don't require that. + */ +STATIC void +xfs_scrub_btree_check_minrecs( + struct xfs_scrub_btree *bs, + int level, + struct xfs_btree_block *block) +{ + unsigned int numrecs; + int ok_level; + + numrecs = be16_to_cpu(block->bb_numrecs); + + /* More records than minrecs means the block is ok. */ + if (numrecs >= bs->cur->bc_ops->get_minrecs(bs->cur, level)) + return; + + /* + * Certain btree blocks /can/ have fewer than minrecs records. Any + * level greater than or equal to the level of the highest dedicated + * btree block are allowed to violate this constraint. + * + * For a btree rooted in a block, the btree root can have fewer than + * minrecs records. If the btree is rooted in an inode and does not + * store records in the root, the direct children of the root and the + * root itself can have fewer than minrecs records. + */ + ok_level = bs->cur->bc_nlevels - 1; + if (bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + ok_level--; + if (level >= ok_level) + return; + + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level); +} + +/* * Grab and scrub a btree block given a btree pointer. Returns block * and buffer pointers (if applicable) if they're ok to use. */ @@ -491,6 +529,8 @@ xfs_scrub_btree_get_block( if (*pbp) xfs_scrub_buffer_recheck(bs->sc, *pbp); + xfs_scrub_btree_check_minrecs(bs, level, *pblock); + /* * Check the block's owner; this function absorbs error codes * for us. diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 8ed91d5c868d..41198a5f872c 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -44,11 +44,14 @@ #include "xfs_rmap_btree.h" #include "xfs_log.h" #include "xfs_trans_priv.h" +#include "xfs_attr.h" +#include "xfs_reflink.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/btree.h" +#include "scrub/repair.h" /* Common code for the metadata scrubbers. */ @@ -539,6 +542,10 @@ xfs_scrub_ag_free( xfs_trans_brelse(sc->tp, sa->agi_bp); sa->agi_bp = NULL; } + if (sa->pag) { + xfs_perag_put(sa->pag); + sa->pag = NULL; + } sa->agno = NULLAGNUMBER; } @@ -566,15 +573,53 @@ xfs_scrub_ag_init( return xfs_scrub_ag_btcur_init(sc, sa); } +/* + * Grab the per-ag structure if we haven't already gotten it. Teardown of the + * xfs_scrub_ag will release it for us. + */ +void +xfs_scrub_perag_get( + struct xfs_mount *mp, + struct xfs_scrub_ag *sa) +{ + if (!sa->pag) + sa->pag = xfs_perag_get(mp, sa->agno); +} + /* Per-scrubber setup functions */ +/* + * Grab an empty transaction so that we can re-grab locked buffers if + * one of our btrees turns out to be cyclic. + * + * If we're going to repair something, we need to ask for the largest possible + * log reservation so that we can handle the worst case scenario for metadata + * updates while rebuilding a metadata item. We also need to reserve as many + * blocks in the head transaction as we think we're going to need to rebuild + * the metadata object. + */ +int +xfs_scrub_trans_alloc( + struct xfs_scrub_context *sc, + uint resblks) +{ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, + resblks, 0, 0, &sc->tp); + + return xfs_trans_alloc_empty(sc->mp, &sc->tp); +} + /* Set us up with a transaction and an empty context. */ int xfs_scrub_setup_fs( struct xfs_scrub_context *sc, struct xfs_inode *ip) { - return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp); + uint resblks; + + resblks = xfs_repair_calc_ag_resblks(sc); + return xfs_scrub_trans_alloc(sc, resblks); } /* Set us up with AG headers and btree cursors. */ @@ -695,7 +740,6 @@ xfs_scrub_setup_inode_contents( struct xfs_inode *ip, unsigned int resblks) { - struct xfs_mount *mp = sc->mp; int error; error = xfs_scrub_get_inode(sc, ip); @@ -705,7 +749,7 @@ xfs_scrub_setup_inode_contents( /* Got the inode, lock it and we're ready to go. */ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; xfs_ilock(sc->ip, sc->ilock_flags); - error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + error = xfs_scrub_trans_alloc(sc, resblks); if (error) goto out; sc->ilock_flags |= XFS_ILOCK_EXCL; @@ -727,6 +771,10 @@ xfs_scrub_should_check_xref( int *error, struct xfs_btree_cur **curpp) { + /* No point in xref if we already know we're corrupt. */ + if (xfs_scrub_skip_xref(sc->sm)) + return false; + if (*error == 0) return true; @@ -773,3 +821,80 @@ xfs_scrub_buffer_recheck( sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; trace_xfs_scrub_block_error(sc, bp->b_bn, fa); } + +/* + * Scrub the attr/data forks of a metadata inode. The metadata inode must be + * pointed to by sc->ip and the ILOCK must be held. + */ +int +xfs_scrub_metadata_inode_forks( + struct xfs_scrub_context *sc) +{ + __u32 smtype; + bool shared; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Metadata inodes don't live on the rt device. */ + if (sc->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + /* They should never participate in reflink. */ + if (xfs_is_reflink_inode(sc->ip)) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + /* They also should never have extended attributes. */ + if (xfs_inode_hasattr(sc->ip)) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + /* Invoke the data fork scrubber. */ + smtype = sc->sm->sm_type; + sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD; + error = xfs_scrub_bmap_data(sc); + sc->sm->sm_type = smtype; + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + /* Look for incorrect shared blocks. */ + if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) { + error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, + &shared); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + if (shared) + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); + } + + return error; +} + +/* + * Try to lock an inode in violation of the usual locking order rules. For + * example, trying to get the IOLOCK while in transaction context, or just + * plain breaking AG-order or inode-order inode locking rules. Either way, + * the only way to avoid an ABBA deadlock is to use trylock and back off if + * we can't. + */ +int +xfs_scrub_ilock_inverted( + struct xfs_inode *ip, + uint lock_mode) +{ + int i; + + for (i = 0; i < 20; i++) { + if (xfs_ilock_nowait(ip, lock_mode)) + return 0; + delay(1); + } + return -EDEADLOCK; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index deaf60400981..76bb2d1d808c 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -38,19 +38,7 @@ xfs_scrub_should_terminate( return false; } -/* - * Grab an empty transaction so that we can re-grab locked buffers if - * one of our btrees turns out to be cyclic. - */ -static inline int -xfs_scrub_trans_alloc( - struct xfs_scrub_metadata *sm, - struct xfs_mount *mp, - struct xfs_trans **tpp) -{ - return xfs_trans_alloc_empty(mp, tpp); -} - +int xfs_scrub_trans_alloc(struct xfs_scrub_context *sc, uint resblks); bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int *error); bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork, @@ -135,16 +123,13 @@ xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip) void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa); int xfs_scrub_ag_init(struct xfs_scrub_context *sc, xfs_agnumber_t agno, struct xfs_scrub_ag *sa); +void xfs_scrub_perag_get(struct xfs_mount *mp, struct xfs_scrub_ag *sa); int xfs_scrub_ag_read_headers(struct xfs_scrub_context *sc, xfs_agnumber_t agno, struct xfs_buf **agi, struct xfs_buf **agf, struct xfs_buf **agfl); void xfs_scrub_ag_btcur_free(struct xfs_scrub_ag *sa); int xfs_scrub_ag_btcur_init(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa); -int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc, - int (*fn)(struct xfs_scrub_context *, xfs_agblock_t bno, - void *), - void *priv); int xfs_scrub_count_rmap_ownedby_ag(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, struct xfs_owner_info *oinfo, @@ -157,4 +142,17 @@ int xfs_scrub_setup_inode_contents(struct xfs_scrub_context *sc, struct xfs_inode *ip, unsigned int resblks); void xfs_scrub_buffer_recheck(struct xfs_scrub_context *sc, struct xfs_buf *bp); +/* + * Don't bother cross-referencing if we already found corruption or cross + * referencing discrepancies. + */ +static inline bool xfs_scrub_skip_xref(struct xfs_scrub_metadata *sm) +{ + return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT); +} + +int xfs_scrub_metadata_inode_forks(struct xfs_scrub_context *sc); +int xfs_scrub_ilock_inverted(struct xfs_inode *ip, uint lock_mode); + #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 38f29806eb54..1a4309b3e786 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -172,7 +172,7 @@ xfs_scrub_dir_actor( error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL); if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, &error)) - goto fail_xref; + goto out; if (lookup_ino != ino) { xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); goto out; @@ -183,8 +183,13 @@ xfs_scrub_dir_actor( if (error) goto out; out: - return error; -fail_xref: + /* + * A negative error code returned here is supposed to cause the + * dir_emit caller (xfs_readdir) to abort the directory iteration + * and return zero to xfs_scrub_directory. + */ + if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -EFSCORRUPTED; return error; } @@ -240,6 +245,9 @@ xfs_scrub_dir_rec( } xfs_scrub_buffer_recheck(ds->sc, bp); + if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_relse; + dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off); /* Make sure we got a real directory entry. */ @@ -357,6 +365,9 @@ xfs_scrub_directory_data_bestfree( /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_buf; + /* Do the bestfrees correspond to actual free space? */ bf = d_ops->data_bestfree_p(bp->b_addr); smallest_bestfree = UINT_MAX; @@ -413,14 +424,18 @@ xfs_scrub_directory_data_bestfree( /* Spot check this free entry */ tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)); - if (tag != ((char *)dup - (char *)bp->b_addr)) + if (tag != ((char *)dup - (char *)bp->b_addr)) { xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } /* * Either this entry is a bestfree or it's smaller than * any of the bestfrees. */ xfs_scrub_directory_check_free_entry(sc, lblk, bf, dup); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_buf; /* Move on. */ newlen = be16_to_cpu(dup->length); @@ -546,6 +561,8 @@ xfs_scrub_directory_leaf1_bestfree( } if (leafhdr.stale != stale) xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; /* Check all the bestfree entries. */ for (i = 0; i < bestcount; i++, bestp++) { @@ -556,9 +573,11 @@ xfs_scrub_directory_leaf1_bestfree( i * args->geo->fsbcount, -1, &dbp); if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - continue; + break; xfs_scrub_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; } out: return error; @@ -607,7 +626,7 @@ xfs_scrub_directory_free_bestfree( -1, &dbp); if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - continue; + break; xfs_scrub_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); } @@ -656,7 +675,7 @@ xfs_scrub_directory_blocks( /* Iterate all the data extents in the directory... */ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); - while (found) { + while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { /* Block directories only have a single block at offset 0. */ if (is_block && (got.br_startoff > 0 || @@ -719,7 +738,7 @@ xfs_scrub_directory_blocks( /* Scan for free blocks */ lblk = free_lblk; found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); - while (found) { + while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { /* * Dirs can't have blocks mapped above 2^32. * Single-block dirs shouldn't even be here. diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 106ca4bd753f..00a834d3b56d 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -387,7 +387,8 @@ xfs_scrub_iallocbt_xref_rmap_btreeblks( int error; if (!sc->sa.ino_cur || !sc->sa.rmap_cur || - (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur)) + (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur) || + xfs_scrub_skip_xref(sc->sm)) return; /* Check that we saw as many inobt blocks as the rmap says. */ @@ -424,7 +425,7 @@ xfs_scrub_iallocbt_xref_rmap_inodes( xfs_filblks_t blocks; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; /* Check that we saw as many inode blocks as the rmap knows about. */ @@ -496,7 +497,7 @@ xfs_scrub_xref_inode_check( bool has_inodes; int error; - if (!(*icur)) + if (!(*icur) || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes); diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index df14930e4fc5..0c696f7018de 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -55,7 +55,6 @@ xfs_scrub_setup_inode( struct xfs_scrub_context *sc, struct xfs_inode *ip) { - struct xfs_mount *mp = sc->mp; int error; /* @@ -68,7 +67,7 @@ xfs_scrub_setup_inode( break; case -EFSCORRUPTED: case -EFSBADCRC: - return xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + return xfs_scrub_trans_alloc(sc, 0); default: return error; } @@ -76,7 +75,7 @@ xfs_scrub_setup_inode( /* Got the inode, lock it and we're ready to go. */ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; xfs_ilock(sc->ip, sc->ilock_flags); - error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + error = xfs_scrub_trans_alloc(sc, 0); if (error) goto out; sc->ilock_flags |= XFS_ILOCK_EXCL; @@ -449,7 +448,7 @@ xfs_scrub_inode_xref_finobt( int has_record; int error; - if (!sc->sa.fino_cur) + if (!sc->sa.fino_cur || xfs_scrub_skip_xref(sc->sm)) return; agino = XFS_INO_TO_AGINO(sc->mp, ino); @@ -492,6 +491,9 @@ xfs_scrub_inode_xref_bmap( xfs_filblks_t acount; int error; + if (xfs_scrub_skip_xref(sc->sm)) + return; + /* Walk all the extents to check nextents/naextents/nblocks. */ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, &nextents, &count); diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 1fb88c18d455..77c6b22c6bfd 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -147,6 +147,9 @@ xfs_scrub_parent_validate( *try_again = false; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + /* '..' must not point to ourselves. */ if (sc->ip->i_ino == dnum) { xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); @@ -211,7 +214,9 @@ xfs_scrub_parent_validate( */ xfs_iunlock(sc->ip, sc->ilock_flags); sc->ilock_flags = 0; - xfs_ilock(dp, XFS_IOLOCK_SHARED); + error = xfs_scrub_ilock_inverted(dp, XFS_IOLOCK_SHARED); + if (error) + goto out_rele; /* Go looking for our dentry. */ error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink); @@ -220,8 +225,10 @@ xfs_scrub_parent_validate( /* Drop the parent lock, relock this inode. */ xfs_iunlock(dp, XFS_IOLOCK_SHARED); + error = xfs_scrub_ilock_inverted(sc->ip, XFS_IOLOCK_EXCL); + if (error) + goto out_rele; sc->ilock_flags = XFS_IOLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); /* * If we're an unlinked directory, the parent /won't/ have a link @@ -323,5 +330,13 @@ xfs_scrub_parent( if (try_again && tries == 20) xfs_scrub_set_incomplete(sc); out: + /* + * If we failed to lock the parent inode even after a retry, just mark + * this scrub incomplete and return. + */ + if (sc->try_harder && error == -EDEADLOCK) { + error = 0; + xfs_scrub_set_incomplete(sc); + } return error; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 6ba465e6c885..15ae4d23d6ac 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -66,25 +66,43 @@ xfs_scrub_setup_quota( struct xfs_inode *ip) { uint dqtype; + int error; + + if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp)) + return -ENOENT; dqtype = xfs_scrub_quota_to_dqtype(sc); if (dqtype == 0) return -EINVAL; + sc->has_quotaofflock = true; + mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); if (!xfs_this_quota_on(sc->mp, dqtype)) return -ENOENT; + error = xfs_scrub_setup_fs(sc, ip); + if (error) + return error; + sc->ip = xfs_quota_inode(sc->mp, dqtype); + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + sc->ilock_flags = XFS_ILOCK_EXCL; return 0; } /* Quotas. */ +struct xfs_scrub_quota_info { + struct xfs_scrub_context *sc; + xfs_dqid_t last_id; +}; + /* Scrub the fields in an individual quota item. */ -STATIC void +STATIC int xfs_scrub_quota_item( - struct xfs_scrub_context *sc, - uint dqtype, struct xfs_dquot *dq, - xfs_dqid_t id) + uint dqtype, + void *priv) { + struct xfs_scrub_quota_info *sqi = priv; + struct xfs_scrub_context *sc = sqi->sc; struct xfs_mount *mp = sc->mp; struct xfs_disk_dquot *d = &dq->q_core; struct xfs_quotainfo *qi = mp->m_quotainfo; @@ -99,17 +117,18 @@ xfs_scrub_quota_item( unsigned long long icount; unsigned long long rcount; xfs_ino_t fs_icount; - - offset = id / qi->qi_dqperchunk; + xfs_dqid_t id = be32_to_cpu(d->d_id); /* - * We fed $id and DQNEXT into the xfs_qm_dqget call, which means - * that the actual dquot we got must either have the same id or - * the next higher id. + * Except for the root dquot, the actual dquot we got must either have + * the same or higher id as we saw before. */ - if (id > be32_to_cpu(d->d_id)) + offset = id / qi->qi_dqperchunk; + if (id && id <= sqi->last_id) xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + sqi->last_id = id; + /* Did we get the dquot type we wanted? */ if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES)) xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); @@ -183,115 +202,85 @@ xfs_scrub_quota_item( xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); if (id != 0 && rhard != 0 && rcount > rhard) xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); + + return 0; } -/* Scrub all of a quota type's items. */ -int -xfs_scrub_quota( +/* Check the quota's data fork. */ +STATIC int +xfs_scrub_quota_data_fork( struct xfs_scrub_context *sc) { struct xfs_bmbt_irec irec = { 0 }; - struct xfs_mount *mp = sc->mp; - struct xfs_inode *ip; - struct xfs_quotainfo *qi = mp->m_quotainfo; - struct xfs_dquot *dq; + struct xfs_iext_cursor icur; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_ifork *ifp; xfs_fileoff_t max_dqid_off; - xfs_fileoff_t off = 0; - xfs_dqid_t id = 0; - uint dqtype; - int nimaps; int error = 0; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) - return -ENOENT; - - mutex_lock(&qi->qi_quotaofflock); - dqtype = xfs_scrub_quota_to_dqtype(sc); - if (!xfs_this_quota_on(sc->mp, dqtype)) { - error = -ENOENT; - goto out_unlock_quota; - } - - /* Attach to the quota inode and set sc->ip so that reporting works. */ - ip = xfs_quota_inode(sc->mp, dqtype); - sc->ip = ip; + /* Invoke the fork scrubber. */ + error = xfs_scrub_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; - /* Look for problem extents. */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) { - xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); - goto out_unlock_inode; - } + /* Check for data fork problems that apply only to quota files. */ max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; - while (1) { + ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); + for_each_xfs_iext(ifp, &icur, &irec) { if (xfs_scrub_should_terminate(sc, &error)) break; - - off = irec.br_startoff + irec.br_blockcount; - nimaps = 1; - error = xfs_bmapi_read(ip, off, -1, &irec, &nimaps, - XFS_BMAPI_ENTIRE); - if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, off, - &error)) - goto out_unlock_inode; - if (!nimaps) - break; - if (irec.br_startblock == HOLESTARTBLOCK) - continue; - - /* Check the extent record doesn't point to crap. */ - if (irec.br_startblock + irec.br_blockcount <= - irec.br_startblock) - xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, - irec.br_startoff); - if (!xfs_verify_fsbno(mp, irec.br_startblock) || - !xfs_verify_fsbno(mp, irec.br_startblock + - irec.br_blockcount - 1)) - xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, - irec.br_startoff); - /* - * Unwritten extents or blocks mapped above the highest + * delalloc extents or blocks mapped above the highest * quota id shouldn't happen. */ if (isnullstartblock(irec.br_startblock) || irec.br_startoff > max_dqid_off || - irec.br_startoff + irec.br_blockcount > max_dqid_off + 1) - xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, + irec.br_startoff); + break; + } } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; - /* Check all the quota items. */ - while (id < ((xfs_dqid_t)-1ULL)) { - if (xfs_scrub_should_terminate(sc, &error)) - break; + return error; +} - error = xfs_qm_dqget(mp, NULL, id, dqtype, XFS_QMOPT_DQNEXT, - &dq); - if (error == -ENOENT) - break; - if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, - id * qi->qi_dqperchunk, &error)) - break; +/* Scrub all of a quota type's items. */ +int +xfs_scrub_quota( + struct xfs_scrub_context *sc) +{ + struct xfs_scrub_quota_info sqi; + struct xfs_mount *mp = sc->mp; + struct xfs_quotainfo *qi = mp->m_quotainfo; + uint dqtype; + int error = 0; - xfs_scrub_quota_item(sc, dqtype, dq, id); + dqtype = xfs_scrub_quota_to_dqtype(sc); - id = be32_to_cpu(dq->q_core.d_id) + 1; - xfs_qm_dqput(dq); - if (!id) - break; - } + /* Look for problem extents. */ + error = xfs_scrub_quota_data_fork(sc); + if (error) + goto out; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* + * Check all the quota items. Now that we've checked the quota inode + * data fork we have to drop ILOCK_EXCL to use the regular dquot + * functions. + */ + xfs_iunlock(sc->ip, sc->ilock_flags); + sc->ilock_flags = 0; + sqi.sc = sc; + sqi.last_id = 0; + error = xfs_qm_dqiterate(mp, dqtype, xfs_scrub_quota_item, &sqi); + sc->ilock_flags = XFS_ILOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, + sqi.last_id * qi->qi_dqperchunk, &error)) + goto out; out: - /* We set sc->ip earlier, so make sure we clear it now. */ - sc->ip = NULL; -out_unlock_quota: - mutex_unlock(&qi->qi_quotaofflock); return error; - -out_unlock_inode: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out; } diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 400f1561cd3d..324a5f159145 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -150,7 +150,7 @@ xfs_scrub_refcountbt_rmap_check( * so we don't need insertion sort here. */ frag = kmem_alloc(sizeof(struct xfs_scrub_refcnt_frag), - KM_MAYFAIL | KM_NOFS); + KM_MAYFAIL); if (!frag) return -ENOMEM; memcpy(&frag->rm, rec, sizeof(frag->rm)); @@ -310,7 +310,7 @@ xfs_scrub_refcountbt_xref_rmap( struct xfs_scrub_refcnt_frag *n; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; /* Cross-reference with the rmapbt to confirm the refcount. */ @@ -404,7 +404,7 @@ xfs_scrub_refcount_xref_rmap( xfs_filblks_t blocks; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; /* Check that we saw as many refcbt blocks as the rmap knows about. */ @@ -460,7 +460,7 @@ xfs_scrub_xref_is_cow_staging( int has_refcount; int error; - if (!sc->sa.refc_cur) + if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm)) return; /* Find the CoW staging extent. */ @@ -504,7 +504,7 @@ xfs_scrub_xref_is_not_shared( bool shared; int error; - if (!sc->sa.refc_cur) + if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared); diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c new file mode 100644 index 000000000000..e3e8fba1c99c --- /dev/null +++ b/fs/xfs/scrub/repair.c @@ -0,0 +1,1089 @@ +/* + * Copyright (C) 2018 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_extent_busy.h" +#include "xfs_ag_resv.h" +#include "xfs_trans_space.h" +#include "xfs_quota.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* + * Attempt to repair some metadata, if the metadata is corrupt and userspace + * told us to fix it. This function returns -EAGAIN to mean "re-run scrub", + * and will set *fixed to true if it thinks it repaired anything. + */ +int +xfs_repair_attempt( + struct xfs_inode *ip, + struct xfs_scrub_context *sc, + bool *fixed) +{ + int error = 0; + + trace_xfs_repair_attempt(ip, sc->sm, error); + + xfs_scrub_ag_btcur_free(&sc->sa); + + /* Repair whatever's broken. */ + ASSERT(sc->ops->repair); + error = sc->ops->repair(sc); + trace_xfs_repair_done(ip, sc->sm, error); + switch (error) { + case 0: + /* + * Repair succeeded. Commit the fixes and perform a second + * scrub so that we can tell userspace if we fixed the problem. + */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + *fixed = true; + return -EAGAIN; + case -EDEADLOCK: + case -EAGAIN: + /* Tell the caller to try again having grabbed all the locks. */ + if (!sc->try_harder) { + sc->try_harder = true; + return -EAGAIN; + } + /* + * We tried harder but still couldn't grab all the resources + * we needed to fix it. The corruption has not been fixed, + * so report back to userspace. + */ + return -EFSCORRUPTED; + default: + return error; + } +} + +/* + * Complain about unfixable problems in the filesystem. We don't log + * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver + * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the + * administrator isn't running xfs_scrub in no-repairs mode. + * + * Use this helper function because _ratelimited silently declares a static + * structure to track rate limiting information. + */ +void +xfs_repair_failure( + struct xfs_mount *mp) +{ + xfs_alert_ratelimited(mp, +"Corruption not fixed during online repair. Unmount and run xfs_repair."); +} + +/* + * Repair probe -- userspace uses this to probe if we're willing to repair a + * given mountpoint. + */ +int +xfs_repair_probe( + struct xfs_scrub_context *sc) +{ + int error = 0; + + if (xfs_scrub_should_terminate(sc, &error)) + return error; + + return 0; +} + +/* + * Roll a transaction, keeping the AG headers locked and reinitializing + * the btree cursors. + */ +int +xfs_repair_roll_ag_trans( + struct xfs_scrub_context *sc) +{ + int error; + + /* Keep the AG header buffers locked so we can keep going. */ + xfs_trans_bhold(sc->tp, sc->sa.agi_bp); + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + xfs_trans_bhold(sc->tp, sc->sa.agfl_bp); + + /* Roll the transaction. */ + error = xfs_trans_roll(&sc->tp); + if (error) + goto out_release; + + /* Join AG headers to the new transaction. */ + xfs_trans_bjoin(sc->tp, sc->sa.agi_bp); + xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); + xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp); + + return 0; + +out_release: + /* + * Rolling failed, so release the hold on the buffers. The + * buffers will be released during teardown on our way out + * of the kernel. + */ + xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp); + xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp); + xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp); + + return error; +} + +/* + * Does the given AG have enough space to rebuild a btree? Neither AG + * reservation can be critical, and we must have enough space (factoring + * in AG reservations) to construct a whole btree. + */ +bool +xfs_repair_ag_has_space( + struct xfs_perag *pag, + xfs_extlen_t nr_blocks, + enum xfs_ag_resv_type type) +{ + return !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) && + !xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) && + pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks; +} + +/* + * Figure out how many blocks to reserve for an AG repair. We calculate the + * worst case estimate for the number of blocks we'd need to rebuild one of + * any type of per-AG btree. + */ +xfs_extlen_t +xfs_repair_calc_ag_resblks( + struct xfs_scrub_context *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_scrub_metadata *sm = sc->sm; + struct xfs_perag *pag; + struct xfs_buf *bp; + xfs_agino_t icount = 0; + xfs_extlen_t aglen = 0; + xfs_extlen_t usedlen; + xfs_extlen_t freelen; + xfs_extlen_t bnobt_sz; + xfs_extlen_t inobt_sz; + xfs_extlen_t rmapbt_sz; + xfs_extlen_t refcbt_sz; + int error; + + if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) + return 0; + + /* Use in-core counters if possible. */ + pag = xfs_perag_get(mp, sm->sm_agno); + if (pag->pagi_init) + icount = pag->pagi_count; + + /* + * Otherwise try to get the actual counters from disk; if not, make + * some worst case assumptions. + */ + if (icount == 0) { + error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp); + if (error) { + icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock; + } else { + icount = pag->pagi_count; + xfs_buf_relse(bp); + } + } + + /* Now grab the block counters from the AGF. */ + error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp); + if (error) { + aglen = mp->m_sb.sb_agblocks; + freelen = aglen; + usedlen = aglen; + } else { + aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length); + freelen = pag->pagf_freeblks; + usedlen = aglen - freelen; + xfs_buf_relse(bp); + } + xfs_perag_put(pag); + + trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen, + freelen, usedlen); + + /* + * Figure out how many blocks we'd need worst case to rebuild + * each type of btree. Note that we can only rebuild the + * bnobt/cntbt or inobt/finobt as pairs. + */ + bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen); + if (xfs_sb_version_hassparseinodes(&mp->m_sb)) + inobt_sz = xfs_iallocbt_calc_size(mp, icount / + XFS_INODES_PER_HOLEMASK_BIT); + else + inobt_sz = xfs_iallocbt_calc_size(mp, icount / + XFS_INODES_PER_CHUNK); + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + inobt_sz *= 2; + if (xfs_sb_version_hasreflink(&mp->m_sb)) + refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen); + else + refcbt_sz = 0; + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + /* + * Guess how many blocks we need to rebuild the rmapbt. + * For non-reflink filesystems we can't have more records than + * used blocks. However, with reflink it's possible to have + * more than one rmap record per AG block. We don't know how + * many rmaps there could be in the AG, so we start off with + * what we hope is an generous over-estimation. + */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + rmapbt_sz = xfs_rmapbt_calc_size(mp, + (unsigned long long)aglen * 2); + else + rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen); + } else { + rmapbt_sz = 0; + } + + trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz, + inobt_sz, rmapbt_sz, refcbt_sz); + + return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz)); +} + +/* Allocate a block in an AG. */ +int +xfs_repair_alloc_ag_block( + struct xfs_scrub_context *sc, + struct xfs_owner_info *oinfo, + xfs_fsblock_t *fsbno, + enum xfs_ag_resv_type resv) +{ + struct xfs_alloc_arg args = {0}; + xfs_agblock_t bno; + int error; + + switch (resv) { + case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_RMAPBT: + error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1); + if (error) + return error; + if (bno == NULLAGBLOCK) + return -ENOSPC; + xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno, + 1, false); + *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno); + if (resv == XFS_AG_RESV_RMAPBT) + xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno); + return 0; + default: + break; + } + + args.tp = sc->tp; + args.mp = sc->mp; + args.oinfo = *oinfo; + args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0); + args.minlen = 1; + args.maxlen = 1; + args.prod = 1; + args.type = XFS_ALLOCTYPE_THIS_AG; + args.resv = resv; + + error = xfs_alloc_vextent(&args); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + ASSERT(args.len == 1); + *fsbno = args.fsbno; + + return 0; +} + +/* Initialize a new AG btree root block with zero entries. */ +int +xfs_repair_init_btblock( + struct xfs_scrub_context *sc, + xfs_fsblock_t fsb, + struct xfs_buf **bpp, + xfs_btnum_t btnum, + const struct xfs_buf_ops *ops) +{ + struct xfs_trans *tp = sc->tp; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + + trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb), + XFS_FSB_TO_AGBNO(mp, fsb), btnum); + + ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb), + XFS_FSB_TO_BB(mp, 1), 0); + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno, 0); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); + xfs_trans_log_buf(tp, bp, 0, bp->b_length); + bp->b_ops = ops; + *bpp = bp; + + return 0; +} + +/* + * Reconstructing per-AG Btrees + * + * When a space btree is corrupt, we don't bother trying to fix it. Instead, + * we scan secondary space metadata to derive the records that should be in + * the damaged btree, initialize a fresh btree root, and insert the records. + * Note that for rebuilding the rmapbt we scan all the primary data to + * generate the new records. + * + * However, that leaves the matter of removing all the metadata describing the + * old broken structure. For primary metadata we use the rmap data to collect + * every extent with a matching rmap owner (exlist); we then iterate all other + * metadata structures with the same rmap owner to collect the extents that + * cannot be removed (sublist). We then subtract sublist from exlist to + * derive the blocks that were used by the old btree. These blocks can be + * reaped. + * + * For rmapbt reconstructions we must use different tactics for extent + * collection. First we iterate all primary metadata (this excludes the old + * rmapbt, obviously) to generate new rmap records. The gaps in the rmap + * records are collected as exlist. The bnobt records are collected as + * sublist. As with the other btrees we subtract sublist from exlist, and the + * result (since the rmapbt lives in the free space) are the blocks from the + * old rmapbt. + */ + +/* Collect a dead btree extent for later disposal. */ +int +xfs_repair_collect_btree_extent( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + xfs_fsblock_t fsbno, + xfs_extlen_t len) +{ + struct xfs_repair_extent *rex; + + trace_xfs_repair_collect_btree_extent(sc->mp, + XFS_FSB_TO_AGNO(sc->mp, fsbno), + XFS_FSB_TO_AGBNO(sc->mp, fsbno), len); + + rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL); + if (!rex) + return -ENOMEM; + + INIT_LIST_HEAD(&rex->list); + rex->fsbno = fsbno; + rex->len = len; + list_add_tail(&rex->list, &exlist->list); + + return 0; +} + +/* + * An error happened during the rebuild so the transaction will be cancelled. + * The fs will shut down, and the administrator has to unmount and run repair. + * Therefore, free all the memory associated with the list so we can die. + */ +void +xfs_repair_cancel_btree_extents( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist) +{ + struct xfs_repair_extent *rex; + struct xfs_repair_extent *n; + + for_each_xfs_repair_extent_safe(rex, n, exlist) { + list_del(&rex->list); + kmem_free(rex); + } +} + +/* Compare two btree extents. */ +static int +xfs_repair_btree_extent_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_repair_extent *ap; + struct xfs_repair_extent *bp; + + ap = container_of(a, struct xfs_repair_extent, list); + bp = container_of(b, struct xfs_repair_extent, list); + + if (ap->fsbno > bp->fsbno) + return 1; + if (ap->fsbno < bp->fsbno) + return -1; + return 0; +} + +/* + * Remove all the blocks mentioned in @sublist from the extents in @exlist. + * + * The intent is that callers will iterate the rmapbt for all of its records + * for a given owner to generate @exlist; and iterate all the blocks of the + * metadata structures that are not being rebuilt and have the same rmapbt + * owner to generate @sublist. This routine subtracts all the extents + * mentioned in sublist from all the extents linked in @exlist, which leaves + * @exlist as the list of blocks that are not accounted for, which we assume + * are the dead blocks of the old metadata structure. The blocks mentioned in + * @exlist can be reaped. + */ +#define LEFT_ALIGNED (1 << 0) +#define RIGHT_ALIGNED (1 << 1) +int +xfs_repair_subtract_extents( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + struct xfs_repair_extent_list *sublist) +{ + struct list_head *lp; + struct xfs_repair_extent *ex; + struct xfs_repair_extent *newex; + struct xfs_repair_extent *subex; + xfs_fsblock_t sub_fsb; + xfs_extlen_t sub_len; + int state; + int error = 0; + + if (list_empty(&exlist->list) || list_empty(&sublist->list)) + return 0; + ASSERT(!list_empty(&sublist->list)); + + list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp); + list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp); + + /* + * Now that we've sorted both lists, we iterate exlist once, rolling + * forward through sublist and/or exlist as necessary until we find an + * overlap or reach the end of either list. We do not reset lp to the + * head of exlist nor do we reset subex to the head of sublist. The + * list traversal is similar to merge sort, but we're deleting + * instead. In this manner we avoid O(n^2) operations. + */ + subex = list_first_entry(&sublist->list, struct xfs_repair_extent, + list); + lp = exlist->list.next; + while (lp != &exlist->list) { + ex = list_entry(lp, struct xfs_repair_extent, list); + + /* + * Advance subex and/or ex until we find a pair that + * intersect or we run out of extents. + */ + while (subex->fsbno + subex->len <= ex->fsbno) { + if (list_is_last(&subex->list, &sublist->list)) + goto out; + subex = list_next_entry(subex, list); + } + if (subex->fsbno >= ex->fsbno + ex->len) { + lp = lp->next; + continue; + } + + /* trim subex to fit the extent we have */ + sub_fsb = subex->fsbno; + sub_len = subex->len; + if (subex->fsbno < ex->fsbno) { + sub_len -= ex->fsbno - subex->fsbno; + sub_fsb = ex->fsbno; + } + if (sub_len > ex->len) + sub_len = ex->len; + + state = 0; + if (sub_fsb == ex->fsbno) + state |= LEFT_ALIGNED; + if (sub_fsb + sub_len == ex->fsbno + ex->len) + state |= RIGHT_ALIGNED; + switch (state) { + case LEFT_ALIGNED: + /* Coincides with only the left. */ + ex->fsbno += sub_len; + ex->len -= sub_len; + break; + case RIGHT_ALIGNED: + /* Coincides with only the right. */ + ex->len -= sub_len; + lp = lp->next; + break; + case LEFT_ALIGNED | RIGHT_ALIGNED: + /* Total overlap, just delete ex. */ + lp = lp->next; + list_del(&ex->list); + kmem_free(ex); + break; + case 0: + /* + * Deleting from the middle: add the new right extent + * and then shrink the left extent. + */ + newex = kmem_alloc(sizeof(struct xfs_repair_extent), + KM_MAYFAIL); + if (!newex) { + error = -ENOMEM; + goto out; + } + INIT_LIST_HEAD(&newex->list); + newex->fsbno = sub_fsb + sub_len; + newex->len = ex->fsbno + ex->len - newex->fsbno; + list_add(&newex->list, &ex->list); + ex->len = sub_fsb - ex->fsbno; + lp = lp->next; + break; + default: + ASSERT(0); + break; + } + } + +out: + return error; +} +#undef LEFT_ALIGNED +#undef RIGHT_ALIGNED + +/* + * Disposal of Blocks from Old per-AG Btrees + * + * Now that we've constructed a new btree to replace the damaged one, we want + * to dispose of the blocks that (we think) the old btree was using. + * Previously, we used the rmapbt to collect the extents (exlist) with the + * rmap owner corresponding to the tree we rebuilt, collected extents for any + * blocks with the same rmap owner that are owned by another data structure + * (sublist), and subtracted sublist from exlist. In theory the extents + * remaining in exlist are the old btree's blocks. + * + * Unfortunately, it's possible that the btree was crosslinked with other + * blocks on disk. The rmap data can tell us if there are multiple owners, so + * if the rmapbt says there is an owner of this block other than @oinfo, then + * the block is crosslinked. Remove the reverse mapping and continue. + * + * If there is one rmap record, we can free the block, which removes the + * reverse mapping but doesn't add the block to the free space. Our repair + * strategy is to hope the other metadata objects crosslinked on this block + * will be rebuilt (atop different blocks), thereby removing all the cross + * links. + * + * If there are no rmap records at all, we also free the block. If the btree + * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't + * supposed to be a rmap record and everything is ok. For other btrees there + * had to have been an rmap entry for the block to have ended up on @exlist, + * so if it's gone now there's something wrong and the fs will shut down. + * + * Note: If there are multiple rmap records with only the same rmap owner as + * the btree we're trying to rebuild and the block is indeed owned by another + * data structure with the same rmap owner, then the block will be in sublist + * and therefore doesn't need disposal. If there are multiple rmap records + * with only the same rmap owner but the block is not owned by something with + * the same rmap owner, the block will be freed. + * + * The caller is responsible for locking the AG headers for the entire rebuild + * operation so that nothing else can sneak in and change the AG state while + * we're not looking. We also assume that the caller already invalidated any + * buffers associated with @exlist. + */ + +/* + * Invalidate buffers for per-AG btree blocks we're dumping. This function + * is not intended for use with file data repairs; we have bunmapi for that. + */ +int +xfs_repair_invalidate_blocks( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist) +{ + struct xfs_repair_extent *rex; + struct xfs_repair_extent *n; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t i; + + /* + * For each block in each extent, see if there's an incore buffer for + * exactly that block; if so, invalidate it. The buffer cache only + * lets us look for one buffer at a time, so we have to look one block + * at a time. Avoid invalidating AG headers and post-EOFS blocks + * because we never own those; and if we can't TRYLOCK the buffer we + * assume it's owned by someone else. + */ + for_each_xfs_repair_extent_safe(rex, n, exlist) { + for (fsbno = rex->fsbno, i = rex->len; i > 0; fsbno++, i--) { + /* Skip AG headers and post-EOFS blocks */ + if (!xfs_verify_fsbno(sc->mp, fsbno)) + continue; + bp = xfs_buf_incore(sc->mp->m_ddev_targp, + XFS_FSB_TO_DADDR(sc->mp, fsbno), + XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK); + if (bp) { + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); + } + } + } + + return 0; +} + +/* Ensure the freelist is the correct size. */ +int +xfs_repair_fix_freelist( + struct xfs_scrub_context *sc, + bool can_shrink) +{ + struct xfs_alloc_arg args = {0}; + + args.mp = sc->mp; + args.tp = sc->tp; + args.agno = sc->sa.agno; + args.alignment = 1; + args.pag = sc->sa.pag; + + return xfs_alloc_fix_freelist(&args, + can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); +} + +/* + * Put a block back on the AGFL. + */ +STATIC int +xfs_repair_put_freelist( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno) +{ + struct xfs_owner_info oinfo; + int error; + + /* Make sure there's space on the freelist. */ + error = xfs_repair_fix_freelist(sc, true); + if (error) + return error; + + /* + * Since we're "freeing" a lost block onto the AGFL, we have to + * create an rmap for the block prior to merging it or else other + * parts will break. + */ + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1, + &oinfo); + if (error) + return error; + + /* Put the block on the AGFL. */ + error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp, + agbno, 0); + if (error) + return error; + xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); + + return 0; +} + +/* Dispose of a single metadata block. */ +STATIC int +xfs_repair_dispose_btree_block( + struct xfs_scrub_context *sc, + xfs_fsblock_t fsbno, + struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type resv) +{ + struct xfs_btree_cur *cur; + struct xfs_buf *agf_bp = NULL; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + bool has_other_rmap; + int error; + + agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); + agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + + /* + * If we are repairing per-inode metadata, we need to read in the AGF + * buffer. Otherwise, we're repairing a per-AG structure, so reuse + * the AGF buffer that the setup functions already grabbed. + */ + if (sc->ip) { + error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp); + if (error) + return error; + if (!agf_bp) + return -ENOMEM; + } else { + agf_bp = sc->sa.agf_bp; + } + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno); + + /* Can we find any other rmappings? */ + error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap); + if (error) + goto out_cur; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + + /* + * If there are other rmappings, this block is cross linked and must + * not be freed. Remove the reverse mapping and move on. Otherwise, + * we were the only owner of the block, so free the extent, which will + * also remove the rmap. + * + * XXX: XFS doesn't support detecting the case where a single block + * metadata structure is crosslinked with a multi-block structure + * because the buffer cache doesn't detect aliasing problems, so we + * can't fix 100% of crosslinking problems (yet). The verifiers will + * blow on writeout, the filesystem will shut down, and the admin gets + * to run xfs_repair. + */ + if (has_other_rmap) + error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo); + else if (resv == XFS_AG_RESV_AGFL) + error = xfs_repair_put_freelist(sc, agbno); + else + error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv); + if (agf_bp != sc->sa.agf_bp) + xfs_trans_brelse(sc->tp, agf_bp); + if (error) + return error; + + if (sc->ip) + return xfs_trans_roll_inode(&sc->tp, sc->ip); + return xfs_repair_roll_ag_trans(sc); + +out_cur: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + if (agf_bp != sc->sa.agf_bp) + xfs_trans_brelse(sc->tp, agf_bp); + return error; +} + +/* Dispose of btree blocks from an old per-AG btree. */ +int +xfs_repair_reap_btree_extents( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) +{ + struct xfs_repair_extent *rex; + struct xfs_repair_extent *n; + int error = 0; + + ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb)); + + /* Dispose of every block from the old btree. */ + for_each_xfs_repair_extent_safe(rex, n, exlist) { + ASSERT(sc->ip != NULL || + XFS_FSB_TO_AGNO(sc->mp, rex->fsbno) == sc->sa.agno); + + trace_xfs_repair_dispose_btree_extent(sc->mp, + XFS_FSB_TO_AGNO(sc->mp, rex->fsbno), + XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno), rex->len); + + for (; rex->len > 0; rex->len--, rex->fsbno++) { + error = xfs_repair_dispose_btree_block(sc, rex->fsbno, + oinfo, type); + if (error) + goto out; + } + list_del(&rex->list); + kmem_free(rex); + } + +out: + xfs_repair_cancel_btree_extents(sc, exlist); + return error; +} + +/* + * Finding per-AG Btree Roots for AGF/AGI Reconstruction + * + * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild + * the AG headers by using the rmap data to rummage through the AG looking for + * btree roots. This is not guaranteed to work if the AG is heavily damaged + * or the rmap data are corrupt. + * + * Callers of xfs_repair_find_ag_btree_roots must lock the AGF and AGFL + * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the + * AGI is being rebuilt. It must maintain these locks until it's safe for + * other threads to change the btrees' shapes. The caller provides + * information about the btrees to look for by passing in an array of + * xfs_repair_find_ag_btree with the (rmap owner, buf_ops, magic) fields set. + * The (root, height) fields will be set on return if anything is found. The + * last element of the array should have a NULL buf_ops to mark the end of the + * array. + * + * For every rmapbt record matching any of the rmap owners in btree_info, + * read each block referenced by the rmap record. If the block is a btree + * block from this filesystem matching any of the magic numbers and has a + * level higher than what we've already seen, remember the block and the + * height of the tree required to have such a block. When the call completes, + * we return the highest block we've found for each btree description; those + * should be the roots. + */ + +struct xfs_repair_findroot { + struct xfs_scrub_context *sc; + struct xfs_buf *agfl_bp; + struct xfs_agf *agf; + struct xfs_repair_find_ag_btree *btree_info; +}; + +/* See if our block is in the AGFL. */ +STATIC int +xfs_repair_findroot_agfl_walk( + struct xfs_mount *mp, + xfs_agblock_t bno, + void *priv) +{ + xfs_agblock_t *agbno = priv; + + return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0; +} + +/* Does this block match the btree information passed in? */ +STATIC int +xfs_repair_findroot_block( + struct xfs_repair_findroot *ri, + struct xfs_repair_find_ag_btree *fab, + uint64_t owner, + xfs_agblock_t agbno, + bool *found_it) +{ + struct xfs_mount *mp = ri->sc->mp; + struct xfs_buf *bp; + struct xfs_btree_block *btblock; + xfs_daddr_t daddr; + int error; + + daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno); + + /* + * Blocks in the AGFL have stale contents that might just happen to + * have a matching magic and uuid. We don't want to pull these blocks + * in as part of a tree root, so we have to filter out the AGFL stuff + * here. If the AGFL looks insane we'll just refuse to repair. + */ + if (owner == XFS_RMAP_OWN_AG) { + error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp, + xfs_repair_findroot_agfl_walk, &agbno); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) + return 0; + if (error) + return error; + } + + error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr, + mp->m_bsize, 0, &bp, NULL); + if (error) + return error; + + /* + * Does this look like a block matching our fs and higher than any + * other block we've found so far? If so, reattach buffer verifiers + * so the AIL won't complain if the buffer is also dirty. + */ + btblock = XFS_BUF_TO_BLOCK(bp); + if (be32_to_cpu(btblock->bb_magic) != fab->magic) + goto out; + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) + goto out; + bp->b_ops = fab->buf_ops; + + /* Ignore this block if it's lower in the tree than we've seen. */ + if (fab->root != NULLAGBLOCK && + xfs_btree_get_level(btblock) < fab->height) + goto out; + + /* Make sure we pass the verifiers. */ + bp->b_ops->verify_read(bp); + if (bp->b_error) + goto out; + fab->root = agbno; + fab->height = xfs_btree_get_level(btblock) + 1; + *found_it = true; + + trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno, + be32_to_cpu(btblock->bb_magic), fab->height - 1); +out: + xfs_trans_brelse(ri->sc->tp, bp); + return error; +} + +/* + * Do any of the blocks in this rmap record match one of the btrees we're + * looking for? + */ +STATIC int +xfs_repair_findroot_rmap( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_repair_findroot *ri = priv; + struct xfs_repair_find_ag_btree *fab; + xfs_agblock_t b; + bool found_it; + int error = 0; + + /* Ignore anything that isn't AG metadata. */ + if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner)) + return 0; + + /* Otherwise scan each block + btree type. */ + for (b = 0; b < rec->rm_blockcount; b++) { + found_it = false; + for (fab = ri->btree_info; fab->buf_ops; fab++) { + if (rec->rm_owner != fab->rmap_owner) + continue; + error = xfs_repair_findroot_block(ri, fab, + rec->rm_owner, rec->rm_startblock + b, + &found_it); + if (error) + return error; + if (found_it) + break; + } + } + + return 0; +} + +/* Find the roots of the per-AG btrees described in btree_info. */ +int +xfs_repair_find_ag_btree_roots( + struct xfs_scrub_context *sc, + struct xfs_buf *agf_bp, + struct xfs_repair_find_ag_btree *btree_info, + struct xfs_buf *agfl_bp) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_repair_findroot ri; + struct xfs_repair_find_ag_btree *fab; + struct xfs_btree_cur *cur; + int error; + + ASSERT(xfs_buf_islocked(agf_bp)); + ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp)); + + ri.sc = sc; + ri.btree_info = btree_info; + ri.agf = XFS_BUF_TO_AGF(agf_bp); + ri.agfl_bp = agfl_bp; + for (fab = btree_info; fab->buf_ops; fab++) { + ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG); + ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner)); + fab->root = NULLAGBLOCK; + fab->height = 0; + } + + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno); + error = xfs_rmap_query_all(cur, xfs_repair_findroot_rmap, &ri); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + + return error; +} + +/* Force a quotacheck the next time we mount. */ +void +xfs_repair_force_quotacheck( + struct xfs_scrub_context *sc, + uint dqtype) +{ + uint flag; + + flag = xfs_quota_chkd_flag(dqtype); + if (!(flag & sc->mp->m_qflags)) + return; + + sc->mp->m_qflags &= ~flag; + spin_lock(&sc->mp->m_sb_lock); + sc->mp->m_sb.sb_qflags &= ~flag; + spin_unlock(&sc->mp->m_sb_lock); + xfs_log_sb(sc->tp); +} + +/* + * Attach dquots to this inode, or schedule quotacheck to fix them. + * + * This function ensures that the appropriate dquots are attached to an inode. + * We cannot allow the dquot code to allocate an on-disk dquot block here + * because we're already in transaction context with the inode locked. The + * on-disk dquot should already exist anyway. If the quota code signals + * corruption or missing quota information, schedule quotacheck, which will + * repair corruptions in the quota metadata. + */ +int +xfs_repair_ino_dqattach( + struct xfs_scrub_context *sc) +{ + int error; + + error = xfs_qm_dqattach_locked(sc->ip, false); + switch (error) { + case -EFSBADCRC: + case -EFSCORRUPTED: + case -ENOENT: + xfs_err_ratelimited(sc->mp, +"inode %llu repair encountered quota error %d, quotacheck forced.", + (unsigned long long)sc->ip->i_ino, error); + if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot) + xfs_repair_force_quotacheck(sc, XFS_DQ_USER); + if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot) + xfs_repair_force_quotacheck(sc, XFS_DQ_GROUP); + if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot) + xfs_repair_force_quotacheck(sc, XFS_DQ_PROJ); + /* fall through */ + case -ESRCH: + error = 0; + break; + default: + break; + } + + return error; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h new file mode 100644 index 000000000000..f2b0895294db --- /dev/null +++ b/fs/xfs/scrub/repair.h @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2018 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_SCRUB_REPAIR_H__ +#define __XFS_SCRUB_REPAIR_H__ + +static inline int xfs_repair_notsupported(struct xfs_scrub_context *sc) +{ + return -EOPNOTSUPP; +} + +#ifdef CONFIG_XFS_ONLINE_REPAIR + +/* Repair helpers */ + +int xfs_repair_attempt(struct xfs_inode *ip, struct xfs_scrub_context *sc, + bool *fixed); +void xfs_repair_failure(struct xfs_mount *mp); +int xfs_repair_roll_ag_trans(struct xfs_scrub_context *sc); +bool xfs_repair_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, + enum xfs_ag_resv_type type); +xfs_extlen_t xfs_repair_calc_ag_resblks(struct xfs_scrub_context *sc); +int xfs_repair_alloc_ag_block(struct xfs_scrub_context *sc, + struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno, + enum xfs_ag_resv_type resv); +int xfs_repair_init_btblock(struct xfs_scrub_context *sc, xfs_fsblock_t fsb, + struct xfs_buf **bpp, xfs_btnum_t btnum, + const struct xfs_buf_ops *ops); + +struct xfs_repair_extent { + struct list_head list; + xfs_fsblock_t fsbno; + xfs_extlen_t len; +}; + +struct xfs_repair_extent_list { + struct list_head list; +}; + +static inline void +xfs_repair_init_extent_list( + struct xfs_repair_extent_list *exlist) +{ + INIT_LIST_HEAD(&exlist->list); +} + +#define for_each_xfs_repair_extent_safe(rbe, n, exlist) \ + list_for_each_entry_safe((rbe), (n), &(exlist)->list, list) +int xfs_repair_collect_btree_extent(struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *btlist, xfs_fsblock_t fsbno, + xfs_extlen_t len); +void xfs_repair_cancel_btree_extents(struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *btlist); +int xfs_repair_subtract_extents(struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + struct xfs_repair_extent_list *sublist); +int xfs_repair_fix_freelist(struct xfs_scrub_context *sc, bool can_shrink); +int xfs_repair_invalidate_blocks(struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *btlist); +int xfs_repair_reap_btree_extents(struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); + +struct xfs_repair_find_ag_btree { + /* in: rmap owner of the btree we're looking for */ + uint64_t rmap_owner; + + /* in: buffer ops */ + const struct xfs_buf_ops *buf_ops; + + /* in: magic number of the btree */ + uint32_t magic; + + /* out: the highest btree block found and the tree height */ + xfs_agblock_t root; + unsigned int height; +}; + +int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc, + struct xfs_buf *agf_bp, + struct xfs_repair_find_ag_btree *btree_info, + struct xfs_buf *agfl_bp); +void xfs_repair_force_quotacheck(struct xfs_scrub_context *sc, uint dqtype); +int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc); + +/* Metadata repairers */ + +int xfs_repair_probe(struct xfs_scrub_context *sc); +int xfs_repair_superblock(struct xfs_scrub_context *sc); + +#else + +static inline int xfs_repair_attempt( + struct xfs_inode *ip, + struct xfs_scrub_context *sc, + bool *fixed) +{ + return -EOPNOTSUPP; +} + +static inline void xfs_repair_failure(struct xfs_mount *mp) {} + +static inline xfs_extlen_t +xfs_repair_calc_ag_resblks( + struct xfs_scrub_context *sc) +{ + ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)); + return 0; +} + +#define xfs_repair_probe xfs_repair_notsupported +#define xfs_repair_superblock xfs_repair_notsupported + +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_REPAIR_H__ */ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 8f2a7c3ff455..b376a9a77c04 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -66,7 +66,7 @@ xfs_scrub_rmapbt_xref_refc( bool is_unwritten; int error; - if (!sc->sa.refc_cur) + if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm)) return; non_inode = XFS_RMAP_NON_INODE_OWNER(irec->rm_owner); @@ -207,7 +207,7 @@ xfs_scrub_xref_check_owner( bool has_rmap; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo, @@ -250,7 +250,7 @@ xfs_scrub_xref_has_no_owner( bool has_rmap; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap); diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 39c41dfe08ee..40f462a11ea5 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -66,11 +66,15 @@ xfs_scrub_rtbitmap_rec( void *priv) { struct xfs_scrub_context *sc = priv; + xfs_rtblock_t startblock; + xfs_rtblock_t blockcount; - if (rec->ar_startblock + rec->ar_blockcount <= rec->ar_startblock || - !xfs_verify_rtbno(sc->mp, rec->ar_startblock) || - !xfs_verify_rtbno(sc->mp, rec->ar_startblock + - rec->ar_blockcount - 1)) + startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize; + blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize; + + if (startblock + blockcount <= startblock || + !xfs_verify_rtbno(sc->mp, startblock) || + !xfs_verify_rtbno(sc->mp, startblock + blockcount - 1)) xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; } @@ -82,6 +86,11 @@ xfs_scrub_rtbitmap( { int error; + /* Invoke the fork scrubber. */ + error = xfs_scrub_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + error = xfs_rtalloc_query_all(sc->tp, xfs_scrub_rtbitmap_rec, sc); if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; @@ -95,8 +104,35 @@ int xfs_scrub_rtsummary( struct xfs_scrub_context *sc) { + struct xfs_inode *rsumip = sc->mp->m_rsumip; + struct xfs_inode *old_ip = sc->ip; + uint old_ilock_flags = sc->ilock_flags; + int error = 0; + + /* + * We ILOCK'd the rt bitmap ip in the setup routine, now lock the + * rt summary ip in compliance with the rt inode locking rules. + * + * Since we switch sc->ip to rsumip we have to save the old ilock + * flags so that we don't mix up the inode state that @sc tracks. + */ + sc->ip = rsumip; + sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM; + xfs_ilock(sc->ip, sc->ilock_flags); + + /* Invoke the fork scrubber. */ + error = xfs_scrub_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + goto out; + /* XXX: implement this some day */ - return -ENOENT; + xfs_scrub_set_incomplete(sc); +out: + /* Switch back to the rtbitmap inode and lock flags. */ + xfs_iunlock(sc->ip, sc->ilock_flags); + sc->ilock_flags = old_ilock_flags; + sc->ip = old_ip; + return error; } @@ -107,11 +143,23 @@ xfs_scrub_xref_is_used_rt_space( xfs_rtblock_t fsbno, xfs_extlen_t len) { + xfs_rtblock_t startext; + xfs_rtblock_t endext; + xfs_rtblock_t extcount; bool is_free; int error; + if (xfs_scrub_skip_xref(sc->sm)) + return; + + startext = fsbno; + endext = fsbno + len - 1; + do_div(startext, sc->mp->m_sb.sb_rextsize); + if (do_div(endext, sc->mp->m_sb.sb_rextsize)) + endext++; + extcount = endext - startext; xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, fsbno, len, + error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount, &is_free); if (!xfs_scrub_should_check_xref(sc, &error, NULL)) goto out_unlock; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 26c75967a072..36db098ba583 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -42,11 +42,18 @@ #include "xfs_refcount_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/btree.h" +#include "scrub/repair.h" /* * Online Scrub and Repair @@ -120,6 +127,24 @@ * XCORRUPT flag; btree query function errors are noted by setting the * XFAIL flag and deleting the cursor to prevent further attempts to * cross-reference with a defective btree. + * + * If a piece of metadata proves corrupt or suboptimal, the userspace + * program can ask the kernel to apply some tender loving care (TLC) to + * the metadata object by setting the REPAIR flag and re-calling the + * scrub ioctl. "Corruption" is defined by metadata violating the + * on-disk specification; operations cannot continue if the violation is + * left untreated. It is possible for XFS to continue if an object is + * "suboptimal", however performance may be degraded. Repairs are + * usually performed by rebuilding the metadata entirely out of + * redundant metadata. Optimizing, on the other hand, can sometimes be + * done without rebuilding entire structures. + * + * Generally speaking, the repair code has the following code structure: + * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock. + * The first check helps us figure out if we need to rebuild or simply + * optimize the structure so that the rebuild knows what to do. The + * second check evaluates the completeness of the repair; that is what + * is reported to userspace. */ /* @@ -155,7 +180,10 @@ xfs_scrub_teardown( { xfs_scrub_ag_free(sc, &sc->sa); if (sc->tp) { - xfs_trans_cancel(sc->tp); + if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) + error = xfs_trans_commit(sc->tp); + else + xfs_trans_cancel(sc->tp); sc->tp = NULL; } if (sc->ip) { @@ -166,6 +194,8 @@ xfs_scrub_teardown( iput(VFS_I(sc->ip)); sc->ip = NULL; } + if (sc->has_quotaofflock) + mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); if (sc->buf) { kmem_free(sc->buf); sc->buf = NULL; @@ -180,126 +210,150 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = { .type = ST_NONE, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_probe, + .repair = xfs_repair_probe, }, [XFS_SCRUB_TYPE_SB] = { /* superblock */ .type = ST_PERAG, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_superblock, + .repair = xfs_repair_superblock, }, [XFS_SCRUB_TYPE_AGF] = { /* agf */ .type = ST_PERAG, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agf, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_AGFL]= { /* agfl */ .type = ST_PERAG, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agfl, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_AGI] = { /* agi */ .type = ST_PERAG, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agi, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_allocbt, .scrub = xfs_scrub_bnobt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_allocbt, .scrub = xfs_scrub_cntbt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_iallocbt, .scrub = xfs_scrub_inobt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_iallocbt, .scrub = xfs_scrub_finobt, .has = xfs_sb_version_hasfinobt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_rmapbt, .scrub = xfs_scrub_rmapbt, .has = xfs_sb_version_hasrmapbt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_refcountbt, .scrub = xfs_scrub_refcountbt, .has = xfs_sb_version_hasreflink, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_INODE] = { /* inode record */ .type = ST_INODE, .setup = xfs_scrub_setup_inode, .scrub = xfs_scrub_inode, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_data, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_attr, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_cow, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_DIR] = { /* directory */ .type = ST_INODE, .setup = xfs_scrub_setup_directory, .scrub = xfs_scrub_directory, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */ .type = ST_INODE, .setup = xfs_scrub_setup_xattr, .scrub = xfs_scrub_xattr, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */ .type = ST_INODE, .setup = xfs_scrub_setup_symlink, .scrub = xfs_scrub_symlink, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */ .type = ST_INODE, .setup = xfs_scrub_setup_parent, .scrub = xfs_scrub_parent, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ .type = ST_FS, .setup = xfs_scrub_setup_rt, .scrub = xfs_scrub_rtbitmap, .has = xfs_sb_version_hasrealtime, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, .setup = xfs_scrub_setup_rt, .scrub = xfs_scrub_rtsummary, .has = xfs_sb_version_hasrealtime, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ .type = ST_FS, .setup = xfs_scrub_setup_quota, .scrub = xfs_scrub_quota, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */ .type = ST_FS, .setup = xfs_scrub_setup_quota, .scrub = xfs_scrub_quota, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ .type = ST_FS, .setup = xfs_scrub_setup_quota, .scrub = xfs_scrub_quota, + .repair = xfs_repair_notsupported, }, }; @@ -379,15 +433,54 @@ xfs_scrub_validate_inputs( if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) goto out; - /* We don't know how to repair anything yet. */ - if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) - goto out; + /* + * We only want to repair read-write v5+ filesystems. Defer the check + * for ops->repair until after our scrub confirms that we need to + * perform repairs so that we avoid failing due to not supporting + * repairing an object that doesn't need repairs. + */ + if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) { + error = -EOPNOTSUPP; + if (!xfs_sb_version_hascrc(&mp->m_sb)) + goto out; + + error = -EROFS; + if (mp->m_flags & XFS_MOUNT_RDONLY) + goto out; + } error = 0; out: return error; } +#ifdef CONFIG_XFS_ONLINE_REPAIR +static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc) +{ + /* + * Userspace asked us to repair something, we repaired it, rescanned + * it, and the rescan says it's still broken. Scream about this in + * the system logs. + */ + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT))) + xfs_repair_failure(sc->mp); +} +#else +static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc) +{ + /* + * Userspace asked us to scrub something, it's broken, and we have no + * way of fixing it. Scream in the logs. + */ + if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT)) + xfs_alert_ratelimited(sc->mp, + "Corruption detected during scrub."); +} +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + /* Dispatch metadata scrubbing. */ int xfs_scrub_metadata( @@ -397,6 +490,7 @@ xfs_scrub_metadata( struct xfs_scrub_context sc; struct xfs_mount *mp = ip->i_mount; bool try_harder = false; + bool already_fixed = false; int error = 0; BUILD_BUG_ON(sizeof(meta_scrub_ops) != @@ -446,10 +540,44 @@ retry_op: } else if (error) goto out_teardown; - if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | - XFS_SCRUB_OFLAG_XCORRUPT)) - xfs_alert_ratelimited(mp, "Corruption detected during scrub."); + if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) { + bool needs_fix; + + /* Let debug users force us into the repair routines. */ + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + + needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT | + XFS_SCRUB_OFLAG_PREEN)); + /* + * If userspace asked for a repair but it wasn't necessary, + * report that back to userspace. + */ + if (!needs_fix) { + sc.sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; + goto out_nofix; + } + + /* + * If it's broken, userspace wants us to fix it, and we haven't + * already tried to fix it, then attempt a repair. + */ + error = xfs_repair_attempt(ip, &sc, &already_fixed); + if (error == -EAGAIN) { + if (sc.try_harder) + try_harder = true; + error = xfs_scrub_teardown(&sc, ip, 0); + if (error) { + xfs_repair_failure(mp); + goto out; + } + goto retry_op; + } + } +out_nofix: + xfs_scrub_postmortem(&sc); out_teardown: error = xfs_scrub_teardown(&sc, ip, error); out: diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 0d92af86f67a..636424d5e2ee 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -38,6 +38,9 @@ struct xfs_scrub_meta_ops { /* Examine metadata for errors. */ int (*scrub)(struct xfs_scrub_context *); + /* Repair or optimize the metadata. */ + int (*repair)(struct xfs_scrub_context *); + /* Decide if we even have this piece of metadata. */ bool (*has)(struct xfs_sb *); @@ -48,6 +51,7 @@ struct xfs_scrub_meta_ops { /* Buffer pointers and btree cursors for an entire AG. */ struct xfs_scrub_ag { xfs_agnumber_t agno; + struct xfs_perag *pag; /* AG btree roots */ struct xfs_buf *agf_bp; @@ -73,6 +77,7 @@ struct xfs_scrub_context { void *buf; uint ilock_flags; bool try_harder; + bool has_quotaofflock; /* State tracking for single-AG operations. */ struct xfs_scrub_ag sa; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 5d2b1c241be5..794d56bb1af8 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -69,6 +69,8 @@ DEFINE_EVENT(xfs_scrub_class, name, \ DEFINE_SCRUB_EVENT(xfs_scrub_start); DEFINE_SCRUB_EVENT(xfs_scrub_done); DEFINE_SCRUB_EVENT(xfs_scrub_deadlock_retry); +DEFINE_SCRUB_EVENT(xfs_repair_attempt); +DEFINE_SCRUB_EVENT(xfs_repair_done); TRACE_EVENT(xfs_scrub_op_error, TP_PROTO(struct xfs_scrub_context *sc, xfs_agnumber_t agno, @@ -492,6 +494,262 @@ TRACE_EVENT(xfs_scrub_xref_error, __entry->ret_ip) ); +/* repair tracepoints */ +#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) + +DECLARE_EVENT_CLASS(xfs_repair_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +); +#define DEFINE_REPAIR_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_repair_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_dispose_btree_extent); +DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_collect_btree_extent); +DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_agfl_insert); + +DECLARE_EVENT_CLASS(xfs_repair_rmap_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, + uint64_t owner, uint64_t offset, unsigned int flags), + TP_ARGS(mp, agno, agbno, len, owner, offset, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = owner; + __entry->offset = offset; + __entry->flags = flags; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); +#define DEFINE_REPAIR_RMAP_EVENT(name) \ +DEFINE_EVENT(xfs_repair_rmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, \ + uint64_t owner, uint64_t offset, unsigned int flags), \ + TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) +DEFINE_REPAIR_RMAP_EVENT(xfs_repair_alloc_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xfs_repair_ialloc_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xfs_repair_rmap_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xfs_repair_bmap_extent_fn); + +TRACE_EVENT(xfs_repair_refcount_extent_fn, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *irec), + TP_ARGS(mp, agno, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount, + __entry->refcount) +) + +TRACE_EVENT(xfs_repair_init_btblock, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_btnum_t btnum), + TP_ARGS(mp, agno, agbno, btnum), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(uint32_t, btnum) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->btnum = btnum; + ), + TP_printk("dev %d:%d agno %u agbno %u btnum %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->btnum) +) +TRACE_EVENT(xfs_repair_findroot_block, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + uint32_t magic, uint16_t level), + TP_ARGS(mp, agno, agbno, magic, level), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(uint32_t, magic) + __field(uint16_t, level) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->magic = magic; + __entry->level = level; + ), + TP_printk("dev %d:%d agno %u agbno %u magic 0x%x level %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->magic, + __entry->level) +) +TRACE_EVENT(xfs_repair_calc_ag_resblks, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t icount, xfs_agblock_t aglen, xfs_agblock_t freelen, + xfs_agblock_t usedlen), + TP_ARGS(mp, agno, icount, aglen, freelen, usedlen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, icount) + __field(xfs_agblock_t, aglen) + __field(xfs_agblock_t, freelen) + __field(xfs_agblock_t, usedlen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->icount = icount; + __entry->aglen = aglen; + __entry->freelen = freelen; + __entry->usedlen = usedlen; + ), + TP_printk("dev %d:%d agno %d icount %u aglen %u freelen %u usedlen %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->icount, + __entry->aglen, + __entry->freelen, + __entry->usedlen) +) +TRACE_EVENT(xfs_repair_calc_ag_resblks_btsize, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bnobt_sz, xfs_agblock_t inobt_sz, + xfs_agblock_t rmapbt_sz, xfs_agblock_t refcbt_sz), + TP_ARGS(mp, agno, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bnobt_sz) + __field(xfs_agblock_t, inobt_sz) + __field(xfs_agblock_t, rmapbt_sz) + __field(xfs_agblock_t, refcbt_sz) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->bnobt_sz = bnobt_sz; + __entry->inobt_sz = inobt_sz; + __entry->rmapbt_sz = rmapbt_sz; + __entry->refcbt_sz = refcbt_sz; + ), + TP_printk("dev %d:%d agno %d bno %u ino %u rmap %u refcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bnobt_sz, + __entry->inobt_sz, + __entry->rmapbt_sz, + __entry->refcbt_sz) +) +TRACE_EVENT(xfs_repair_reset_counters, + TP_PROTO(struct xfs_mount *mp), + TP_ARGS(mp), + TP_STRUCT__entry( + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + ), + TP_printk("dev %d:%d", + MAJOR(__entry->dev), MINOR(__entry->dev)) +) + +TRACE_EVENT(xfs_repair_ialloc_insert, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t startino, uint16_t holemask, uint8_t count, + uint8_t freecount, uint64_t freemask), + TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(uint16_t, holemask) + __field(uint8_t, count) + __field(uint8_t, freecount) + __field(uint64_t, freemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = startino; + __entry->holemask = holemask; + __entry->count = count; + __entry->freecount = freecount; + __entry->freemask = freemask; + ), + TP_printk("dev %d:%d agno %d startino %u holemask 0x%x count %u freecount %u freemask 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->holemask, + __entry->count, + __entry->freecount, + __entry->freemask) +) + +#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ + #endif /* _TRACE_XFS_SCRUB_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 102463543db3..ca6903726689 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1378,10 +1378,9 @@ xfs_vm_bmap( struct address_space *mapping, sector_t block) { - struct inode *inode = (struct inode *)mapping->host; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(mapping->host); - trace_xfs_vm_bmap(XFS_I(inode)); + trace_xfs_vm_bmap(ip); /* * The swap code (ab-)uses ->bmap to get a block mapping and then @@ -1394,9 +1393,7 @@ xfs_vm_bmap( */ if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; - - filemap_write_and_wait(mapping); - return generic_block_bmap(mapping, block, xfs_get_blocks); + return iomap_bmap(mapping, block, &xfs_iomap_ops); } STATIC int @@ -1475,6 +1472,16 @@ xfs_vm_set_page_dirty( return newly_dirty; } +static int +xfs_iomap_swapfile_activate( + struct swap_info_struct *sis, + struct file *swap_file, + sector_t *span) +{ + sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file)); + return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops); +} + const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, .readpages = xfs_vm_readpages, @@ -1488,6 +1495,7 @@ const struct address_space_operations xfs_address_space_operations = { .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, + .swap_activate = xfs_iomap_swapfile_activate, }; const struct address_space_operations xfs_dax_aops = { @@ -1495,4 +1503,5 @@ const struct address_space_operations xfs_dax_aops = { .direct_IO = noop_direct_IO, .set_page_dirty = noop_set_page_dirty, .invalidatepage = noop_invalidatepage, + .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 2203465e63ea..618bb71535c8 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -160,7 +160,7 @@ STATIC void xfs_bui_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) xfs_bui_release(BUI_ITEM(lip)); } @@ -305,7 +305,7 @@ xfs_bud_item_unlock( { struct xfs_bud_log_item *budp = BUD_ITEM(lip); - if (lip->li_flags & XFS_LI_ABORTED) { + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { xfs_bui_release(budp->bud_buip); kmem_zone_free(xfs_bud_zone, budp); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 8cd8c412f52d..06badcbadeb4 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -848,7 +848,7 @@ xfs_free_eofblocks( /* * Attach the dquots to the inode up front. */ - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -871,8 +871,8 @@ xfs_free_eofblocks( * contents of the file are flushed to disk then the files * may be full of holes (ie NULL files bug). */ - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, - XFS_ISIZE(ip)); + error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK, + XFS_ISIZE(ip), XFS_BMAPI_NODISCARD); if (error) { /* * If we get an error at this point we simply don't @@ -918,7 +918,7 @@ xfs_alloc_file_space( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -1169,7 +1169,7 @@ xfs_free_file_space( trace_xfs_free_file_space(ip); - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 55661cbdb51b..5179ab9e3d6a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -549,17 +549,31 @@ xfs_buf_hash_destroy( } /* - * Look up, and creates if absent, a lockable buffer for - * a given range of an inode. The buffer is returned - * locked. No I/O is implied by this call. + * Look up a buffer in the buffer cache and return it referenced and locked + * in @found_bp. + * + * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the + * cache. + * + * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return + * -EAGAIN if we fail to lock it. + * + * Return values are: + * -EFSCORRUPTED if have been supplied with an invalid address + * -EAGAIN on trylock failure + * -ENOENT if we fail to find a match and @new_bp was NULL + * 0, with @found_bp: + * - @new_bp if we inserted it into the cache + * - the buffer we found and locked. */ -xfs_buf_t * -_xfs_buf_find( +static int +xfs_buf_find( struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - xfs_buf_t *new_bp) + struct xfs_buf *new_bp, + struct xfs_buf **found_bp) { struct xfs_perag *pag; xfs_buf_t *bp; @@ -567,6 +581,8 @@ _xfs_buf_find( xfs_daddr_t eofs; int i; + *found_bp = NULL; + for (i = 0; i < nmaps; i++) cmap.bm_len += map[i].bm_len; @@ -580,16 +596,11 @@ _xfs_buf_find( */ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) { - /* - * XXX (dgc): we should really be returning -EFSCORRUPTED here, - * but none of the higher level infrastructure supports - * returning a specific error on buffer lookup failures. - */ xfs_alert(btp->bt_mount, "%s: daddr 0x%llx out of range, EOFS 0x%llx", __func__, cmap.bm_bn, eofs); WARN_ON(1); - return NULL; + return -EFSCORRUPTED; } pag = xfs_perag_get(btp->bt_mount, @@ -604,19 +615,20 @@ _xfs_buf_find( } /* No match found */ - if (new_bp) { - /* the buffer keeps the perag reference until it is freed */ - new_bp->b_pag = pag; - rhashtable_insert_fast(&pag->pag_buf_hash, - &new_bp->b_rhash_head, - xfs_buf_hash_params); - spin_unlock(&pag->pag_buf_lock); - } else { + if (!new_bp) { XFS_STATS_INC(btp->bt_mount, xb_miss_locked); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); + return -ENOENT; } - return new_bp; + + /* the buffer keeps the perag reference until it is freed */ + new_bp->b_pag = pag; + rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head, + xfs_buf_hash_params); + spin_unlock(&pag->pag_buf_lock); + *found_bp = new_bp; + return 0; found: spin_unlock(&pag->pag_buf_lock); @@ -626,7 +638,7 @@ found: if (flags & XBF_TRYLOCK) { xfs_buf_rele(bp); XFS_STATS_INC(btp->bt_mount, xb_busy_locked); - return NULL; + return -EAGAIN; } xfs_buf_lock(bp); XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited); @@ -646,6 +658,24 @@ found: trace_xfs_buf_find(bp, flags, _RET_IP_); XFS_STATS_INC(btp->bt_mount, xb_get_locked); + *found_bp = bp; + return 0; +} + +struct xfs_buf * +xfs_buf_incore( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + struct xfs_buf *bp; + int error; + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + + error = xfs_buf_find(target, &map, 1, flags, NULL, &bp); + if (error) + return NULL; return bp; } @@ -665,9 +695,27 @@ xfs_buf_get_map( struct xfs_buf *new_bp; int error = 0; - bp = _xfs_buf_find(target, map, nmaps, flags, NULL); - if (likely(bp)) + error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp); + + switch (error) { + case 0: + /* cache hit */ goto found; + case -EAGAIN: + /* cache hit, trylock failure, caller handles failure */ + ASSERT(flags & XBF_TRYLOCK); + return NULL; + case -ENOENT: + /* cache miss, go for insert */ + break; + case -EFSCORRUPTED: + default: + /* + * None of the higher layers understand failure types + * yet, so return NULL to signal a fatal lookup error. + */ + return NULL; + } new_bp = _xfs_buf_alloc(target, map, nmaps, flags); if (unlikely(!new_bp)) @@ -679,8 +727,8 @@ xfs_buf_get_map( return NULL; } - bp = _xfs_buf_find(target, map, nmaps, flags, new_bp); - if (!bp) { + error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp); + if (error) { xfs_buf_free(new_bp); return NULL; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index edced162a674..f5f2b71c2fde 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -218,20 +218,9 @@ typedef struct xfs_buf { } xfs_buf_t; /* Finding and Reading Buffers */ -struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags, struct xfs_buf *new_bp); - -static inline struct xfs_buf * -xfs_incore( - struct xfs_buftarg *target, - xfs_daddr_t blkno, - size_t numblks, - xfs_buf_flags_t flags) -{ - DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return _xfs_buf_find(target, &map, 1, flags, NULL); -} +struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target, + xfs_daddr_t blkno, size_t numblks, + xfs_buf_flags_t flags); struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, @@ -358,6 +347,18 @@ extern void xfs_buf_terminate(void); void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref); +/* + * If the buffer is already on the LRU, do nothing. Otherwise set the buffer + * up with a reference count of 0 so it will be tossed from the cache when + * released. + */ +static inline void xfs_buf_oneshot(struct xfs_buf *bp) +{ + if (!list_empty(&bp->b_lru) || atomic_read(&bp->b_lru_ref) > 1) + return; + atomic_set(&bp->b_lru_ref, 0); +} + static inline int xfs_buf_ispinned(struct xfs_buf *bp) { return atomic_read(&bp->b_pin_count); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 82ad270e390e..c2311379d1c3 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -438,7 +438,7 @@ xfs_buf_item_unpin( * xfs_trans_uncommit() will try to reference the * buffer which we no longer have a hold on. */ - if (lip->li_desc) + if (!list_empty(&lip->li_trans)) xfs_trans_del_item(lip); /* @@ -568,13 +568,15 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); + bool aborted; bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); #if defined(DEBUG) || defined(XFS_WARN) bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); #endif + aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags); + /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; @@ -743,8 +745,10 @@ xfs_buf_item_init( * nothing to do here so return. */ ASSERT(bp->b_target->bt_mount == mp); - if (bip != NULL) { + if (bip) { ASSERT(bip->bli_item.li_type == XFS_LI_BUF); + ASSERT(!bp->b_transp); + ASSERT(bip->bli_buf == bp); return 0; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index a7daef9e16bf..2567391489bd 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -288,49 +288,43 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) } /* - * Allocate a block and fill it with dquots. - * This is called when the bmapi finds a hole. + * Ensure that the given in-core dquot has a buffer on disk backing it, and + * return the buffer. This is called when the bmapi finds a hole. */ STATIC int -xfs_qm_dqalloc( - xfs_trans_t **tpp, - xfs_mount_t *mp, - xfs_dquot_t *dqp, - xfs_inode_t *quotip, - xfs_fileoff_t offset_fsb, - xfs_buf_t **O_bpp) +xfs_dquot_disk_alloc( + struct xfs_trans **tpp, + struct xfs_dquot *dqp, + struct xfs_buf **bpp) { - xfs_fsblock_t firstblock; - struct xfs_defer_ops dfops; - xfs_bmbt_irec_t map; - int nmaps, error; - xfs_buf_t *bp; - xfs_trans_t *tp = *tpp; - - ASSERT(tp != NULL); + struct xfs_bmbt_irec map; + struct xfs_defer_ops dfops; + struct xfs_mount *mp = (*tpp)->t_mountp; + struct xfs_buf *bp; + struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags); + xfs_fsblock_t firstblock; + int nmaps = 1; + int error; trace_xfs_dqalloc(dqp); - /* - * Initialize the bmap freelist prior to calling bmapi code. - */ xfs_defer_init(&dfops, &firstblock); xfs_ilock(quotip, XFS_ILOCK_EXCL); - /* - * Return if this type of quotas is turned off while we didn't - * have an inode lock - */ if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + /* + * Return if this type of quotas is turned off while we didn't + * have an inode lock + */ xfs_iunlock(quotip, XFS_ILOCK_EXCL); return -ESRCH; } - xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); - nmaps = 1; - error = xfs_bmapi_write(tp, quotip, offset_fsb, - XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, - &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), - &map, &nmaps, &dfops); + /* Create the block mapping. */ + xfs_trans_ijoin(*tpp, quotip, XFS_ILOCK_EXCL); + error = xfs_bmapi_write(*tpp, quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, + &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), + &map, &nmaps, &dfops); if (error) goto error0; ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); @@ -344,10 +338,8 @@ xfs_qm_dqalloc( dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); /* now we can just get the buffer (there's nothing to read yet) */ - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, - dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, - 0); + bp = xfs_trans_get_buf(*tpp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0); if (!bp) { error = -ENOMEM; goto error1; @@ -358,37 +350,45 @@ xfs_qm_dqalloc( * Make a chunk of dquots out of this buffer and log * the entire thing. */ - xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), + xfs_qm_init_dquot_blk(*tpp, mp, be32_to_cpu(dqp->q_core.d_id), dqp->dq_flags & XFS_DQ_ALLTYPES, bp); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* - * xfs_defer_finish() may commit the current transaction and - * start a second transaction if the freelist is not empty. + * Hold the buffer and join it to the dfops so that we'll still own + * the buffer when we return to the caller. The buffer disposal on + * error must be paid attention to very carefully, as it has been + * broken since commit efa092f3d4c6 "[XFS] Fixes a bug in the quota + * code when allocating a new dquot record" in 2005, and the later + * conversion to xfs_defer_ops in commit 310a75a3c6c747 failed to keep + * the buffer locked across the _defer_finish call. We can now do + * this correctly with xfs_defer_bjoin. * - * Since we still want to modify this buffer, we need to - * ensure that the buffer is not released on commit of - * the first transaction and ensure the buffer is added to the - * second transaction. + * Above, we allocated a disk block for the dquot information and + * used get_buf to initialize the dquot. If the _defer_bjoin fails, + * the buffer is still locked to *tpp, so we must _bhold_release and + * then _trans_brelse the buffer. If the _defer_finish fails, the old + * transaction is gone but the new buffer is not joined or held to any + * transaction, so we must _buf_relse it. * - * If there is only one transaction then don't stop the buffer - * from being released when it commits later on. + * If everything succeeds, the caller of this function is returned a + * buffer that is locked and held to the transaction. The caller + * is responsible for unlocking any buffer passed back, either + * manually or by committing the transaction. */ - - xfs_trans_bhold(tp, bp); - + xfs_trans_bhold(*tpp, bp); + error = xfs_defer_bjoin(&dfops, bp); + if (error) { + xfs_trans_bhold_release(*tpp, bp); + xfs_trans_brelse(*tpp, bp); + goto error1; + } error = xfs_defer_finish(tpp, &dfops); - if (error) + if (error) { + xfs_buf_relse(bp); goto error1; - - /* Transaction was committed? */ - if (*tpp != tp) { - tp = *tpp; - xfs_trans_bjoin(tp, bp); - } else { - xfs_trans_bhold_release(tp, bp); } - - *O_bpp = bp; + *bpp = bp; return 0; error1: @@ -398,32 +398,24 @@ error0: } /* - * Maps a dquot to the buffer containing its on-disk version. - * This returns a ptr to the buffer containing the on-disk dquot - * in the bpp param, and a ptr to the on-disk dquot within that buffer + * Read in the in-core dquot's on-disk metadata and return the buffer. + * Returns ENOENT to signal a hole. */ STATIC int -xfs_qm_dqtobp( - xfs_trans_t **tpp, - xfs_dquot_t *dqp, - xfs_disk_dquot_t **O_ddpp, - xfs_buf_t **O_bpp, - uint flags) +xfs_dquot_disk_read( + struct xfs_mount *mp, + struct xfs_dquot *dqp, + struct xfs_buf **bpp) { struct xfs_bmbt_irec map; - int nmaps = 1, error; struct xfs_buf *bp; - struct xfs_inode *quotip; - struct xfs_mount *mp = dqp->q_mount; - xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); - struct xfs_trans *tp = (tpp ? *tpp : NULL); + struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags); uint lock_mode; - - quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags); - dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; + int nmaps = 1; + int error; lock_mode = xfs_ilock_data_map_shared(quotip); - if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + if (!xfs_this_quota_on(mp, dqp->dq_flags)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. @@ -436,81 +428,48 @@ xfs_qm_dqtobp( * Find the block map; no allocations yet */ error = xfs_bmapi_read(quotip, dqp->q_fileoffset, - XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); - + XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); xfs_iunlock(quotip, lock_mode); if (error) return error; ASSERT(nmaps == 1); - ASSERT(map.br_blockcount == 1); + ASSERT(map.br_blockcount >= 1); + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + if (map.br_startblock == HOLESTARTBLOCK) + return -ENOENT; + + trace_xfs_dqtobp_read(dqp); /* - * Offset of dquot in the (fixed sized) dquot chunk. + * store the blkno etc so that we don't have to do the + * mapping all the time */ - dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * - sizeof(xfs_dqblk_t); - - ASSERT(map.br_startblock != DELAYSTARTBLOCK); - if (map.br_startblock == HOLESTARTBLOCK) { - /* - * We don't allocate unless we're asked to - */ - if (!(flags & XFS_QMOPT_DQALLOC)) - return -ENOENT; - - ASSERT(tp); - error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, - dqp->q_fileoffset, &bp); - if (error) - return error; - tp = *tpp; - } else { - trace_xfs_dqtobp_read(dqp); + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); - /* - * store the blkno etc so that we don't have to do the - * mapping all the time - */ - dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); - - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, - 0, &bp, &xfs_dquot_buf_ops); - if (error) { - ASSERT(bp == NULL); - return error; - } + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + &xfs_dquot_buf_ops); + if (error) { + ASSERT(bp == NULL); + return error; } ASSERT(xfs_buf_islocked(bp)); - *O_bpp = bp; - *O_ddpp = bp->b_addr + dqp->q_bufoffset; + xfs_buf_set_ref(bp, XFS_DQUOT_REF); + *bpp = bp; return 0; } - -/* - * Read in the ondisk dquot using dqtobp() then copy it to an incore version, - * and release the buffer immediately. - * - * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed. - */ -int -xfs_qm_dqread( +/* Allocate and initialize everything we need for an incore dquot. */ +STATIC struct xfs_dquot * +xfs_dquot_alloc( struct xfs_mount *mp, xfs_dqid_t id, - uint type, - uint flags, - struct xfs_dquot **O_dqpp) + uint type) { struct xfs_dquot *dqp; - struct xfs_disk_dquot *ddqp; - struct xfs_buf *bp; - struct xfs_trans *tp = NULL; - int error; dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); @@ -520,6 +479,12 @@ xfs_qm_dqread( INIT_LIST_HEAD(&dqp->q_lru); mutex_init(&dqp->q_qlock); init_waitqueue_head(&dqp->q_pinwait); + dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; + /* + * Offset of dquot in the (fixed sized) dquot chunk. + */ + dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * + sizeof(xfs_dqblk_t); /* * Because we want to use a counting completion, complete @@ -548,35 +513,22 @@ xfs_qm_dqread( break; } - XFS_STATS_INC(mp, xs_qm_dquot); - - trace_xfs_dqread(dqp); + xfs_qm_dquot_logitem_init(dqp); - if (flags & XFS_QMOPT_DQALLOC) { - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, - XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); - if (error) - goto error0; - } + XFS_STATS_INC(mp, xs_qm_dquot); + return dqp; +} - /* - * get a pointer to the on-disk dquot and the buffer containing it - * dqp already knows its own type (GROUP/USER). - */ - error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags); - if (error) { - /* - * This can happen if quotas got turned off (ESRCH), - * or if the dquot didn't exist on disk and we ask to - * allocate (ENOENT). - */ - trace_xfs_dqread_fail(dqp); - goto error1; - } +/* Copy the in-core quota fields in from the on-disk buffer. */ +STATIC void +xfs_dquot_from_disk( + struct xfs_dquot *dqp, + struct xfs_buf *bp) +{ + struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset; /* copy everything from disk dquot to the incore dquot */ memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); - xfs_qm_dquot_logitem_init(dqp); /* * Reservation counters are defined as reservation plus current usage @@ -588,40 +540,90 @@ xfs_qm_dqread( /* initialize the dquot speculative prealloc thresholds */ xfs_dquot_set_prealloc_limits(dqp); +} - /* Mark the buf so that this will stay incore a little longer */ - xfs_buf_set_ref(bp, XFS_DQUOT_REF); +/* Allocate and initialize the dquot buffer for this in-core dquot. */ +static int +xfs_qm_dqread_alloc( + struct xfs_mount *mp, + struct xfs_dquot *dqp, + struct xfs_buf **bpp) +{ + struct xfs_trans *tp; + struct xfs_buf *bp; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); + if (error) + goto err; + + error = xfs_dquot_disk_alloc(&tp, dqp, &bp); + if (error) + goto err_cancel; + + error = xfs_trans_commit(tp); + if (error) { + /* + * Buffer was held to the transaction, so we have to unlock it + * manually here because we're not passing it back. + */ + xfs_buf_relse(bp); + goto err; + } + *bpp = bp; + return 0; + +err_cancel: + xfs_trans_cancel(tp); +err: + return error; +} + +/* + * Read in the ondisk dquot using dqtobp() then copy it to an incore version, + * and release the buffer immediately. If @can_alloc is true, fill any + * holes in the on-disk metadata. + */ +static int +xfs_qm_dqread( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + bool can_alloc, + struct xfs_dquot **dqpp) +{ + struct xfs_dquot *dqp; + struct xfs_buf *bp; + int error; + + dqp = xfs_dquot_alloc(mp, id, type); + trace_xfs_dqread(dqp); + + /* Try to read the buffer, allocating if necessary. */ + error = xfs_dquot_disk_read(mp, dqp, &bp); + if (error == -ENOENT && can_alloc) + error = xfs_qm_dqread_alloc(mp, dqp, &bp); + if (error) + goto err; /* - * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) - * So we need to release with xfs_trans_brelse(). - * The strategy here is identical to that of inodes; we lock - * the dquot in xfs_qm_dqget() before making it accessible to - * others. This is because dquots, like inodes, need a good level of - * concurrency, and we don't want to take locks on the entire buffers - * for dquot accesses. - * Note also that the dquot buffer may even be dirty at this point, if - * this particular dquot was repaired. We still aren't afraid to - * brelse it because we have the changes incore. + * At this point we should have a clean locked buffer. Copy the data + * to the incore dquot and release the buffer since the incore dquot + * has its own locking protocol so we needn't tie up the buffer any + * further. */ ASSERT(xfs_buf_islocked(bp)); - xfs_trans_brelse(tp, bp); + xfs_dquot_from_disk(dqp, bp); - if (tp) { - error = xfs_trans_commit(tp); - if (error) - goto error0; - } - - *O_dqpp = dqp; + xfs_buf_relse(bp); + *dqpp = dqp; return error; -error1: - if (tp) - xfs_trans_cancel(tp); -error0: +err: + trace_xfs_dqread_fail(dqp); xfs_qm_dqdestroy(dqp); - *O_dqpp = NULL; + *dqpp = NULL; return error; } @@ -679,77 +681,230 @@ xfs_dq_get_next_id( } /* - * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a - * a locked dquot, doing an allocation (if requested) as needed. - * When both an inode and an id are given, the inode's id takes precedence. - * That is, if the id changes while we don't hold the ilock inside this - * function, the new dquot is returned, not necessarily the one requested - * in the id argument. + * Look up the dquot in the in-core cache. If found, the dquot is returned + * locked and ready to go. + */ +static struct xfs_dquot * +xfs_qm_dqget_cache_lookup( + struct xfs_mount *mp, + struct xfs_quotainfo *qi, + struct radix_tree_root *tree, + xfs_dqid_t id) +{ + struct xfs_dquot *dqp; + +restart: + mutex_lock(&qi->qi_tree_lock); + dqp = radix_tree_lookup(tree, id); + if (!dqp) { + mutex_unlock(&qi->qi_tree_lock); + XFS_STATS_INC(mp, xs_qm_dqcachemisses); + return NULL; + } + + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { + xfs_dqunlock(dqp); + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_freeing(dqp); + delay(1); + goto restart; + } + + dqp->q_nrefs++; + mutex_unlock(&qi->qi_tree_lock); + + trace_xfs_dqget_hit(dqp); + XFS_STATS_INC(mp, xs_qm_dqcachehits); + return dqp; +} + +/* + * Try to insert a new dquot into the in-core cache. If an error occurs the + * caller should throw away the dquot and start over. Otherwise, the dquot + * is returned locked (and held by the cache) as if there had been a cache + * hit. + */ +static int +xfs_qm_dqget_cache_insert( + struct xfs_mount *mp, + struct xfs_quotainfo *qi, + struct radix_tree_root *tree, + xfs_dqid_t id, + struct xfs_dquot *dqp) +{ + int error; + + mutex_lock(&qi->qi_tree_lock); + error = radix_tree_insert(tree, id, dqp); + if (unlikely(error)) { + /* Duplicate found! Caller must try again. */ + WARN_ON(error != -EEXIST); + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_dup(dqp); + return error; + } + + /* Return a locked dquot to the caller, with a reference taken. */ + xfs_dqlock(dqp); + dqp->q_nrefs = 1; + + qi->qi_dquots++; + mutex_unlock(&qi->qi_tree_lock); + + return 0; +} + +/* Check our input parameters. */ +static int +xfs_qm_dqget_checks( + struct xfs_mount *mp, + uint type) +{ + if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp))) + return -ESRCH; + + switch (type) { + case XFS_DQ_USER: + if (!XFS_IS_UQUOTA_ON(mp)) + return -ESRCH; + return 0; + case XFS_DQ_GROUP: + if (!XFS_IS_GQUOTA_ON(mp)) + return -ESRCH; + return 0; + case XFS_DQ_PROJ: + if (!XFS_IS_PQUOTA_ON(mp)) + return -ESRCH; + return 0; + default: + WARN_ON_ONCE(0); + return -EINVAL; + } +} + +/* + * Given the file system, id, and type (UDQUOT/GDQUOT), return a a locked + * dquot, doing an allocation (if requested) as needed. */ int xfs_qm_dqget( - xfs_mount_t *mp, - xfs_inode_t *ip, /* locked inode (optional) */ - xfs_dqid_t id, /* uid/projid/gid depending on type */ - uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */ - uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ - xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + bool can_alloc, + struct xfs_dquot **O_dqpp) { struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = xfs_dquot_tree(qi, type); + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; int error; - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || - (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) || - (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { - return -ESRCH; + error = xfs_qm_dqget_checks(mp, type); + if (error) + return error; + +restart: + dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); + if (dqp) { + *O_dqpp = dqp; + return 0; } - ASSERT(type == XFS_DQ_USER || - type == XFS_DQ_PROJ || - type == XFS_DQ_GROUP); - if (ip) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(xfs_inode_dquot(ip, type) == NULL); + error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); + if (error) + return error; + + error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); + if (error) { + /* + * Duplicate found. Just throw away the new dquot and start + * over. + */ + xfs_qm_dqdestroy(dqp); + XFS_STATS_INC(mp, xs_qm_dquot_dups); + goto restart; } -restart: - mutex_lock(&qi->qi_tree_lock); - dqp = radix_tree_lookup(tree, id); - if (dqp) { - xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_FREEING) { - xfs_dqunlock(dqp); - mutex_unlock(&qi->qi_tree_lock); - trace_xfs_dqget_freeing(dqp); - delay(1); - goto restart; - } + trace_xfs_dqget_miss(dqp); + *O_dqpp = dqp; + return 0; +} - /* uninit / unused quota found in radix tree, keep looking */ - if (flags & XFS_QMOPT_DQNEXT) { - if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { - xfs_dqunlock(dqp); - mutex_unlock(&qi->qi_tree_lock); - error = xfs_dq_get_next_id(mp, type, &id); - if (error) - return error; - goto restart; - } - } +/* + * Given a dquot id and type, read and initialize a dquot from the on-disk + * metadata. This function is only for use during quota initialization so + * it ignores the dquot cache assuming that the dquot shrinker isn't set up. + * The caller is responsible for _qm_dqdestroy'ing the returned dquot. + */ +int +xfs_qm_dqget_uncached( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct xfs_dquot **dqpp) +{ + int error; - dqp->q_nrefs++; - mutex_unlock(&qi->qi_tree_lock); + error = xfs_qm_dqget_checks(mp, type); + if (error) + return error; + + return xfs_qm_dqread(mp, id, type, 0, dqpp); +} + +/* Return the quota id for a given inode and type. */ +xfs_dqid_t +xfs_qm_id_for_quotatype( + struct xfs_inode *ip, + uint type) +{ + switch (type) { + case XFS_DQ_USER: + return ip->i_d.di_uid; + case XFS_DQ_GROUP: + return ip->i_d.di_gid; + case XFS_DQ_PROJ: + return xfs_get_projid(ip); + } + ASSERT(0); + return 0; +} + +/* + * Return the dquot for a given inode and type. If @can_alloc is true, then + * allocate blocks if needed. The inode's ILOCK must be held and it must not + * have already had an inode attached. + */ +int +xfs_qm_dqget_inode( + struct xfs_inode *ip, + uint type, + bool can_alloc, + struct xfs_dquot **O_dqpp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); + struct xfs_dquot *dqp; + xfs_dqid_t id; + int error; - trace_xfs_dqget_hit(dqp); - XFS_STATS_INC(mp, xs_qm_dqcachehits); + error = xfs_qm_dqget_checks(mp, type); + if (error) + return error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_inode_dquot(ip, type) == NULL); + + id = xfs_qm_id_for_quotatype(ip, type); + +restart: + dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); + if (dqp) { *O_dqpp = dqp; return 0; } - mutex_unlock(&qi->qi_tree_lock); - XFS_STATS_INC(mp, xs_qm_dqcachemisses); /* * Dquot cache miss. We don't want to keep the inode lock across @@ -758,87 +913,81 @@ restart: * lock here means dealing with a chown that can happen before * we re-acquire the lock. */ - if (ip) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - error = xfs_qm_dqread(mp, id, type, flags, &dqp); - - if (ip) - xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* If we are asked to find next active id, keep looking */ - if (error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) { - error = xfs_dq_get_next_id(mp, type, &id); - if (!error) - goto restart; - } - + xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); + xfs_ilock(ip, XFS_ILOCK_EXCL); if (error) return error; - if (ip) { - /* - * A dquot could be attached to this inode by now, since - * we had dropped the ilock. - */ - if (xfs_this_quota_on(mp, type)) { - struct xfs_dquot *dqp1; - - dqp1 = xfs_inode_dquot(ip, type); - if (dqp1) { - xfs_qm_dqdestroy(dqp); - dqp = dqp1; - xfs_dqlock(dqp); - goto dqret; - } - } else { - /* inode stays locked on return */ + /* + * A dquot could be attached to this inode by now, since we had + * dropped the ilock. + */ + if (xfs_this_quota_on(mp, type)) { + struct xfs_dquot *dqp1; + + dqp1 = xfs_inode_dquot(ip, type); + if (dqp1) { xfs_qm_dqdestroy(dqp); - return -ESRCH; + dqp = dqp1; + xfs_dqlock(dqp); + goto dqret; } + } else { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return -ESRCH; } - mutex_lock(&qi->qi_tree_lock); - error = radix_tree_insert(tree, id, dqp); - if (unlikely(error)) { - WARN_ON(error != -EEXIST); - + error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); + if (error) { /* * Duplicate found. Just throw away the new dquot and start * over. */ - mutex_unlock(&qi->qi_tree_lock); - trace_xfs_dqget_dup(dqp); xfs_qm_dqdestroy(dqp); XFS_STATS_INC(mp, xs_qm_dquot_dups); goto restart; } - /* - * We return a locked dquot to the caller, with a reference taken - */ - xfs_dqlock(dqp); - dqp->q_nrefs = 1; +dqret: + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + trace_xfs_dqget_miss(dqp); + *O_dqpp = dqp; + return 0; +} - qi->qi_dquots++; - mutex_unlock(&qi->qi_tree_lock); +/* + * Starting at @id and progressing upwards, look for an initialized incore + * dquot, lock it, and return it. + */ +int +xfs_qm_dqget_next( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct xfs_dquot **dqpp) +{ + struct xfs_dquot *dqp; + int error = 0; - /* If we are asked to find next active id, keep looking */ - if (flags & XFS_QMOPT_DQNEXT) { - if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { - xfs_qm_dqput(dqp); - error = xfs_dq_get_next_id(mp, type, &id); - if (error) - return error; - goto restart; + *dqpp = NULL; + for (; !error; error = xfs_dq_get_next_id(mp, type, &id)) { + error = xfs_qm_dqget(mp, id, type, false, &dqp); + if (error == -ENOENT) + continue; + else if (error != 0) + break; + + if (!XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + *dqpp = dqp; + return 0; } + + xfs_qm_dqput(dqp); } - dqret: - ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); - trace_xfs_dqget_miss(dqp); - *O_dqpp = dqp; - return 0; + return error; } /* @@ -913,9 +1062,9 @@ xfs_qm_dqflush_done( * since it's cheaper, and then we recheck while * holding the lock before removing the dquot from the AIL. */ - if ((lip->li_flags & XFS_LI_IN_AIL) && + if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) && ((lip->li_lsn == qip->qli_flush_lsn) || - (lip->li_flags & XFS_LI_FAILED))) { + test_bit(XFS_LI_FAILED, &lip->li_flags))) { /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->ail_lock); @@ -926,8 +1075,7 @@ xfs_qm_dqflush_done( * Clear the failed state since we are about to drop the * flush lock */ - if (lip->li_flags & XFS_LI_FAILED) - xfs_clear_li_failed(lip); + xfs_clear_li_failed(lip); spin_unlock(&ailp->ail_lock); } } @@ -953,6 +1101,7 @@ xfs_qm_dqflush( { struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; + struct xfs_dqblk *dqb; struct xfs_disk_dquot *ddqp; xfs_failaddr_t fa; int error; @@ -996,12 +1145,13 @@ xfs_qm_dqflush( /* * Calculate the location of the dquot inside the buffer. */ - ddqp = bp->b_addr + dqp->q_bufoffset; + dqb = bp->b_addr + dqp->q_bufoffset; + ddqp = &dqb->dd_diskdq; /* - * A simple sanity check in case we got a corrupted dquot.. + * A simple sanity check in case we got a corrupted dquot. */ - fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 0); + fa = xfs_dqblk_verify(mp, dqb, be32_to_cpu(ddqp->d_id), 0); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", be32_to_cpu(ddqp->d_id), fa); @@ -1032,8 +1182,6 @@ xfs_qm_dqflush( * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; - dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); @@ -1119,3 +1267,35 @@ xfs_qm_exit(void) kmem_zone_destroy(xfs_qm_dqtrxzone); kmem_zone_destroy(xfs_qm_dqzone); } + +/* + * Iterate every dquot of a particular type. The caller must ensure that the + * particular quota type is active. iter_fn can return negative error codes, + * or XFS_BTREE_QUERY_RANGE_ABORT to indicate that it wants to stop iterating. + */ +int +xfs_qm_dqiterate( + struct xfs_mount *mp, + uint dqtype, + xfs_qm_dqiterate_fn iter_fn, + void *priv) +{ + struct xfs_dquot *dq; + xfs_dqid_t id = 0; + int error; + + do { + error = xfs_qm_dqget_next(mp, id, dqtype, &dq); + if (error == -ENOENT) + return 0; + if (error) + return error; + + error = iter_fn(dq, dqtype, priv); + id = be32_to_cpu(dq->q_core.d_id); + xfs_qm_dqput(dq); + id++; + } while (error == 0 && id != 0); + + return error; +} diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 2f536f33cd26..bdd6bd921528 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -160,8 +160,6 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) #define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) -extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, - uint, struct xfs_dquot **); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); @@ -169,8 +167,19 @@ extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); extern void xfs_qm_adjust_dqlimits(struct xfs_mount *, struct xfs_dquot *); -extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, - xfs_dqid_t, uint, uint, xfs_dquot_t **); +extern xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, + uint type); +extern int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id, + uint type, bool can_alloc, + struct xfs_dquot **dqpp); +extern int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type, + bool can_alloc, + struct xfs_dquot **dqpp); +extern int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, + uint type, struct xfs_dquot **dqpp); +extern int xfs_qm_dqget_uncached(struct xfs_mount *mp, + xfs_dqid_t id, uint type, + struct xfs_dquot **dqpp); extern void xfs_qm_dqput(xfs_dquot_t *); extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); @@ -185,4 +194,9 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } +typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, uint dqtype, + void *priv); +int xfs_qm_dqiterate(struct xfs_mount *mp, uint dqtype, + xfs_qm_dqiterate_fn iter_fn, void *priv); + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 4b331e354da7..8eb7415474d6 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -173,7 +173,7 @@ xfs_qm_dquot_logitem_push( * The buffer containing this item failed to be written back * previously. Resubmit the buffer for IO */ - if (lip->li_flags & XFS_LI_FAILED) { + if (test_bit(XFS_LI_FAILED, &lip->li_flags)) { if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; @@ -209,10 +209,7 @@ xfs_qm_dquot_logitem_push( spin_unlock(&lip->li_ailp->ail_lock); error = xfs_qm_dqflush(dqp, &bp); - if (error) { - xfs_warn(dqp->q_mount, "%s: push error %d on dqp "PTR_FMT, - __func__, error, dqp); - } else { + if (!error) { if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index a63f5083f497..7975634cb8fe 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -61,6 +61,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_LOG_BAD_CRC, XFS_RANDOM_LOG_ITEM_PIN, XFS_RANDOM_BUF_LRU_REF, + XFS_RANDOM_FORCE_SCRUB_REPAIR, }; struct xfs_errortag_attr { @@ -167,6 +168,7 @@ XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); +XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -201,6 +203,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(log_bad_crc), XFS_ERRORTAG_ATTR_LIST(log_item_pin), XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), + XFS_ERRORTAG_ATTR_LIST(force_repair), NULL, }; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index b5b1e567b9f4..a889b550979a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -168,7 +168,7 @@ STATIC void xfs_efi_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) xfs_efi_release(EFI_ITEM(lip)); } @@ -402,7 +402,7 @@ xfs_efd_item_unlock( { struct xfs_efd_log_item *efdp = EFD_ITEM(lip); - if (lip->li_flags & XFS_LI_ABORTED) { + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { xfs_efi_release(efdp->efd_efip); xfs_efd_item_free(efdp); } @@ -542,7 +542,7 @@ xfs_efi_recover( for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &efip->efi_format.efi_extents[i]; error = xfs_trans_free_extent(tp, efdp, extp->ext_start, - extp->ext_len, &oinfo); + extp->ext_len, &oinfo, false); if (error) goto abort_error; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e70fb8ccecea..bed07dfbb85e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -312,7 +312,7 @@ restart: if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock); + error = xfs_break_layouts(inode, iolock, BREAK_WRITE); if (error) return error; @@ -414,6 +414,12 @@ xfs_dio_write_end_io( if (size <= 0) return size; + /* + * Capture amount written on completion as we can't reliably account + * for it on submission. + */ + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size); + if (flags & IOMAP_DIO_COW) { error = xfs_reflink_end_cow(ip, offset, size); if (error) @@ -599,7 +605,16 @@ xfs_file_dax_write( } out: xfs_iunlock(ip, iolock); - return error ? error : ret; + if (error) + return error; + + if (ret > 0) { + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); + + /* Handle various SYNC-type writes */ + ret = generic_write_sync(iocb, ret); + } + return ret; } STATIC ssize_t @@ -669,6 +684,12 @@ write_retry: out: if (iolock) xfs_iunlock(ip, iolock); + + if (ret > 0) { + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); + /* Handle various SYNC-type writes */ + ret = generic_write_sync(iocb, ret); + } return ret; } @@ -693,8 +714,9 @@ xfs_file_write_iter( return -EIO; if (IS_DAX(inode)) - ret = xfs_file_dax_write(iocb, from); - else if (iocb->ki_flags & IOCB_DIRECT) { + return xfs_file_dax_write(iocb, from); + + if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered * write *only* in the case that we're doing a reflink @@ -702,20 +724,74 @@ xfs_file_write_iter( * allow an operation to fall back to buffered mode. */ ret = xfs_file_dio_aio_write(iocb, from); - if (ret == -EREMCHG) - goto buffered; - } else { -buffered: - ret = xfs_file_buffered_aio_write(iocb, from); + if (ret != -EREMCHG) + return ret; } - if (ret > 0) { - XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); + return xfs_file_buffered_aio_write(iocb, from); +} - /* Handle various SYNC-type writes */ - ret = generic_write_sync(iocb, ret); - } - return ret; +static void +xfs_wait_dax_page( + struct inode *inode, + bool *did_unlock) +{ + struct xfs_inode *ip = XFS_I(inode); + + *did_unlock = true; + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); + schedule(); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); +} + +static int +xfs_break_dax_layouts( + struct inode *inode, + uint iolock, + bool *did_unlock) +{ + struct page *page; + + ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL)); + + page = dax_layout_busy_page(inode->i_mapping); + if (!page) + return 0; + + return ___wait_var_event(&page->_refcount, + atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, + 0, 0, xfs_wait_dax_page(inode, did_unlock)); +} + +int +xfs_break_layouts( + struct inode *inode, + uint *iolock, + enum layout_break_reason reason) +{ + bool retry; + int error; + + ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)); + + do { + retry = false; + switch (reason) { + case BREAK_UNMAP: + error = xfs_break_dax_layouts(inode, *iolock, &retry); + if (error || retry) + break; + /* fall through */ + case BREAK_WRITE: + error = xfs_break_leased_layouts(inode, iolock, &retry); + break; + default: + WARN_ON_ONCE(1); + error = -EINVAL; + } + } while (error == 0 && retry); + + return error; } #define XFS_FALLOC_FL_SUPPORTED \ @@ -734,7 +810,7 @@ xfs_file_fallocate( struct xfs_inode *ip = XFS_I(inode); long error; enum xfs_prealloc_flags flags = 0; - uint iolock = XFS_IOLOCK_EXCL; + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; loff_t new_size = 0; bool do_file_insert = false; @@ -744,13 +820,10 @@ xfs_file_fallocate( return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) goto out_unlock; - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - iolock |= XFS_MMAPLOCK_EXCL; - if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -1007,7 +1080,7 @@ xfs_file_llseek( * page_lock (MM) * i_lock (XFS - extent map serialisation) */ -static int +static vm_fault_t __xfs_filemap_fault( struct vm_fault *vmf, enum page_entry_size pe_size, @@ -1015,7 +1088,7 @@ __xfs_filemap_fault( { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); - int ret; + vm_fault_t ret; trace_xfs_filemap_fault(ip, pe_size, write_fault); @@ -1044,7 +1117,7 @@ __xfs_filemap_fault( return ret; } -static int +static vm_fault_t xfs_filemap_fault( struct vm_fault *vmf) { @@ -1054,7 +1127,7 @@ xfs_filemap_fault( (vmf->flags & FAULT_FLAG_WRITE)); } -static int +static vm_fault_t xfs_filemap_huge_fault( struct vm_fault *vmf, enum page_entry_size pe_size) @@ -1067,7 +1140,7 @@ xfs_filemap_huge_fault( (vmf->flags & FAULT_FLAG_WRITE)); } -static int +static vm_fault_t xfs_filemap_page_mkwrite( struct vm_fault *vmf) { @@ -1079,7 +1152,7 @@ xfs_filemap_page_mkwrite( * on write faults. In reality, it needs to serialise against truncate and * prepare memory for writing so handle is as standard write fault. */ -static int +static vm_fault_t xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 43cfc07996a4..0299febece9c 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -465,10 +465,9 @@ xfs_getfsmap_rtdev_rtbitmap_helper( struct xfs_rmap_irec irec; xfs_daddr_t rec_daddr; - rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock); - - irec.rm_startblock = rec->ar_startblock; - irec.rm_blockcount = rec->ar_blockcount; + irec.rm_startblock = rec->ar_startext * mp->m_sb.sb_rextsize; + rec_daddr = XFS_FSB_TO_BB(mp, irec.rm_startblock); + irec.rm_blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ irec.rm_offset = 0; irec.rm_flags = 0; @@ -534,8 +533,11 @@ xfs_getfsmap_rtdev_rtbitmap_query( xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); - alow.ar_startblock = info->low.rm_startblock; - ahigh.ar_startblock = info->high.rm_startblock; + alow.ar_startext = info->low.rm_startblock; + ahigh.ar_startext = info->high.rm_startblock; + do_div(alow.ar_startext, tp->t_mountp->m_sb.sb_rextsize); + if (do_div(ahigh.ar_startext, tp->t_mountp->m_sb.sb_rextsize)) + ahigh.ar_startext++; error = xfs_rtalloc_query_range(tp, &alow, &ahigh, xfs_getfsmap_rtdev_rtbitmap_helper, info); if (error) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 523792768080..bc7ef18da243 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -24,85 +24,42 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" -#include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" #include "xfs_error.h" #include "xfs_btree.h" -#include "xfs_alloc_btree.h" #include "xfs_alloc.h" -#include "xfs_rmap_btree.h" -#include "xfs_ialloc.h" #include "xfs_fsops.h" -#include "xfs_itable.h" #include "xfs_trans_space.h" #include "xfs_rtalloc.h" #include "xfs_trace.h" #include "xfs_log.h" -#include "xfs_filestream.h" -#include "xfs_rmap.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" /* - * File system operations + * growfs operations */ - -static struct xfs_buf * -xfs_growfs_get_hdr_buf( - struct xfs_mount *mp, - xfs_daddr_t blkno, - size_t numblks, - int flags, - const struct xfs_buf_ops *ops) -{ - struct xfs_buf *bp; - - bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); - if (!bp) - return NULL; - - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - bp->b_bn = blkno; - bp->b_maps[0].bm_bn = blkno; - bp->b_ops = ops; - - return bp; -} - static int xfs_growfs_data_private( xfs_mount_t *mp, /* mount point for filesystem */ xfs_growfs_data_t *in) /* growfs data input struct */ { - xfs_agf_t *agf; - struct xfs_agfl *agfl; - xfs_agi_t *agi; - xfs_agnumber_t agno; - xfs_extlen_t agsize; - xfs_extlen_t tmpsize; - xfs_alloc_rec_t *arec; xfs_buf_t *bp; - int bucket; - int dpct; - int error, saved_error = 0; + int error; xfs_agnumber_t nagcount; xfs_agnumber_t nagimax = 0; xfs_rfsblock_t nb, nb_mod; xfs_rfsblock_t new; - xfs_rfsblock_t nfree; xfs_agnumber_t oagcount; - int pct; xfs_trans_t *tp; + LIST_HEAD (buffer_list); + struct aghdr_init_data id = {}; nb = in->newblocks; - pct = in->imaxpct; - if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100) + if (nb < mp->m_sb.sb_dblocks) return -EINVAL; if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) return error; - dpct = pct - mp->m_sb.sb_imax_pct; error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); @@ -135,376 +92,45 @@ xfs_growfs_data_private( return error; /* - * Write new AG headers to disk. Non-transactional, but written - * synchronously so they are completed prior to the growfs transaction - * being logged. + * Write new AG headers to disk. Non-transactional, but need to be + * written and completed prior to the growfs transaction being logged. + * To do this, we use a delayed write buffer list and wait for + * submission and IO completion of the list as a whole. This allows the + * IO subsystem to merge all the AG headers in a single AG into a single + * IO and hide most of the latency of the IO from us. + * + * This also means that if we get an error whilst building the buffer + * list to write, we can cancel the entire list without having written + * anything. */ - nfree = 0; - for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { - __be32 *agfl_bno; - - /* - * AG freespace header block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, - &xfs_agf_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - agf = XFS_BUF_TO_AGF(bp); - agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); - agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); - agf->agf_seqno = cpu_to_be32(agno); - if (agno == nagcount - 1) - agsize = - nb - - (agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks); + INIT_LIST_HEAD(&id.buffer_list); + for (id.agno = nagcount - 1; + id.agno >= oagcount; + id.agno--, new -= id.agsize) { + + if (id.agno == nagcount - 1) + id.agsize = nb - + (id.agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks); else - agsize = mp->m_sb.sb_agblocks; - agf->agf_length = cpu_to_be32(agsize); - agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp)); - agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); - agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); - agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { - agf->agf_roots[XFS_BTNUM_RMAPi] = - cpu_to_be32(XFS_RMAP_BLOCK(mp)); - agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); - agf->agf_rmap_blocks = cpu_to_be32(1); - } - - agf->agf_flfirst = cpu_to_be32(1); - agf->agf_fllast = 0; - agf->agf_flcount = 0; - tmpsize = agsize - mp->m_ag_prealloc_blocks; - agf->agf_freeblks = cpu_to_be32(tmpsize); - agf->agf_longest = cpu_to_be32(tmpsize); - if (xfs_sb_version_hascrc(&mp->m_sb)) - uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); - if (xfs_sb_version_hasreflink(&mp->m_sb)) { - agf->agf_refcount_root = cpu_to_be32( - xfs_refc_block(mp)); - agf->agf_refcount_level = cpu_to_be32(1); - agf->agf_refcount_blocks = cpu_to_be32(1); - } - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* - * AG freelist header block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, - &xfs_agfl_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - agfl = XFS_BUF_TO_AGFL(bp); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); - agfl->agfl_seqno = cpu_to_be32(agno); - uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); - } - - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); - for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++) - agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* - * AG inode header block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, - &xfs_agi_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - agi = XFS_BUF_TO_AGI(bp); - agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); - agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); - agi->agi_seqno = cpu_to_be32(agno); - agi->agi_length = cpu_to_be32(agsize); - agi->agi_count = 0; - agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp)); - agi->agi_level = cpu_to_be32(1); - agi->agi_freecount = 0; - agi->agi_newino = cpu_to_be32(NULLAGINO); - agi->agi_dirino = cpu_to_be32(NULLAGINO); - if (xfs_sb_version_hascrc(&mp->m_sb)) - uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); - if (xfs_sb_version_hasfinobt(&mp->m_sb)) { - agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); - agi->agi_free_level = cpu_to_be32(1); - } - for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) - agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; + id.agsize = mp->m_sb.sb_agblocks; - /* - * BNO btree root block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_allocbt_buf_ops); - - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, agno, 0); - - arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); - arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); - arec->ar_blockcount = cpu_to_be32( - agsize - be32_to_cpu(arec->ar_startblock)); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* - * CNT btree root block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_allocbt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, agno, 0); - - arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); - arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); - arec->ar_blockcount = cpu_to_be32( - agsize - be32_to_cpu(arec->ar_startblock)); - nfree += be32_to_cpu(arec->ar_blockcount); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* RMAP btree root block */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { - struct xfs_rmap_rec *rrec; - struct xfs_btree_block *block; - - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_RMAP_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_rmapbt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 0, - agno, 0); - block = XFS_BUF_TO_BLOCK(bp); - - - /* - * mark the AG header regions as static metadata The BNO - * btree block is the first block after the headers, so - * it's location defines the size of region the static - * metadata consumes. - * - * Note: unlike mkfs, we never have to account for log - * space when growing the data regions - */ - rrec = XFS_RMAP_REC_ADDR(block, 1); - rrec->rm_startblock = 0; - rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp)); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - - /* account freespace btree root blocks */ - rrec = XFS_RMAP_REC_ADDR(block, 2); - rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp)); - rrec->rm_blockcount = cpu_to_be32(2); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - - /* account inode btree root blocks */ - rrec = XFS_RMAP_REC_ADDR(block, 3); - rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp)); - rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) - - XFS_IBT_BLOCK(mp)); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - - /* account for rmap btree root */ - rrec = XFS_RMAP_REC_ADDR(block, 4); - rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp)); - rrec->rm_blockcount = cpu_to_be32(1); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - - /* account for refc btree root */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { - rrec = XFS_RMAP_REC_ADDR(block, 5); - rrec->rm_startblock = cpu_to_be32( - xfs_refc_block(mp)); - rrec->rm_blockcount = cpu_to_be32(1); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - } - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - } - - /* - * INO btree root block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_inobt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* - * FINO btree root block - */ - if (xfs_sb_version_hasfinobt(&mp->m_sb)) { - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_inobt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO, - 0, 0, agno, 0); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - } - - /* - * refcount btree root block - */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_refcountbt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC, - 0, 0, agno, 0); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - } - } - xfs_trans_agblocks_delta(tp, nfree); - /* - * There are new blocks in the old last a.g. - */ - if (new) { - struct xfs_owner_info oinfo; - - /* - * Change the agi length. - */ - error = xfs_ialloc_read_agi(mp, tp, agno, &bp); - if (error) { - goto error0; - } - ASSERT(bp); - agi = XFS_BUF_TO_AGI(bp); - be32_add_cpu(&agi->agi_length, new); - ASSERT(nagcount == oagcount || - be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); - xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); - /* - * Change agf length. - */ - error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp); + error = xfs_ag_init_headers(mp, &id); if (error) { - goto error0; + xfs_buf_delwri_cancel(&id.buffer_list); + goto out_trans_cancel; } - ASSERT(bp); - agf = XFS_BUF_TO_AGF(bp); - be32_add_cpu(&agf->agf_length, new); - ASSERT(be32_to_cpu(agf->agf_length) == - be32_to_cpu(agi->agi_length)); + } + error = xfs_buf_delwri_submit(&id.buffer_list); + if (error) + goto out_trans_cancel; - xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); + xfs_trans_agblocks_delta(tp, id.nfree); - /* - * Free the new space. - * - * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that - * this doesn't actually exist in the rmap btree. - */ - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); - error = xfs_rmap_free(tp, bp, agno, - be32_to_cpu(agf->agf_length) - new, - new, &oinfo); - if (error) - goto error0; - error = xfs_free_extent(tp, - XFS_AGB_TO_FSB(mp, agno, - be32_to_cpu(agf->agf_length) - new), - new, &oinfo, XFS_AG_RESV_NONE); + /* If there are new blocks in the old last AG, extend it. */ + if (new) { + error = xfs_ag_extend_space(mp, tp, &id, new); if (error) - goto error0; + goto out_trans_cancel; } /* @@ -517,10 +143,8 @@ xfs_growfs_data_private( if (nb > mp->m_sb.sb_dblocks) xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, nb - mp->m_sb.sb_dblocks); - if (nfree) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree); - if (dpct) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); + if (id.nfree) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree); xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) @@ -529,12 +153,6 @@ xfs_growfs_data_private( /* New allocation groups fully initialized, so update mount struct */ if (nagimax) mp->m_maxagi = nagimax; - if (mp->m_sb.sb_imax_pct) { - uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; - do_div(icount, 100); - mp->m_maxicount = icount << mp->m_sb.sb_inopblog; - } else - mp->m_maxicount = 0; xfs_set_low_space_thresholds(mp); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); @@ -545,73 +163,24 @@ xfs_growfs_data_private( if (new) { struct xfs_perag *pag; - pag = xfs_perag_get(mp, agno); + pag = xfs_perag_get(mp, id.agno); error = xfs_ag_resv_free(pag); xfs_perag_put(pag); if (error) - goto out; + return error; } - /* Reserve AG metadata blocks. */ + /* + * Reserve AG metadata blocks. ENOSPC here does not mean there was a + * growfs failure, just that there still isn't space for new user data + * after the grow has been run. + */ error = xfs_fs_reserve_ag_blocks(mp); - if (error && error != -ENOSPC) - goto out; - - /* update secondary superblocks. */ - for (agno = 1; agno < nagcount; agno++) { + if (error == -ENOSPC) error = 0; - /* - * new secondary superblocks need to be zeroed, not read from - * disk as the contents of the new area we are growing into is - * completely unknown. - */ - if (agno < oagcount) { - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, - &xfs_sb_buf_ops); - } else { - bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0); - if (bp) { - bp->b_ops = &xfs_sb_buf_ops; - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - } else - error = -ENOMEM; - } - - /* - * If we get an error reading or writing alternate superblocks, - * continue. xfs_repair chooses the "best" superblock based - * on most matches; if we break early, we'll leave more - * superblocks un-updated than updated, and xfs_repair may - * pick them over the properly-updated primary. - */ - if (error) { - xfs_warn(mp, - "error %d reading secondary superblock for ag %d", - error, agno); - saved_error = error; - continue; - } - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) { - xfs_warn(mp, - "write error %d updating secondary superblock for ag %d", - error, agno); - saved_error = error; - continue; - } - } - - out: - return saved_error ? saved_error : error; + return error; - error0: +out_trans_cancel: xfs_trans_cancel(tp); return error; } @@ -638,25 +207,71 @@ xfs_growfs_log_private( return -ENOSYS; } +static int +xfs_growfs_imaxpct( + struct xfs_mount *mp, + __u32 imaxpct) +{ + struct xfs_trans *tp; + int dpct; + int error; + + if (imaxpct > 100) + return -EINVAL; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, + XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + dpct = imaxpct - mp->m_sb.sb_imax_pct; + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp); +} + /* * protected versions of growfs function acquire and release locks on the mount * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG, * XFS_IOC_FSGROWFSRT */ - - int xfs_growfs_data( - xfs_mount_t *mp, - xfs_growfs_data_t *in) + struct xfs_mount *mp, + struct xfs_growfs_data *in) { - int error; + int error = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; - error = xfs_growfs_data_private(mp, in); + + /* update imaxpct separately to the physical grow of the filesystem */ + if (in->imaxpct != mp->m_sb.sb_imax_pct) { + error = xfs_growfs_imaxpct(mp, in->imaxpct); + if (error) + goto out_error; + } + + if (in->newblocks != mp->m_sb.sb_dblocks) { + error = xfs_growfs_data_private(mp, in); + if (error) + goto out_error; + } + + /* Post growfs calculations needed to reflect new state in operations */ + if (mp->m_sb.sb_imax_pct) { + uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; + do_div(icount, 100); + mp->m_maxicount = icount << mp->m_sb.sb_inopblog; + } else + mp->m_maxicount = 0; + + /* Update secondary superblocks now the physical grow has completed */ + error = xfs_update_secondary_sbs(mp); + +out_error: /* * Increment the generation unconditionally, the error could be from * updating the secondary superblocks, in which case the new size diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 3e1cc3001bcb..fdde17a2333c 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -47,6 +47,7 @@ xfs_param_t xfs_params = { struct xfs_globals xfs_globals = { .log_recovery_delay = 0, /* no delay by default */ + .mount_delay = 0, /* no delay by default */ #ifdef XFS_ASSERT_FATAL .bug_on_assert = true, /* assert failures BUG() */ #else diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9a18f69f6e96..164350d91efc 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -107,7 +107,8 @@ xfs_inode_free_callback( xfs_idestroy_fork(ip, XFS_COW_FORK); if (ip->i_itemp) { - ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!test_bit(XFS_LI_IN_AIL, + &ip->i_itemp->ili_item.li_flags)); xfs_inode_item_destroy(ip); ip->i_itemp = NULL; } @@ -309,6 +310,46 @@ xfs_reinit_inode( } /* + * If we are allocating a new inode, then check what was returned is + * actually a free, empty inode. If we are not allocating an inode, + * then check we didn't find a free inode. + * + * Returns: + * 0 if the inode free state matches the lookup context + * -ENOENT if the inode is free and we are not allocating + * -EFSCORRUPTED if there is any state mismatch at all + */ +static int +xfs_iget_check_free_state( + struct xfs_inode *ip, + int flags) +{ + if (flags & XFS_IGET_CREATE) { + /* should be a free inode */ + if (VFS_I(ip)->i_mode != 0) { + xfs_warn(ip->i_mount, +"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)", + ip->i_ino, VFS_I(ip)->i_mode); + return -EFSCORRUPTED; + } + + if (ip->i_d.di_nblocks != 0) { + xfs_warn(ip->i_mount, +"Corruption detected! Free inode 0x%llx has blocks allocated!", + ip->i_ino); + return -EFSCORRUPTED; + } + return 0; + } + + /* should be an allocated inode */ + if (VFS_I(ip)->i_mode == 0) + return -ENOENT; + + return 0; +} + +/* * Check the validity of the inode we just found it the cache */ static int @@ -357,12 +398,12 @@ xfs_iget_cache_hit( } /* - * If lookup is racing with unlink return an error immediately. + * Check the inode free state is valid. This also detects lookup + * racing with unlinks. */ - if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) { - error = -ENOENT; + error = xfs_iget_check_free_state(ip, flags); + if (error) goto out_error; - } /* * If IRECLAIMABLE is set, we've torn down the VFS inode already. @@ -485,29 +526,12 @@ xfs_iget_cache_miss( /* - * If we are allocating a new inode, then check what was returned is - * actually a free, empty inode. If we are not allocating an inode, - * the check we didn't find a free inode. + * Check the inode free state is valid. This also detects lookup + * racing with unlinks. */ - if (flags & XFS_IGET_CREATE) { - if (VFS_I(ip)->i_mode != 0) { - xfs_warn(mp, -"Corruption detected! Free inode 0x%llx not marked free on disk", - ino); - error = -EFSCORRUPTED; - goto out_destroy; - } - if (ip->i_d.di_nblocks != 0) { - xfs_warn(mp, -"Corruption detected! Free inode 0x%llx has blocks allocated!", - ino); - error = -EFSCORRUPTED; - goto out_destroy; - } - } else if (VFS_I(ip)->i_mode == 0) { - error = -ENOENT; + error = xfs_iget_check_free_state(ip, flags); + if (error) goto out_destroy; - } /* * Preload the radix tree so we can insert safely under the @@ -1802,3 +1826,21 @@ xfs_inode_clear_cowblocks_tag( return __xfs_inode_clear_blocks_tag(ip, trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); } + +/* Disable post-EOF and CoW block auto-reclamation. */ +void +xfs_icache_disable_reclaim( + struct xfs_mount *mp) +{ + cancel_delayed_work_sync(&mp->m_eofblocks_work); + cancel_delayed_work_sync(&mp->m_cowblocks_work); +} + +/* Enable post-EOF and CoW block auto-reclamation. */ +void +xfs_icache_enable_reclaim( + struct xfs_mount *mp) +{ + xfs_queue_eofblocks(mp); + xfs_queue_cowblocks(mp); +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index d4a77588eca1..d69a0f5a6a73 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -131,4 +131,7 @@ xfs_fs_eofblocks_from_user( int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); +void xfs_icache_disable_reclaim(struct xfs_mount *mp); +void xfs_icache_enable_reclaim(struct xfs_mount *mp); + #endif diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 865ad1373e5e..5da9599156ed 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -91,7 +91,7 @@ xfs_icreate_item_unlock( { struct xfs_icreate_item *icp = ICR_ITEM(lip); - if (icp->ic_item.li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) kmem_zone_free(xfs_icreate_zone, icp); return; } @@ -184,5 +184,5 @@ xfs_icreate_log( xfs_trans_add_item(tp, &icp->ic_item); tp->t_flags |= XFS_TRANS_DIRTY; - icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2b70c8b4cee2..05207a64dd53 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -498,7 +498,7 @@ again: if (!try_lock) { for (j = (i - 1); j >= 0 && !try_lock; j--) { lp = (xfs_log_item_t *)ips[j]->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) + if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) try_lock++; } } @@ -598,7 +598,7 @@ xfs_lock_two_inodes( * and try again. */ lp = (xfs_log_item_t *)ip0->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) { if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) { xfs_iunlock(ip0, ip0_mode); if ((++attempts % 5) == 0) @@ -791,6 +791,18 @@ xfs_ialloc( ASSERT(*ialloc_context == NULL); /* + * Protect against obviously corrupt allocation btree records. Later + * xfs_iget checks will catch re-allocation of other active in-memory + * and on-disk inodes. If we don't catch reallocating the parent inode + * here we will deadlock in xfs_iget() so we have to do these checks + * first. + */ + if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) { + xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino); + return -EFSCORRUPTED; + } + + /* * Get the in-core inode with the lock held exclusively. * This is because we're setting fields here we need * to prevent others from looking at until we're done. @@ -1196,6 +1208,7 @@ xfs_create( unlock_dp_on_error = true; xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; /* * Reserve disk quota and the inode. @@ -1411,11 +1424,11 @@ xfs_link( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - error = xfs_qm_dqattach(sip, 0); + error = xfs_qm_dqattach(sip); if (error) goto std_return; - error = xfs_qm_dqattach(tdp, 0); + error = xfs_qm_dqattach(tdp); if (error) goto std_return; @@ -1451,6 +1464,7 @@ xfs_link( } xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; /* * Handle initial link state of O_TMPFILE inode @@ -1534,11 +1548,12 @@ xfs_itruncate_clear_reflink_flags( * dirty on error so that transactions can be easily aborted if possible. */ int -xfs_itruncate_extents( +xfs_itruncate_extents_flags( struct xfs_trans **tpp, struct xfs_inode *ip, int whichfork, - xfs_fsize_t new_size) + xfs_fsize_t new_size, + int flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; @@ -1561,6 +1576,8 @@ xfs_itruncate_extents( trace_xfs_itruncate_extents_start(ip, new_size); + flags |= xfs_bmapi_aflag(whichfork); + /* * Since it is possible for space to become allocated beyond * the end of the file (in a crash where the space is allocated @@ -1579,12 +1596,9 @@ xfs_itruncate_extents( unmap_len = last_block - first_unmap_block + 1; while (!done) { xfs_defer_init(&dfops, &first_block); - error = xfs_bunmapi(tp, ip, - first_unmap_block, unmap_len, - xfs_bmapi_aflag(whichfork), - XFS_ITRUNC_MAX_EXTENTS, - &first_block, &dfops, - &done); + error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, + XFS_ITRUNC_MAX_EXTENTS, &first_block, + &dfops, &done); if (error) goto out_bmap_cancel; @@ -1811,6 +1825,7 @@ xfs_inactive_ifree( xfs_trans_ijoin(tp, ip, 0); xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; error = xfs_ifree(tp, ip, &dfops); if (error) { /* @@ -1911,7 +1926,7 @@ xfs_inactive( ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return; @@ -2574,11 +2589,11 @@ xfs_remove( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - error = xfs_qm_dqattach(dp, 0); + error = xfs_qm_dqattach(dp); if (error) goto std_return; - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) goto std_return; @@ -2647,6 +2662,7 @@ xfs_remove( goto out_trans_cancel; xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; error = xfs_dir_removename(tp, dp, name, ip->i_ino, &first_block, &dfops, resblks); if (error) { @@ -3014,6 +3030,7 @@ xfs_rename( } xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; /* RENAME_EXCHANGE is unique from here on. */ if (flags & RENAME_EXCHANGE) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1eebc53df7d7..a91d9fb1effc 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -379,6 +379,20 @@ static inline void xfs_ifunlock(struct xfs_inode *ip) >> XFS_ILOCK_SHIFT) /* + * Layouts are broken in the BREAK_WRITE case to ensure that + * layout-holders do not collide with local writes. Additionally, + * layouts are broken in the BREAK_UNMAP case to make sure the + * layout-holder has a consistent view of the file's extent map. While + * BREAK_WRITE breaks can be satisfied by recalling FL_LAYOUT leases, + * BREAK_UNMAP breaks additionally require waiting for busy dax-pages to + * go idle. + */ +enum layout_break_reason { + BREAK_WRITE, + BREAK_UNMAP, +}; + +/* * For multiple groups support: if S_ISGID bit is set in the parent * directory, group of new file is set to that of the parent, and * new subdirectory gets S_ISGID bit from parent. @@ -415,8 +429,8 @@ uint xfs_ilock_attr_map_shared(struct xfs_inode *); uint xfs_ip2xflags(struct xfs_inode *); int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_defer_ops *); -int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, - int, xfs_fsize_t); +int xfs_itruncate_extents_flags(struct xfs_trans **, + struct xfs_inode *, int, xfs_fsize_t, int); void xfs_iext_realloc(xfs_inode_t *, int, int); void xfs_iunpin_wait(xfs_inode_t *); @@ -433,6 +447,16 @@ int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, dev_t, prid_t, struct xfs_inode **); +static inline int +xfs_itruncate_extents( + struct xfs_trans **tpp, + struct xfs_inode *ip, + int whichfork, + xfs_fsize_t new_size) +{ + return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0); +} + /* from xfs_file.c */ enum xfs_prealloc_flags { XFS_PREALLOC_SET = (1 << 1), @@ -443,6 +467,8 @@ enum xfs_prealloc_flags { int xfs_update_prealloc_flags(struct xfs_inode *ip, enum xfs_prealloc_flags flags); +int xfs_break_layouts(struct inode *inode, uint *iolock, + enum layout_break_reason reason); /* from xfs_iops.c */ extern void xfs_setup_inode(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 34b91b789702..3e5b8574818e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -518,7 +518,7 @@ xfs_inode_item_push( * The buffer containing this item failed to be written back * previously. Resubmit the buffer for IO. */ - if (lip->li_flags & XFS_LI_FAILED) { + if (test_bit(XFS_LI_FAILED, &lip->li_flags)) { if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; @@ -729,14 +729,14 @@ xfs_iflush_done( */ iip = INODE_ITEM(blip); if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || - (blip->li_flags & XFS_LI_FAILED)) + test_bit(XFS_LI_FAILED, &blip->li_flags)) need_ail++; } /* make sure we capture the state of the initial inode. */ iip = INODE_ITEM(lip); if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || - lip->li_flags & XFS_LI_FAILED) + test_bit(XFS_LI_FAILED, &lip->li_flags)) need_ail++; /* @@ -803,7 +803,7 @@ xfs_iflush_abort( xfs_inode_log_item_t *iip = ip->i_itemp; if (iip) { - if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { + if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) { xfs_trans_ail_remove(&iip->ili_item, stale ? SHUTDOWN_LOG_IO_ERROR : SHUTDOWN_CORRUPT_INCORE); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 89fb1eb80aae..32b680522abd 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -39,7 +39,6 @@ #include "xfs_icache.h" #include "xfs_symlink.h" #include "xfs_trans.h" -#include "xfs_pnfs.h" #include "xfs_acl.h" #include "xfs_btree.h" #include <linux/fsmap.h> @@ -614,7 +613,7 @@ xfs_ioc_space( struct xfs_inode *ip = XFS_I(inode); struct iattr iattr; enum xfs_prealloc_flags flags = 0; - uint iolock = XFS_IOLOCK_EXCL; + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; int error; /* @@ -644,13 +643,10 @@ xfs_ioc_space( return error; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) goto out_unlock; - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - iolock |= XFS_MMAPLOCK_EXCL; - switch (bf->l_whence) { case 0: /*SEEK_SET*/ break; @@ -1103,7 +1099,8 @@ xfs_ioctl_setattr_dax_invalidate( if (fa->fsx_xflags & FS_XFLAG_DAX) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; - if (bdev_dax_supported(sb, sb->s_blocksize) < 0) + if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), + sb->s_blocksize)) return -EINVAL; } @@ -1811,6 +1808,88 @@ xfs_ioc_swapext( return error; } +static int +xfs_ioc_getlabel( + struct xfs_mount *mp, + char __user *user_label) +{ + struct xfs_sb *sbp = &mp->m_sb; + char label[XFSLABEL_MAX + 1]; + + /* Paranoia */ + BUILD_BUG_ON(sizeof(sbp->sb_fname) > FSLABEL_MAX); + + spin_lock(&mp->m_sb_lock); + strncpy(label, sbp->sb_fname, sizeof(sbp->sb_fname)); + spin_unlock(&mp->m_sb_lock); + + /* xfs on-disk label is 12 chars, be sure we send a null to user */ + label[XFSLABEL_MAX] = '\0'; + if (copy_to_user(user_label, label, sizeof(sbp->sb_fname))) + return -EFAULT; + return 0; +} + +static int +xfs_ioc_setlabel( + struct file *filp, + struct xfs_mount *mp, + char __user *newlabel) +{ + struct xfs_sb *sbp = &mp->m_sb; + char label[XFSLABEL_MAX + 1]; + size_t len; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * The generic ioctl allows up to FSLABEL_MAX chars, but XFS is much + * smaller, at 12 bytes. We copy one more to be sure we find the + * (required) NULL character to test the incoming label length. + * NB: The on disk label doesn't need to be null terminated. + */ + if (copy_from_user(label, newlabel, XFSLABEL_MAX + 1)) + return -EFAULT; + len = strnlen(label, XFSLABEL_MAX + 1); + if (len > sizeof(sbp->sb_fname)) + return -EINVAL; + + error = mnt_want_write_file(filp); + if (error) + return error; + + spin_lock(&mp->m_sb_lock); + memset(sbp->sb_fname, 0, sizeof(sbp->sb_fname)); + strncpy(sbp->sb_fname, label, sizeof(sbp->sb_fname)); + spin_unlock(&mp->m_sb_lock); + + /* + * Now we do several things to satisfy userspace. + * In addition to normal logging of the primary superblock, we also + * immediately write these changes to sector zero for the primary, then + * update all backup supers (as xfs_db does for a label change), then + * invalidate the block device page cache. This is so that any prior + * buffered reads from userspace (i.e. from blkid) are invalidated, + * and userspace will see the newly-written label. + */ + error = xfs_sync_sb_buf(mp); + if (error) + goto out; + /* + * growfs also updates backup supers so lock against that. + */ + mutex_lock(&mp->m_growlock); + error = xfs_update_secondary_sbs(mp); + mutex_unlock(&mp->m_growlock); + + invalidate_bdev(mp->m_ddev_targp->bt_bdev); + +out: + mnt_drop_write_file(filp); + return error; +} + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. @@ -1834,6 +1913,10 @@ xfs_file_ioctl( switch (cmd) { case FITRIM: return xfs_ioc_trim(mp, arg); + case FS_IOC_GETFSLABEL: + return xfs_ioc_getlabel(mp, arg); + case FS_IOC_SETFSLABEL: + return xfs_ioc_setlabel(filp, mp, arg); case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_RESVSP: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 046469fcc1b8..c6ce6f9335b6 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -224,7 +224,7 @@ xfs_iomap_write_direct( * necessary and move on to transaction setup. */ xfs_iunlock(ip, lockmode); - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -576,7 +576,7 @@ xfs_file_iomap_begin_delay( goto done; } - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach_locked(ip, false); if (error) goto out_unlock; @@ -692,7 +692,7 @@ xfs_iomap_write_allocate( /* * Make sure that the dquots are there. */ - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -946,8 +946,11 @@ error_on_bmapi_transaction: return error; } -static inline bool imap_needs_alloc(struct inode *inode, - struct xfs_bmbt_irec *imap, int nimaps) +static inline bool +imap_needs_alloc( + struct inode *inode, + struct xfs_bmbt_irec *imap, + int nimaps) { return !nimaps || imap->br_startblock == HOLESTARTBLOCK || @@ -955,31 +958,58 @@ static inline bool imap_needs_alloc(struct inode *inode, (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN); } -static inline bool needs_cow_for_zeroing(struct xfs_bmbt_irec *imap, int nimaps) +static inline bool +needs_cow_for_zeroing( + struct xfs_bmbt_irec *imap, + int nimaps) { return nimaps && imap->br_startblock != HOLESTARTBLOCK && imap->br_state != XFS_EXT_UNWRITTEN; } -static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) +static int +xfs_ilock_for_iomap( + struct xfs_inode *ip, + unsigned flags, + unsigned *lockmode) { + unsigned mode = XFS_ILOCK_SHARED; + /* * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ - if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) - return true; + if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) { + /* + * FIXME: It could still overwrite on unshared extents and not + * need allocation. + */ + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + mode = XFS_ILOCK_EXCL; + } /* - * Extents not yet cached requires exclusive access, don't block. - * This is an opencoded xfs_ilock_data_map_shared() to cater for the + * Extents not yet cached requires exclusive access, don't block. This + * is an opencoded xfs_ilock_data_map_shared() call but with * non-blocking behaviour. */ - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && - !(ip->i_df.if_flags & XFS_IFEXTENTS)) - return true; - return false; + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + mode = XFS_ILOCK_EXCL; + } + + if (flags & IOMAP_NOWAIT) { + if (!xfs_ilock_nowait(ip, mode)) + return -EAGAIN; + } else { + xfs_ilock(ip, mode); + } + + *lockmode = mode; + return 0; } static int @@ -1007,19 +1037,15 @@ xfs_file_iomap_begin( return xfs_file_iomap_begin_delay(inode, offset, length, iomap); } - if (need_excl_ilock(ip, flags)) - lockmode = XFS_ILOCK_EXCL; - else - lockmode = XFS_ILOCK_SHARED; - - if (flags & IOMAP_NOWAIT) { - if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) - return -EAGAIN; - if (!xfs_ilock_nowait(ip, lockmode)) - return -EAGAIN; - } else { - xfs_ilock(ip, lockmode); - } + /* + * Lock the inode in the manner required for the specified operation and + * check for as many conditions that would result in blocking as + * possible. This removes most of the non-blocking checks from the + * mapping code below. + */ + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; ASSERT(offset <= mp->m_super->s_maxbytes); if (offset > mp->m_super->s_maxbytes - length) @@ -1040,19 +1066,21 @@ xfs_file_iomap_begin( goto out_unlock; } - if (xfs_is_reflink_inode(ip) && - ((flags & IOMAP_WRITE) || - ((flags & IOMAP_ZERO) && needs_cow_for_zeroing(&imap, nimaps)))) { + /* Non-modifying mapping requested, so we are done */ + if (!(flags & (IOMAP_WRITE | IOMAP_ZERO))) + goto out_found; + + /* + * Break shared extents if necessary. Checks for non-blocking IO have + * been done up front, so we don't need to do them here. + */ + if (xfs_is_reflink_inode(ip)) { + /* if zeroing doesn't need COW allocation, then we are done. */ + if ((flags & IOMAP_ZERO) && + !needs_cow_for_zeroing(&imap, nimaps)) + goto out_found; + if (flags & IOMAP_DIRECT) { - /* - * A reflinked inode will result in CoW alloc. - * FIXME: It could still overwrite on unshared extents - * and not need allocation. - */ - if (flags & IOMAP_NOWAIT) { - error = -EAGAIN; - goto out_unlock; - } /* may drop and re-acquire the ilock */ error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode); @@ -1068,46 +1096,45 @@ xfs_file_iomap_begin( length = XFS_FSB_TO_B(mp, end_fsb) - offset; } - if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) { - /* - * If nowait is set bail since we are going to make - * allocations. - */ - if (flags & IOMAP_NOWAIT) { - error = -EAGAIN; - goto out_unlock; - } - /* - * We cap the maximum length we map here to MAX_WRITEBACK_PAGES - * pages to keep the chunks of work done where somewhat symmetric - * with the work writeback does. This is a completely arbitrary - * number pulled out of thin air as a best guess for initial - * testing. - * - * Note that the values needs to be less than 32-bits wide until - * the lower level functions are updated. - */ - length = min_t(loff_t, length, 1024 * PAGE_SIZE); - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - error = xfs_iomap_write_direct(ip, offset, length, &imap, - nimaps); - if (error) - return error; + /* Don't need to allocate over holes when doing zeroing operations. */ + if (flags & IOMAP_ZERO) + goto out_found; - iomap->flags = IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); - } else { - ASSERT(nimaps); + if (!imap_needs_alloc(inode, &imap, nimaps)) + goto out_found; - xfs_iunlock(ip, lockmode); - trace_xfs_iomap_found(ip, offset, length, 0, &imap); + /* If nowait is set bail since we are going to make allocations. */ + if (flags & IOMAP_NOWAIT) { + error = -EAGAIN; + goto out_unlock; } + /* + * We cap the maximum length we map to a sane size to keep the chunks + * of work done where somewhat symmetric with the work writeback does. + * This is a completely arbitrary number pulled out of thin air as a + * best guess for initial testing. + * + * Note that the values needs to be less than 32-bits wide until the + * lower level functions are updated. + */ + length = min_t(loff_t, length, 1024 * PAGE_SIZE); + + /* + * xfs_iomap_write_direct() expects the shared lock. It is unlocked on + * return. + */ + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); + error = xfs_iomap_write_direct(ip, offset, length, &imap, + nimaps); + if (error) + return error; + + iomap->flags = IOMAP_F_NEW; + trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + +out_finish: if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; @@ -1117,6 +1144,13 @@ xfs_file_iomap_begin( if (shared) iomap->flags |= IOMAP_F_SHARED; return 0; + +out_found: + ASSERT(nimaps); + xfs_iunlock(ip, lockmode); + trace_xfs_iomap_found(ip, offset, length, 0, &imap); + goto out_finish; + out_unlock: xfs_iunlock(ip, lockmode); return error; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index df42e4cb4dc4..3b4be06fdaa5 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -37,7 +37,6 @@ #include "xfs_da_btree.h" #include "xfs_dir2.h" #include "xfs_trans_space.h" -#include "xfs_pnfs.h" #include "xfs_iomap.h" #include <linux/capability.h> @@ -855,7 +854,7 @@ xfs_setattr_size( /* * Make sure that the dquots are attached to the inode. */ - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -1030,14 +1029,19 @@ xfs_vn_setattr( int error; if (iattr->ia_valid & ATTR_SIZE) { - struct xfs_inode *ip = XFS_I(d_inode(dentry)); - uint iolock = XFS_IOLOCK_EXCL; + struct inode *inode = d_inode(dentry); + struct xfs_inode *ip = XFS_I(inode); + uint iolock; - error = xfs_break_layouts(d_inode(dentry), &iolock); - if (error) + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + + error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); + if (error) { + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); return error; + } - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); error = xfs_vn_setattr_size(dentry, iattr); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { @@ -1195,6 +1199,30 @@ static const struct inode_operations xfs_inline_symlink_inode_operations = { .update_time = xfs_vn_update_time, }; +/* Figure out if this file actually supports DAX. */ +static bool +xfs_inode_supports_dax( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* Only supported on non-reflinked files. */ + if (!S_ISREG(VFS_I(ip)->i_mode) || xfs_is_reflink_inode(ip)) + return false; + + /* DAX mount option or DAX iflag must be set. */ + if (!(mp->m_flags & XFS_MOUNT_DAX) && + !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) + return false; + + /* Block size must match page size */ + if (mp->m_sb.sb_blocksize != PAGE_SIZE) + return false; + + /* Device has to support DAX too. */ + return xfs_find_daxdev_for_inode(VFS_I(ip)) != NULL; +} + STATIC void xfs_diflags_to_iflags( struct inode *inode, @@ -1213,11 +1241,7 @@ xfs_diflags_to_iflags( inode->i_flags |= S_SYNC; if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - if (S_ISREG(inode->i_mode) && - ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE && - !xfs_is_reflink_inode(ip) && - (ip->i_mount->m_flags & XFS_MOUNT_DAX || - ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) + if (xfs_inode_supports_dax(ip)) inode->i_flags |= S_DAX; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2fcd9ed5d075..c21039f27e39 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1047,6 +1047,7 @@ xfs_log_item_init( INIT_LIST_HEAD(&item->li_ail); INIT_LIST_HEAD(&item->li_cil); INIT_LIST_HEAD(&item->li_bio_list); + INIT_LIST_HEAD(&item->li_trans); } /* @@ -2110,10 +2111,10 @@ xlog_print_tic_res( */ void xlog_print_trans( - struct xfs_trans *tp) + struct xfs_trans *tp) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_log_item_desc *lidp; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_log_item *lip; /* dump core transaction and ticket info */ xfs_warn(mp, "transaction summary:"); @@ -2124,15 +2125,14 @@ xlog_print_trans( xlog_print_tic_res(mp, tp->t_ticket); /* dump each log item */ - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { struct xfs_log_vec *lv = lip->li_lv; struct xfs_log_iovec *vec; int i; xfs_warn(mp, "log item: "); xfs_warn(mp, " type = 0x%x", lip->li_type); - xfs_warn(mp, " flags = 0x%x", lip->li_flags); + xfs_warn(mp, " flags = 0x%lx", lip->li_flags); if (!lv) continue; xfs_warn(mp, " niovecs = %d", lv->lv_niovecs); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 4668403b1741..c15687724728 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -141,10 +141,9 @@ xlog_cil_alloc_shadow_bufs( struct xlog *log, struct xfs_trans *tp) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { struct xfs_log_vec *lv; int niovecs = 0; int nbytes = 0; @@ -152,7 +151,7 @@ xlog_cil_alloc_shadow_bufs( bool ordered = false; /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; /* get number of vecs and size of data to be stored */ @@ -317,7 +316,7 @@ xlog_cil_insert_format_items( int *diff_len, int *diff_iovecs) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; /* Bail out if we didn't find a log item. */ @@ -326,15 +325,14 @@ xlog_cil_insert_format_items( return; } - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { struct xfs_log_vec *lv; struct xfs_log_vec *old_lv = NULL; struct xfs_log_vec *shadow; bool ordered = false; /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; /* @@ -406,7 +404,7 @@ xlog_cil_insert_items( { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx = cil->xc_ctx; - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; int len = 0; int diff_iovecs = 0; int iclog_space; @@ -479,11 +477,10 @@ xlog_cil_insert_items( * We do this here so we only need to take the CIL lock once during * the transaction commit. */ - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; /* @@ -1013,6 +1010,7 @@ xfs_log_commit_cil( *commit_lsn = xc_commit_lsn; xfs_log_done(mp, tp->t_ticket, NULL, regrant); + tp->t_ticket = NULL; xfs_trans_unreserve_and_mod_sb(tp); /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 2b2383f1895e..06a09cb948b5 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2702,7 +2702,7 @@ xlog_recover_do_reg_buffer( goto next; } fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, - -1, 0, 0); + -1, 0); if (fa) { xfs_alert(mp, "dquot corrupt at %pS trying to replay into block 0x%llx", @@ -3348,7 +3348,7 @@ xlog_recover_dquot_pass2( */ dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); - fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0, 0); + fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", dq_f->qlf_id, fa); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index a901b86772f8..73ed8fec0328 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1072,9 +1072,7 @@ xfs_unmountfs( uint64_t resblks; int error; - cancel_delayed_work_sync(&mp->m_eofblocks_work); - cancel_delayed_work_sync(&mp->m_cowblocks_work); - + xfs_icache_disable_reclaim(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index aa6c5c193f45..f44c3599527d 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -31,19 +31,20 @@ * rules in the page fault path we don't bother. */ int -xfs_break_layouts( +xfs_break_leased_layouts( struct inode *inode, - uint *iolock) + uint *iolock, + bool *did_unlock) { struct xfs_inode *ip = XFS_I(inode); int error; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)); - while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); + *did_unlock = true; error = break_layout(inode, true); - *iolock = XFS_IOLOCK_EXCL; + *iolock &= ~XFS_IOLOCK_SHARED; + *iolock |= XFS_IOLOCK_EXCL; xfs_ilock(ip, *iolock); } @@ -120,8 +121,8 @@ xfs_fs_map_blocks( * Lock out any other I/O before we flush and invalidate the pagecache, * and then hand out a layout to the remote system. This is very * similar to direct I/O, except that the synchronization is much more - * complicated. See the comment near xfs_break_layouts for a detailed - * explanation. + * complicated. See the comment near xfs_break_leased_layouts + * for a detailed explanation. */ xfs_ilock(ip, XFS_IOLOCK_EXCL); diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index bf45951e28fe..940c6c2ad88c 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -9,10 +9,11 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, struct iattr *iattr); -int xfs_break_layouts(struct inode *inode, uint *iolock); +int xfs_break_leased_layouts(struct inode *inode, uint *iolock, + bool *did_unlock); #else static inline int -xfs_break_layouts(struct inode *inode, uint *iolock) +xfs_break_leased_layouts(struct inode *inode, uint *iolock, bool *did_unlock) { return 0; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index ec39ae274c78..c3e014bfc848 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -161,10 +161,7 @@ xfs_qm_dqpurge( * to purge this dquot anyway, so we go ahead regardless. */ error = xfs_qm_dqflush(dqp, &bp); - if (error) { - xfs_warn(mp, "%s: dquot "PTR_FMT" flush failed", - __func__, dqp); - } else { + if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); } @@ -173,7 +170,7 @@ xfs_qm_dqpurge( ASSERT(atomic_read(&dqp->q_pincount) == 0); ASSERT(XFS_FORCED_SHUTDOWN(mp) || - !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); + !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); @@ -265,7 +262,7 @@ xfs_qm_dqattach_one( xfs_inode_t *ip, xfs_dqid_t id, uint type, - uint doalloc, + bool doalloc, xfs_dquot_t **IO_idqpp) { xfs_dquot_t *dqp; @@ -291,7 +288,7 @@ xfs_qm_dqattach_one( * exist on disk and we didn't ask it to allocate; ESRCH if quotas got * turned off suddenly. */ - error = xfs_qm_dqget(ip->i_mount, ip, id, type, doalloc, &dqp); + error = xfs_qm_dqget_inode(ip, type, doalloc, &dqp); if (error) return error; @@ -326,14 +323,14 @@ xfs_qm_need_dqattach( /* * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON * into account. - * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. + * If @doalloc is true, the dquot(s) will be allocated if needed. * Inode may get unlocked and relocked in here, and the caller must deal with * the consequences. */ int xfs_qm_dqattach_locked( xfs_inode_t *ip, - uint flags) + bool doalloc) { xfs_mount_t *mp = ip->i_mount; int error = 0; @@ -345,8 +342,7 @@ xfs_qm_dqattach_locked( if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, - flags & XFS_QMOPT_DQALLOC, - &ip->i_udquot); + doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); @@ -354,8 +350,7 @@ xfs_qm_dqattach_locked( if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, - flags & XFS_QMOPT_DQALLOC, - &ip->i_gdquot); + doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); @@ -363,8 +358,7 @@ xfs_qm_dqattach_locked( if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, - flags & XFS_QMOPT_DQALLOC, - &ip->i_pdquot); + doalloc, &ip->i_pdquot); if (error) goto done; ASSERT(ip->i_pdquot); @@ -381,8 +375,7 @@ done: int xfs_qm_dqattach( - struct xfs_inode *ip, - uint flags) + struct xfs_inode *ip) { int error; @@ -390,7 +383,7 @@ xfs_qm_dqattach( return 0; xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_qm_dqattach_locked(ip, flags); + error = xfs_qm_dqattach_locked(ip, false); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -479,11 +472,8 @@ xfs_qm_dquot_isolate( spin_unlock(lru_lock); error = xfs_qm_dqflush(dqp, &bp); - if (error) { - xfs_warn(dqp->q_mount, "%s: dquot "PTR_FMT" flush failed", - __func__, dqp); + if (error) goto out_unlock_dirty; - } xfs_buf_delwri_queue(bp, &isol->buffers); xfs_buf_relse(bp); @@ -571,27 +561,88 @@ xfs_qm_set_defquota( { xfs_dquot_t *dqp; struct xfs_def_quota *defq; + struct xfs_disk_dquot *ddqp; int error; - error = xfs_qm_dqread(mp, 0, type, 0, &dqp); + error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); + if (error) + return; - if (!error) { - xfs_disk_dquot_t *ddqp = &dqp->q_core; + ddqp = &dqp->q_core; + defq = xfs_get_defquota(dqp, qinf); - defq = xfs_get_defquota(dqp, qinf); + /* + * Timers and warnings have been already set, let's just set the + * default limits for this quota type + */ + defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); + defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); + defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); + defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); + defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); + defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + xfs_qm_dqdestroy(dqp); +} - /* - * Timers and warnings have been already set, let's just set the - * default limits for this quota type - */ - defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); - defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); - defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); - defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); - defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); - defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - xfs_qm_dqdestroy(dqp); - } +/* Initialize quota time limits from the root dquot. */ +static void +xfs_qm_init_timelimits( + struct xfs_mount *mp, + struct xfs_quotainfo *qinf) +{ + struct xfs_disk_dquot *ddqp; + struct xfs_dquot *dqp; + uint type; + int error; + + qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; + + /* + * We try to get the limits from the superuser's limits fields. + * This is quite hacky, but it is standard quota practice. + * + * Since we may not have done a quotacheck by this point, just read + * the dquot without attaching it to any hashtables or lists. + * + * Timers and warnings are globally set by the first timer found in + * user/group/proj quota types, otherwise a default value is used. + * This should be split into different fields per quota type. + */ + if (XFS_IS_UQUOTA_RUNNING(mp)) + type = XFS_DQ_USER; + else if (XFS_IS_GQUOTA_RUNNING(mp)) + type = XFS_DQ_GROUP; + else + type = XFS_DQ_PROJ; + error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); + if (error) + return; + + ddqp = &dqp->q_core; + /* + * The warnings and timers set the grace period given to + * a user or group before he or she can not perform any + * more writing. If it is zero, a default is used. + */ + if (ddqp->d_btimer) + qinf->qi_btimelimit = be32_to_cpu(ddqp->d_btimer); + if (ddqp->d_itimer) + qinf->qi_itimelimit = be32_to_cpu(ddqp->d_itimer); + if (ddqp->d_rtbtimer) + qinf->qi_rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer); + if (ddqp->d_bwarns) + qinf->qi_bwarnlimit = be16_to_cpu(ddqp->d_bwarns); + if (ddqp->d_iwarns) + qinf->qi_iwarnlimit = be16_to_cpu(ddqp->d_iwarns); + if (ddqp->d_rtbwarns) + qinf->qi_rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns); + + xfs_qm_dqdestroy(dqp); } /* @@ -600,11 +651,10 @@ xfs_qm_set_defquota( */ STATIC int xfs_qm_init_quotainfo( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_quotainfo_t *qinf; - int error; - xfs_dquot_t *dqp; + struct xfs_quotainfo *qinf; + int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -636,52 +686,7 @@ xfs_qm_init_quotainfo( mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); - /* - * We try to get the limits from the superuser's limits fields. - * This is quite hacky, but it is standard quota practice. - * - * Since we may not have done a quotacheck by this point, just read - * the dquot without attaching it to any hashtables or lists. - * - * Timers and warnings are globally set by the first timer found in - * user/group/proj quota types, otherwise a default value is used. - * This should be split into different fields per quota type. - */ - error = xfs_qm_dqread(mp, 0, - XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : - (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : - XFS_DQ_PROJ), - 0, &dqp); - - if (!error) { - xfs_disk_dquot_t *ddqp = &dqp->q_core; - - /* - * The warnings and timers set the grace period given to - * a user or group before he or she can not perform any - * more writing. If it is zero, a default is used. - */ - qinf->qi_btimelimit = ddqp->d_btimer ? - be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT; - qinf->qi_itimelimit = ddqp->d_itimer ? - be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT; - qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ? - be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT; - qinf->qi_bwarnlimit = ddqp->d_bwarns ? - be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT; - qinf->qi_iwarnlimit = ddqp->d_iwarns ? - be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT; - qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ? - be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT; - xfs_qm_dqdestroy(dqp); - } else { - qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; - qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; - qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; - qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; - qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; - qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; - } + xfs_qm_init_timelimits(mp, qinf); if (XFS_IS_UQUOTA_RUNNING(mp)) xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf); @@ -865,9 +870,9 @@ xfs_qm_reset_dqcounts( * find uninitialised dquot blks. See comment in * xfs_dquot_verify. */ - fa = xfs_dquot_verify(mp, ddq, id + j, type, 0); + fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type); if (fa) - xfs_dquot_repair(mp, ddq, id + j, type); + xfs_dqblk_repair(mp, &dqb[j], id + j, type); /* * Reset type in case we are reusing group quota file for @@ -893,7 +898,7 @@ xfs_qm_reset_dqcounts( } STATIC int -xfs_qm_dqiter_bufs( +xfs_qm_reset_dqcounts_all( struct xfs_mount *mp, xfs_dqid_t firstid, xfs_fsblock_t bno, @@ -961,11 +966,11 @@ xfs_qm_dqiter_bufs( } /* - * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a - * caller supplied function for every chunk of dquots that we find. + * Iterate over all allocated dquot blocks in this quota inode, zeroing all + * counters for every chunk of dquots that we find. */ STATIC int -xfs_qm_dqiterate( +xfs_qm_reset_dqcounts_buf( struct xfs_mount *mp, struct xfs_inode *qip, uint flags, @@ -1041,7 +1046,7 @@ xfs_qm_dqiterate( * Iterate thru all the blks in the extent and * reset the counters of all the dquots inside them. */ - error = xfs_qm_dqiter_bufs(mp, firstid, + error = xfs_qm_reset_dqcounts_all(mp, firstid, map[i].br_startblock, map[i].br_blockcount, flags, buffer_list); @@ -1066,16 +1071,17 @@ out: STATIC int xfs_qm_quotacheck_dqadjust( struct xfs_inode *ip, - xfs_dqid_t id, uint type, xfs_qcnt_t nblks, xfs_qcnt_t rtblks) { struct xfs_mount *mp = ip->i_mount; struct xfs_dquot *dqp; + xfs_dqid_t id; int error; - error = xfs_qm_dqget(mp, ip, id, type, XFS_QMOPT_DQALLOC, &dqp); + id = xfs_qm_id_for_quotatype(ip, type); + error = xfs_qm_dqget(mp, id, type, true, &dqp); if (error) { /* * Shouldn't be able to turn off quotas here. @@ -1148,13 +1154,10 @@ xfs_qm_dqusage_adjust( } /* - * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget - * interface expects the inode to be exclusively locked because that's - * the case in all other instances. It's OK that we do this because - * quotacheck is done only at mount time. + * We don't _need_ to take the ilock EXCL here because quotacheck runs + * at mount time and therefore nobody will be racing chown/chproj. */ - error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, XFS_ILOCK_EXCL, - &ip); + error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip); if (error) { *res = BULKSTAT_RV_NOTHING; return error; @@ -1189,33 +1192,31 @@ xfs_qm_dqusage_adjust( * and quotaoffs don't race. (Quotachecks happen at mount time only). */ if (XFS_IS_UQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid, - XFS_DQ_USER, nblks, rtblks); + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks, + rtblks); if (error) goto error0; } if (XFS_IS_GQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid, - XFS_DQ_GROUP, nblks, rtblks); + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks, + rtblks); if (error) goto error0; } if (XFS_IS_PQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip), - XFS_DQ_PROJ, nblks, rtblks); + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks, + rtblks); if (error) goto error0; } - xfs_iunlock(ip, XFS_ILOCK_EXCL); IRELE(ip); *res = BULKSTAT_RV_DIDONE; return 0; error0: - xfs_iunlock(ip, XFS_ILOCK_EXCL); IRELE(ip); *res = BULKSTAT_RV_GIVEUP; return error; @@ -1247,9 +1248,8 @@ xfs_qm_flush_one( */ if (!xfs_dqflock_nowait(dqp)) { /* buf is pinned in-core by delwri list */ - DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen); - bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL); + bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0); if (!bp) { error = -EINVAL; goto out_unlock; @@ -1307,7 +1307,7 @@ xfs_qm_quotacheck( * We don't log our changes till later. */ if (uip) { - error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA, &buffer_list); if (error) goto error_return; @@ -1315,7 +1315,7 @@ xfs_qm_quotacheck( } if (gip) { - error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA, &buffer_list); if (error) goto error_return; @@ -1323,7 +1323,7 @@ xfs_qm_quotacheck( } if (pip) { - error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA, &buffer_list); if (error) goto error_return; @@ -1675,7 +1675,7 @@ xfs_qm_vop_dqalloc( * if necessary. The dquot(s) will not be locked. */ if (XFS_NOT_DQATTACHED(mp, ip)) { - error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC); + error = xfs_qm_dqattach_locked(ip, true); if (error) { xfs_iunlock(ip, lockflags); return error; @@ -1694,10 +1694,7 @@ xfs_qm_vop_dqalloc( * holding ilock. */ xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, NULL, uid, - XFS_DQ_USER, - XFS_QMOPT_DQALLOC, - &uq); + error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; @@ -1720,10 +1717,7 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { if (ip->i_d.di_gid != gid) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, NULL, gid, - XFS_DQ_GROUP, - XFS_QMOPT_DQALLOC, - &gq); + error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1739,10 +1733,8 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (xfs_get_projid(ip) != prid) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, - XFS_DQ_PROJ, - XFS_QMOPT_DQALLOC, - &pq); + error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ, + true, &pq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1933,7 +1925,7 @@ xfs_qm_vop_rename_dqattach( */ if (i == 0 || ip != i_tab[i-1]) { if (XFS_NOT_DQATTACHED(mp, ip)) { - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 2975a822e9f0..e3129b280423 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -170,8 +170,10 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); -extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t *, - uint, struct qc_dqblk *, uint); +extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, + uint, struct qc_dqblk *); +extern int xfs_qm_scall_getquota_next(struct xfs_mount *, + xfs_dqid_t *, uint, struct qc_dqblk *); extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, struct qc_dqblk *); extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 2be6d2735ca9..36b89e2c5eb9 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -72,7 +72,7 @@ xfs_qm_statvfs( xfs_mount_t *mp = ip->i_mount; xfs_dquot_t *dqp; - if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) { + if (!xfs_qm_dqget(mp, xfs_get_projid(ip), XFS_DQ_PROJ, false, &dqp)) { xfs_fill_statvfs_from_dquot(statp, dqp); xfs_qm_dqput(dqp); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 9cb5c381b01c..3e05d300b14e 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -425,7 +425,7 @@ xfs_qm_scall_setqlim( * a reference to the dquot, so it's safe to do this unlock/lock without * it being reclaimed in the mean time. */ - error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); + error = xfs_qm_dqget(mp, id, type, true, &dqp); if (error) { ASSERT(error != -ENOENT); goto out_unlock; @@ -622,39 +622,14 @@ out: return error; } - -int -xfs_qm_scall_getquota( +/* Fill out the quota context. */ +static void +xfs_qm_scall_getquota_fill_qc( struct xfs_mount *mp, - xfs_dqid_t *id, uint type, - struct qc_dqblk *dst, - uint dqget_flags) + const struct xfs_dquot *dqp, + struct qc_dqblk *dst) { - struct xfs_dquot *dqp; - int error; - - /* - * Try to get the dquot. We don't want it allocated on disk, so - * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't - * exist, we'll get ENOENT back. - */ - error = xfs_qm_dqget(mp, NULL, *id, type, dqget_flags, &dqp); - if (error) - return error; - - /* - * If everything's NULL, this dquot doesn't quite exist as far as - * our utility programs are concerned. - */ - if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { - error = -ENOENT; - goto out_put; - } - - /* Fill in the ID we actually read from disk */ - *id = be32_to_cpu(dqp->q_core.d_id); - memset(dst, 0, sizeof(*dst)); dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); @@ -696,7 +671,7 @@ xfs_qm_scall_getquota( if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && - *id != 0) { + dqp->q_core.d_id != 0) { if ((dst->d_space > dst->d_spc_softlimit) && (dst->d_spc_softlimit > 0)) { ASSERT(dst->d_spc_timer != 0); @@ -707,11 +682,69 @@ xfs_qm_scall_getquota( } } #endif +} + +/* Return the quota information for the dquot matching id. */ +int +xfs_qm_scall_getquota( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct qc_dqblk *dst) +{ + struct xfs_dquot *dqp; + int error; + + /* + * Try to get the dquot. We don't want it allocated on disk, so don't + * set doalloc. If it doesn't exist, we'll get ENOENT back. + */ + error = xfs_qm_dqget(mp, id, type, false, &dqp); + if (error) + return error; + + /* + * If everything's NULL, this dquot doesn't quite exist as far as + * our utility programs are concerned. + */ + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + error = -ENOENT; + goto out_put; + } + + xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); + out_put: xfs_qm_dqput(dqp); return error; } +/* + * Return the quota information for the first initialized dquot whose id + * is at least as high as id. + */ +int +xfs_qm_scall_getquota_next( + struct xfs_mount *mp, + xfs_dqid_t *id, + uint type, + struct qc_dqblk *dst) +{ + struct xfs_dquot *dqp; + int error; + + error = xfs_qm_dqget_next(mp, *id, type, &dqp); + if (error) + return error; + + /* Fill in the ID we actually read from disk */ + *id = be32_to_cpu(dqp->q_core.d_id); + + xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); + + xfs_qm_dqput(dqp); + return error; +} STATIC int xfs_dqrele_inode( diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index ce6506adab7b..3edf52b14919 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -48,6 +48,22 @@ struct xfs_trans; (XFS_IS_PQUOTA_ON(mp) && \ (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0)) +static inline uint +xfs_quota_chkd_flag( + uint dqtype) +{ + switch (dqtype) { + case XFS_DQ_USER: + return XFS_UQUOTA_CHKD; + case XFS_DQ_GROUP: + return XFS_GQUOTA_CHKD; + case XFS_DQ_PROJ: + return XFS_PQUOTA_CHKD; + default: + return 0; + } +} + /* * The structure kept inside the xfs_trans_t keep track of dquot changes * within a transaction and apply them later. @@ -90,8 +106,8 @@ extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *, uint); -extern int xfs_qm_dqattach(struct xfs_inode *, uint); -extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); +extern int xfs_qm_dqattach(struct xfs_inode *); +extern int xfs_qm_dqattach_locked(struct xfs_inode *ip, bool doalloc); extern void xfs_qm_dqdetach(struct xfs_inode *); extern void xfs_qm_dqrele(struct xfs_dquot *); extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); @@ -132,7 +148,7 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) #define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0) -#define xfs_qm_dqattach(ip, fl) (0) +#define xfs_qm_dqattach(ip) (0) #define xfs_qm_dqattach_locked(ip, fl) (0) #define xfs_qm_dqdetach(ip) #define xfs_qm_dqrele(d) diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index a65108594a07..c93fc913dffb 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -239,8 +239,7 @@ xfs_fs_get_dqblk( return -ESRCH; id = from_kqid(&init_user_ns, qid); - return xfs_qm_scall_getquota(mp, &id, - xfs_quota_type(qid.type), qdq, 0); + return xfs_qm_scall_getquota(mp, id, xfs_quota_type(qid.type), qdq); } /* Return quota info for active quota >= this qid */ @@ -260,9 +259,8 @@ xfs_fs_get_nextdqblk( return -ESRCH; id = from_kqid(&init_user_ns, *qid); - ret = xfs_qm_scall_getquota(mp, &id, - xfs_quota_type(qid->type), qdq, - XFS_QMOPT_DQNEXT); + ret = xfs_qm_scall_getquota_next(mp, &id, xfs_quota_type(qid->type), + qdq); if (ret) return ret; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 15c9393dd7a7..e5866b714d5f 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -159,7 +159,7 @@ STATIC void xfs_cui_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) xfs_cui_release(CUI_ITEM(lip)); } @@ -310,7 +310,7 @@ xfs_cud_item_unlock( { struct xfs_cud_log_item *cudp = CUD_ITEM(lip); - if (lip->li_flags & XFS_LI_ABORTED) { + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { xfs_cui_release(cudp->cud_cuip); kmem_zone_free(xfs_cud_zone, cudp); } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cdbd342a5249..713e857d9ffa 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -305,7 +305,7 @@ xfs_reflink_reserve_cow( * Fork all the shared blocks from our write offset until the end of * the extent. */ - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach_locked(ip, false); if (error) return error; @@ -431,7 +431,7 @@ retry: if (error) return error; - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach_locked(ip, false); if (error) goto out; goto retry; @@ -552,6 +552,9 @@ xfs_reflink_trim_irec_to_next_cow( * * If cancel_real is true this function cancels all COW fork extents for the * inode; if cancel_real is false, real extents are not cleared. + * + * Caller must have already joined the inode to the current transaction. The + * inode will be joined to the transaction returned to the caller. */ int xfs_reflink_cancel_cow_blocks( @@ -592,7 +595,6 @@ xfs_reflink_cancel_cow_blocks( if (error) break; } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { - xfs_trans_ijoin(*tpp, ip, 0); xfs_defer_init(&dfops, &firstfsb); /* Free the CoW orphan record. */ @@ -1359,7 +1361,7 @@ xfs_reflink_remap_range( goto out_unlock; /* Attach dquots to dest inode before changing block map */ - ret = xfs_qm_dqattach(dest, 0); + ret = xfs_qm_dqattach(dest); if (ret) goto out_unlock; @@ -1551,7 +1553,12 @@ next: return 0; } -/* Clear the inode reflink flag if there are no shared extents. */ +/* + * Clear the inode reflink flag if there are no shared extents. + * + * The caller is responsible for joining the inode to the transaction passed in. + * The inode will be joined to the transaction that is returned to the caller. + */ int xfs_reflink_clear_inode_flag( struct xfs_inode *ip, @@ -1578,7 +1585,6 @@ xfs_reflink_clear_inode_flag( trace_xfs_reflink_unset_inode_flag(ip); ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; xfs_inode_clear_cowblocks_tag(ip); - xfs_trans_ijoin(*tpp, ip, 0); xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); return error; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 06a07846c9b3..e5b5b3e7ef82 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -158,7 +158,7 @@ STATIC void xfs_rui_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) xfs_rui_release(RUI_ITEM(lip)); } @@ -331,7 +331,7 @@ xfs_rud_item_unlock( { struct xfs_rud_log_item *rudp = RUD_ITEM(lip); - if (lip->li_flags & XFS_LI_ABORTED) { + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { xfs_rui_release(rudp->rud_ruip); kmem_zone_free(xfs_rud_zone, rudp); } diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index dfee3c991155..52632ab727f7 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -23,9 +23,14 @@ struct xfs_mount; struct xfs_trans; +/* + * XXX: Most of the realtime allocation functions deal in units of realtime + * extents, not realtime blocks. This looks funny when paired with the type + * name and screams for a larger cleanup. + */ struct xfs_rtalloc_rec { - xfs_rtblock_t ar_startblock; - xfs_rtblock_t ar_blockcount; + xfs_rtblock_t ar_startext; + xfs_rtblock_t ar_extcount; }; typedef int (*xfs_rtalloc_query_range_fn)( diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f643d76db516..ed67389f4948 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1372,7 +1372,6 @@ xfs_fs_remount( */ xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - xfs_queue_eofblocks(mp); /* Recover any CoW blocks that never got remapped. */ error = xfs_reflink_recover_cow(mp); @@ -1382,7 +1381,7 @@ xfs_fs_remount( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } - xfs_queue_cowblocks(mp); + xfs_icache_enable_reclaim(mp); /* Create the per-AG metadata reservation pool .*/ error = xfs_fs_reserve_ag_blocks(mp); @@ -1392,8 +1391,13 @@ xfs_fs_remount( /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) { + /* + * Cancel background eofb scanning so it cannot race with the + * final log force+buftarg wait and deadlock the remount. + */ + xfs_icache_disable_reclaim(mp); + /* Get rid of any leftover CoW reservations... */ - cancel_delayed_work_sync(&mp->m_cowblocks_work); error = xfs_icache_free_cowblocks(mp, NULL); if (error) { xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); @@ -1416,12 +1420,6 @@ xfs_fs_remount( */ xfs_save_resvblks(mp); - /* - * Cancel background eofb scanning so it cannot race with the - * final log force+buftarg wait and deadlock the remount. - */ - cancel_delayed_work_sync(&mp->m_eofblocks_work); - xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; } @@ -1441,6 +1439,7 @@ xfs_fs_freeze( { struct xfs_mount *mp = XFS_M(sb); + xfs_icache_disable_reclaim(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); return xfs_sync_sb(mp, true); @@ -1454,6 +1453,7 @@ xfs_fs_unfreeze( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); + xfs_icache_enable_reclaim(mp); return 0; } @@ -1635,6 +1635,17 @@ xfs_fs_fill_super( #endif sb->s_op = &xfs_super_operations; + /* + * Delay mount work if the debug hook is set. This is debug + * instrumention to coordinate simulation of xfs mount failures with + * VFS superblock operations + */ + if (xfs_globals.mount_delay) { + xfs_notice(mp, "Delaying mount for %d seconds.", + xfs_globals.mount_delay); + msleep(xfs_globals.mount_delay * 1000); + } + if (silent) flags |= XFS_MFSI_QUIET; @@ -1690,11 +1701,17 @@ xfs_fs_fill_super( sb->s_flags |= SB_I_VERSION; if (mp->m_flags & XFS_MOUNT_DAX) { + bool rtdev_is_dax = false, datadev_is_dax; + xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - error = bdev_dax_supported(sb, sb->s_blocksize); - if (error) { + datadev_is_dax = bdev_dax_supported(mp->m_ddev_targp->bt_bdev, + sb->s_blocksize); + if (mp->m_rtdev_targp) + rtdev_is_dax = bdev_dax_supported( + mp->m_rtdev_targp->bt_bdev, sb->s_blocksize); + if (!rtdev_is_dax && !datadev_is_dax) { xfs_alert(mp, "DAX unsupported by block device. Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; @@ -1761,6 +1778,7 @@ xfs_fs_fill_super( out_close_devices: xfs_close_devices(mp); out_free_fsname: + sb->s_fs_info = NULL; xfs_free_fsname(mp); kfree(mp); out: @@ -1778,6 +1796,10 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); + /* if ->fill_super failed, we have no mount to tear down */ + if (!sb->s_fs_info) + return; + xfs_notice(mp, "Unmounting Filesystem"); xfs_filestream_unmount(mp); xfs_unmountfs(mp); @@ -1787,6 +1809,8 @@ xfs_fs_put_super( xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); xfs_close_devices(mp); + + sb->s_fs_info = NULL; xfs_free_fsname(mp); kfree(mp); } @@ -1806,6 +1830,9 @@ xfs_fs_nr_cached_objects( struct super_block *sb, struct shrink_control *sc) { + /* Paranoia: catch incorrect calls during mount setup or teardown */ + if (WARN_ON_ONCE(!sb->s_fs_info)) + return 0; return xfs_reclaim_inodes_count(XFS_M(sb)); } @@ -1879,11 +1906,6 @@ xfs_init_zones(void) if (!xfs_trans_zone) goto out_destroy_ifork_zone; - xfs_log_item_desc_zone = - kmem_zone_init(sizeof(struct xfs_log_item_desc), - "xfs_log_item_desc"); - if (!xfs_log_item_desc_zone) - goto out_destroy_trans_zone; /* * The size of the zone allocated buf log item is the maximum @@ -1893,7 +1915,7 @@ xfs_init_zones(void) xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item), "xfs_buf_item"); if (!xfs_buf_item_zone) - goto out_destroy_log_item_desc_zone; + goto out_destroy_trans_zone; xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * @@ -1981,8 +2003,6 @@ xfs_init_zones(void) kmem_zone_destroy(xfs_efd_zone); out_destroy_buf_item_zone: kmem_zone_destroy(xfs_buf_item_zone); - out_destroy_log_item_desc_zone: - kmem_zone_destroy(xfs_log_item_desc_zone); out_destroy_trans_zone: kmem_zone_destroy(xfs_trans_zone); out_destroy_ifork_zone: @@ -2021,7 +2041,6 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_efi_zone); kmem_zone_destroy(xfs_efd_zone); kmem_zone_destroy(xfs_buf_item_zone); - kmem_zone_destroy(xfs_log_item_desc_zone); kmem_zone_destroy(xfs_trans_zone); kmem_zone_destroy(xfs_ifork_zone); kmem_zone_destroy(xfs_da_state_zone); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 5b66ac12913c..aed03da637d4 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -259,6 +259,7 @@ xfs_symlink( * bmapi or the directory create code. */ xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; /* * Allocate an inode for the symlink. @@ -488,16 +489,11 @@ xfs_inactive_symlink_rmt( error = xfs_defer_finish(&tp, &dfops); if (error) goto error_bmap_cancel; - /* - * The first xact was committed, so add the inode to the new one. - * Mark it dirty so it will be logged and moved forward in the log as - * part of every commit. - */ - xfs_trans_ijoin(tp, ip, 0); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* * Commit the transaction containing extent freeing and EFDs. */ + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_trans_commit(tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 82afee005140..b53a33e69932 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -95,6 +95,7 @@ extern xfs_param_t xfs_params; struct xfs_globals { int log_recovery_delay; /* log recovery delay (secs) */ + int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ }; extern struct xfs_globals xfs_globals; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 8b2ccc234f36..2d5cd2529f8e 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -165,9 +165,40 @@ log_recovery_delay_show( } XFS_SYSFS_ATTR_RW(log_recovery_delay); +STATIC ssize_t +mount_delay_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < 0 || val > 60) + return -EINVAL; + + xfs_globals.mount_delay = val; + + return count; +} + +STATIC ssize_t +mount_delay_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.mount_delay); +} +XFS_SYSFS_ATTR_RW(mount_delay); + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), + ATTR_LIST(mount_delay), NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8955254b900e..9d4c4ca24fe6 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -441,8 +441,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __field(unsigned, bli_recur) __field(int, bli_refcount) __field(unsigned, bli_flags) - __field(void *, li_desc) - __field(unsigned, li_flags) + __field(unsigned long, li_flags) ), TP_fast_assign( __entry->dev = bip->bli_buf->b_target->bt_dev; @@ -455,12 +454,11 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); __entry->buf_lockval = bip->bli_buf->b_sema.count; - __entry->li_desc = bip->bli_item.li_desc; __entry->li_flags = bip->bli_item.li_flags; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " "lock %d flags %s recur %d refcount %d bliflags %s " - "lidesc %p liflags %s", + "liflags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->buf_bno, __entry->buf_len, @@ -471,7 +469,6 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->bli_recur, __entry->bli_refcount, __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), - __entry->li_desc, __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) ) @@ -1018,7 +1015,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, __field(dev_t, dev) __field(void *, lip) __field(uint, type) - __field(uint, flags) + __field(unsigned long, flags) __field(xfs_lsn_t, lsn) ), TP_fast_assign( @@ -1070,7 +1067,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class, __field(dev_t, dev) __field(void *, lip) __field(uint, type) - __field(uint, flags) + __field(unsigned long, flags) __field(xfs_lsn_t, old_lsn) __field(xfs_lsn_t, new_lsn) ), @@ -1750,6 +1747,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __field(int, namelen) __field(int, valuelen) __field(xfs_dahash_t, hashval) + __field(int, flags) __field(int, op_flags) ), TP_fast_assign( @@ -1760,10 +1758,11 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->namelen = args->namelen; __entry->valuelen = args->valuelen; __entry->hashval = args->hashval; + __entry->flags = args->flags; __entry->op_flags = args->op_flags; ), TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " - "hashval 0x%x op_flags %s", + "hashval 0x%x flags %s op_flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->namelen, @@ -1771,6 +1770,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->namelen, __entry->valuelen, __entry->hashval, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) ) @@ -2243,30 +2243,35 @@ struct xfs_defer_pending; struct xfs_defer_ops; DECLARE_EVENT_CLASS(xfs_defer_class, - TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), - TP_ARGS(mp, dop), + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, + unsigned long caller_ip), + TP_ARGS(mp, dop, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(void *, dop) __field(char, committed) __field(char, low) + __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; __entry->dop = dop; __entry->committed = dop->dop_committed; __entry->low = dop->dop_low; + __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ops %p committed %d low %d", + TP_printk("dev %d:%d ops %p committed %d low %d, caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dop, __entry->committed, - __entry->low) + __entry->low, + (char *)__entry->caller_ip) ) #define DEFINE_DEFER_EVENT(name) \ DEFINE_EVENT(xfs_defer_class, name, \ - TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), \ - TP_ARGS(mp, dop)) + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, \ + unsigned long caller_ip), \ + TP_ARGS(mp, dop, caller_ip)) DECLARE_EVENT_CLASS(xfs_defer_error_class, TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, int error), @@ -2433,6 +2438,8 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred); +DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer); +DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred); /* rmap tracepoints */ DECLARE_EVENT_CLASS(xfs_rmap_class, @@ -3346,6 +3353,43 @@ TRACE_EVENT(xfs_trans_resv_calc, __entry->logflags) ); +DECLARE_EVENT_CLASS(xfs_trans_class, + TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), + TP_ARGS(tp, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint32_t, tid) + __field(uint32_t, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = tp->t_mountp->m_super->s_dev; + __entry->tid = 0; + if (tp->t_ticket) + __entry->tid = tp->t_ticket->t_tid; + __entry->flags = tp->t_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d trans %x flags 0x%x caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tid, + __entry->flags, + (char *)__entry->caller_ip) +) + +#define DEFINE_TRANS_EVENT(name) \ +DEFINE_EVENT(xfs_trans_class, name, \ + TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), \ + TP_ARGS(tp, caller_ip)) +DEFINE_TRANS_EVENT(xfs_trans_alloc); +DEFINE_TRANS_EVENT(xfs_trans_cancel); +DEFINE_TRANS_EVENT(xfs_trans_commit); +DEFINE_TRANS_EVENT(xfs_trans_dup); +DEFINE_TRANS_EVENT(xfs_trans_free); +DEFINE_TRANS_EVENT(xfs_trans_roll); +DEFINE_TRANS_EVENT(xfs_trans_add_item); +DEFINE_TRANS_EVENT(xfs_trans_free_items); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index d6d8f9d129a7..fc7ba75b8b69 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -31,9 +31,9 @@ #include "xfs_log.h" #include "xfs_trace.h" #include "xfs_error.h" +#include "xfs_defer.h" kmem_zone_t *xfs_trans_zone; -kmem_zone_t *xfs_log_item_desc_zone; #if defined(CONFIG_TRACEPOINTS) static void @@ -79,6 +79,7 @@ xfs_trans_free( xfs_extent_busy_sort(&tp->t_busy); xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); + trace_xfs_trans_free(tp, _RET_IP_); atomic_dec(&tp->t_mountp->m_active_trans); if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); @@ -94,11 +95,13 @@ xfs_trans_free( * blocks. Locks and log items, however, are no inherited. They must * be added to the new transaction explicitly. */ -STATIC xfs_trans_t * +STATIC struct xfs_trans * xfs_trans_dup( - xfs_trans_t *tp) + struct xfs_trans *tp) { - xfs_trans_t *ntp; + struct xfs_trans *ntp; + + trace_xfs_trans_dup(tp, _RET_IP_); ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); @@ -127,6 +130,7 @@ xfs_trans_dup( ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; tp->t_rtx_res = tp->t_rtx_res_used; ntp->t_pflags = tp->t_pflags; + ntp->t_agfl_dfops = tp->t_agfl_dfops; xfs_trans_dup_dqinfo(tp, ntp); @@ -283,6 +287,8 @@ xfs_trans_alloc( return error; } + trace_xfs_trans_alloc(tp, _RET_IP_); + *tpp = tp; return 0; } @@ -727,73 +733,52 @@ out: return; } -/* - * Add the given log item to the transaction's list of log items. - * - * The log item will now point to its new descriptor with its li_desc field. - */ +/* Add the given log item to the transaction's list of log items. */ void xfs_trans_add_item( struct xfs_trans *tp, struct xfs_log_item *lip) { - struct xfs_log_item_desc *lidp; - ASSERT(lip->li_mountp == tp->t_mountp); ASSERT(lip->li_ailp == tp->t_mountp->m_ail); + ASSERT(list_empty(&lip->li_trans)); + ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags)); - lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS); - - lidp->lid_item = lip; - lidp->lid_flags = 0; - list_add_tail(&lidp->lid_trans, &tp->t_items); - - lip->li_desc = lidp; -} - -STATIC void -xfs_trans_free_item_desc( - struct xfs_log_item_desc *lidp) -{ - list_del_init(&lidp->lid_trans); - kmem_zone_free(xfs_log_item_desc_zone, lidp); + list_add_tail(&lip->li_trans, &tp->t_items); + trace_xfs_trans_add_item(tp, _RET_IP_); } /* - * Unlink and free the given descriptor. + * Unlink the log item from the transaction. the log item is no longer + * considered dirty in this transaction, as the linked transaction has + * finished, either by abort or commit completion. */ void xfs_trans_del_item( struct xfs_log_item *lip) { - xfs_trans_free_item_desc(lip->li_desc); - lip->li_desc = NULL; + clear_bit(XFS_LI_DIRTY, &lip->li_flags); + list_del_init(&lip->li_trans); } -/* - * Unlock all of the items of a transaction and free all the descriptors - * of that transaction. - */ +/* Detach and unlock all of the items in a transaction */ void xfs_trans_free_items( struct xfs_trans *tp, xfs_lsn_t commit_lsn, bool abort) { - struct xfs_log_item_desc *lidp, *next; - - list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_item *lip, *next; - lip->li_desc = NULL; + trace_xfs_trans_free_items(tp, _RET_IP_); + list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { + xfs_trans_del_item(lip); if (commit_lsn != NULLCOMMITLSN) lip->li_ops->iop_committing(lip, commit_lsn); if (abort) - lip->li_flags |= XFS_LI_ABORTED; + set_bit(XFS_LI_ABORTED, &lip->li_flags); lip->li_ops->iop_unlock(lip); - - xfs_trans_free_item_desc(lidp); } } @@ -861,7 +846,7 @@ xfs_trans_committed_bulk( xfs_lsn_t item_lsn; if (aborted) - lip->li_flags |= XFS_LI_ABORTED; + set_bit(XFS_LI_ABORTED, &lip->li_flags); item_lsn = lip->li_ops->iop_committed(lip, commit_lsn); /* item_lsn of -1 means the item needs no further processing */ @@ -936,6 +921,11 @@ __xfs_trans_commit( int error = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; + ASSERT(!tp->t_agfl_dfops || + !xfs_defer_has_unfinished_work(tp->t_agfl_dfops) || regrant); + + trace_xfs_trans_commit(tp, _RET_IP_); + /* * If there is nothing to be logged by the transaction, * then unlock all of the items associated with the @@ -991,6 +981,7 @@ out_unreserve: commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant); if (commit_lsn == -1 && !error) error = -EIO; + tp->t_ticket = NULL; } current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); @@ -1022,6 +1013,8 @@ xfs_trans_cancel( struct xfs_mount *mp = tp->t_mountp; bool dirty = (tp->t_flags & XFS_TRANS_DIRTY); + trace_xfs_trans_cancel(tp, _RET_IP_); + /* * See if the caller is relying on us to shut down the * filesystem. This happens in paths where we detect @@ -1033,17 +1026,19 @@ xfs_trans_cancel( } #ifdef DEBUG if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; - list_for_each_entry(lidp, &tp->t_items, lid_trans) - ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD)); + list_for_each_entry(lip, &tp->t_items, li_trans) + ASSERT(!(lip->li_type == XFS_LI_EFD)); } #endif xfs_trans_unreserve_and_mod_sb(tp); xfs_trans_unreserve_and_mod_dquots(tp); - if (tp->t_ticket) + if (tp->t_ticket) { xfs_log_done(mp, tp->t_ticket, NULL, false); + tp->t_ticket = NULL; + } /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); @@ -1067,6 +1062,8 @@ xfs_trans_roll( struct xfs_trans_res tres; int error; + trace_xfs_trans_roll(trans, _RET_IP_); + /* * Copy the critical parameters from one trans to the next. */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9d542dfe0052..29706b8b3bd4 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -27,7 +27,6 @@ struct xfs_efi_log_item; struct xfs_inode; struct xfs_item_ops; struct xfs_log_iovec; -struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; struct xfs_trans_res; @@ -43,12 +42,12 @@ struct xfs_bud_log_item; typedef struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ + struct list_head li_trans; /* transaction list */ xfs_lsn_t li_lsn; /* last on-disk lsn */ - struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ struct xfs_mount *li_mountp; /* ptr to fs mount */ struct xfs_ail *li_ailp; /* ptr to AIL */ uint li_type; /* item type */ - uint li_flags; /* misc flags */ + unsigned long li_flags; /* misc flags */ struct xfs_buf *li_buf; /* real buffer pointer */ struct list_head li_bio_list; /* buffer item list */ void (*li_cb)(struct xfs_buf *, @@ -64,14 +63,21 @@ typedef struct xfs_log_item { xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; -#define XFS_LI_IN_AIL 0x1 -#define XFS_LI_ABORTED 0x2 -#define XFS_LI_FAILED 0x4 +/* + * li_flags use the (set/test/clear)_bit atomic interfaces because updates can + * race with each other and we don't want to have to use the AIL lock to + * serialise all updates. + */ +#define XFS_LI_IN_AIL 0 +#define XFS_LI_ABORTED 1 +#define XFS_LI_FAILED 2 +#define XFS_LI_DIRTY 3 /* log item dirty in transaction */ #define XFS_LI_FLAGS \ - { XFS_LI_IN_AIL, "IN_AIL" }, \ - { XFS_LI_ABORTED, "ABORTED" }, \ - { XFS_LI_FAILED, "FAILED" } + { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \ + { (1 << XFS_LI_ABORTED), "ABORTED" }, \ + { (1 << XFS_LI_FAILED), "FAILED" }, \ + { (1 << XFS_LI_DIRTY), "DIRTY" } struct xfs_item_ops { void (*iop_size)(xfs_log_item_t *, int *, int *); @@ -111,6 +117,7 @@ typedef struct xfs_trans { struct xlog_ticket *t_ticket; /* log mgr ticket */ struct xfs_mount *t_mountp; /* ptr to fs mount struct */ struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ + struct xfs_defer_ops *t_agfl_dfops; /* optional agfl fixup dfops */ unsigned int t_flags; /* misc flags */ int64_t t_icount_delta; /* superblock icount change */ int64_t t_ifree_delta; /* superblock ifree change */ @@ -228,7 +235,8 @@ struct xfs_efd_log_item *xfs_trans_get_efd(struct xfs_trans *, uint); int xfs_trans_free_extent(struct xfs_trans *, struct xfs_efd_log_item *, xfs_fsblock_t, - xfs_extlen_t, struct xfs_owner_info *); + xfs_extlen_t, struct xfs_owner_info *, + bool); int xfs_trans_commit(struct xfs_trans *); int xfs_trans_roll(struct xfs_trans **); int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *); @@ -242,7 +250,6 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp); extern kmem_zone_t *xfs_trans_zone; -extern kmem_zone_t *xfs_log_item_desc_zone; /* rmap updates */ enum xfs_rmap_intent_type; diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d4a2445215e6..41e280ef1483 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -32,30 +32,51 @@ #ifdef DEBUG /* * Check that the list is sorted as it should be. + * + * Called with the ail lock held, but we don't want to assert fail with it + * held otherwise we'll lock everything up and won't be able to debug the + * cause. Hence we sample and check the state under the AIL lock and return if + * everything is fine, otherwise we drop the lock and run the ASSERT checks. + * Asserts may not be fatal, so pick the lock back up and continue onwards. */ STATIC void xfs_ail_check( - struct xfs_ail *ailp, - xfs_log_item_t *lip) + struct xfs_ail *ailp, + struct xfs_log_item *lip) { - xfs_log_item_t *prev_lip; + struct xfs_log_item *prev_lip; + struct xfs_log_item *next_lip; + xfs_lsn_t prev_lsn = NULLCOMMITLSN; + xfs_lsn_t next_lsn = NULLCOMMITLSN; + xfs_lsn_t lsn; + bool in_ail; + if (list_empty(&ailp->ail_head)) return; /* - * Check the next and previous entries are valid. + * Sample then check the next and previous entries are valid. */ - ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); - prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); + in_ail = test_bit(XFS_LI_IN_AIL, &lip->li_flags); + prev_lip = list_entry(lip->li_ail.prev, struct xfs_log_item, li_ail); if (&prev_lip->li_ail != &ailp->ail_head) - ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); - - prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); - if (&prev_lip->li_ail != &ailp->ail_head) - ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); + prev_lsn = prev_lip->li_lsn; + next_lip = list_entry(lip->li_ail.next, struct xfs_log_item, li_ail); + if (&next_lip->li_ail != &ailp->ail_head) + next_lsn = next_lip->li_lsn; + lsn = lip->li_lsn; + if (in_ail && + (prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0) && + (next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0)) + return; + spin_unlock(&ailp->ail_lock); + ASSERT(in_ail); + ASSERT(prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0); + ASSERT(next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0); + spin_lock(&ailp->ail_lock); } #else /* !DEBUG */ #define xfs_ail_check(a,l) @@ -684,7 +705,7 @@ xfs_trans_ail_update_bulk( for (i = 0; i < nr_items; i++) { struct xfs_log_item *lip = log_items[i]; - if (lip->li_flags & XFS_LI_IN_AIL) { + if (test_and_set_bit(XFS_LI_IN_AIL, &lip->li_flags)) { /* check if we really need to move the item */ if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) continue; @@ -694,7 +715,6 @@ xfs_trans_ail_update_bulk( if (mlip == lip) mlip_changed = 1; } else { - lip->li_flags |= XFS_LI_IN_AIL; trace_xfs_ail_insert(lip, 0, lsn); } lip->li_lsn = lsn; @@ -725,7 +745,7 @@ xfs_ail_delete_one( trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); xfs_clear_li_failed(lip); - lip->li_flags &= ~XFS_LI_IN_AIL; + clear_bit(XFS_LI_IN_AIL, &lip->li_flags); lip->li_lsn = 0; return mlip == lip; @@ -761,7 +781,7 @@ xfs_trans_ail_delete( struct xfs_mount *mp = ailp->ail_mount; bool mlip_changed; - if (!(lip->li_flags & XFS_LI_IN_AIL)) { + if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); if (!XFS_FORCED_SHUTDOWN(mp)) { xfs_alert_tag(mp, XFS_PTAG_AILDELETE, diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c index 14543d93cd4b..230a21df4b12 100644 --- a/fs/xfs/xfs_trans_bmap.c +++ b/fs/xfs/xfs_trans_bmap.c @@ -79,7 +79,7 @@ xfs_trans_log_finish_bmap_update( * 2.) shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; - budp->bud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); return error; } @@ -158,7 +158,7 @@ xfs_bmap_update_log_item( bmap = container_of(item, struct xfs_bmap_intent, bi_list); tp->t_flags |= XFS_TRANS_DIRTY; - buip->bui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); /* * atomic_inc_return gives us the value after the increment; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index a5d9dfc45d98..a8ddb4eed279 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -40,7 +40,7 @@ xfs_trans_buf_item_match( struct xfs_buf_map *map, int nmaps) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; struct xfs_buf_log_item *blip; int len = 0; int i; @@ -48,8 +48,8 @@ xfs_trans_buf_item_match( for (i = 0; i < nmaps; i++) len += map[i].bm_len; - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - blip = (struct xfs_buf_log_item *)lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { + blip = (struct xfs_buf_log_item *)lip; if (blip->bli_item.li_type == XFS_LI_BUF && blip->bli_buf->b_target == target && XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn && @@ -100,14 +100,10 @@ _xfs_trans_bjoin( atomic_inc(&bip->bli_refcount); /* - * Get a log_item_desc to point at the new item. + * Attach the item to the transaction so we can find it in + * xfs_trans_get_buf() and friends. */ xfs_trans_add_item(tp, &bip->bli_item); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * in xfs_trans_get_buf() and friends above. - */ bp->b_transp = tp; } @@ -391,7 +387,7 @@ xfs_trans_brelse( * If the buffer is dirty within this transaction, we can't * release it until we commit. */ - if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY) + if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)) return; /* @@ -442,7 +438,7 @@ xfs_trans_brelse( ASSERT(bp->b_pincount == 0); ***/ ASSERT(atomic_read(&bip->bli_refcount) == 0); - ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); xfs_buf_item_relse(bp); } @@ -542,7 +538,7 @@ xfs_trans_dirty_buf( bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; tp->t_flags |= XFS_TRANS_DIRTY; - bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); } /* @@ -626,7 +622,7 @@ xfs_trans_binval( ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK)); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); - ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); + ASSERT(test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; } @@ -642,7 +638,7 @@ xfs_trans_binval( memset(bip->bli_formats[i].blf_data_map, 0, (bip->bli_formats[i].blf_map_size * sizeof(uint))); } - bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); tp->t_flags |= XFS_TRANS_DIRTY; } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index c3d547211d16..c381c02cca45 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -77,7 +77,7 @@ xfs_trans_log_dquot( ASSERT(XFS_DQ_IS_LOCKED(dqp)); tp->t_flags |= XFS_TRANS_DIRTY; - dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags); } /* @@ -879,7 +879,7 @@ xfs_trans_log_quotaoff_item( xfs_qoff_logitem_t *qlp) { tp->t_flags |= XFS_TRANS_DIRTY; - qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags); } STATIC void diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index ab438647592a..2f44a08bdf65 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -68,7 +68,8 @@ xfs_trans_free_extent( struct xfs_efd_log_item *efdp, xfs_fsblock_t start_block, xfs_extlen_t ext_len, - struct xfs_owner_info *oinfo) + struct xfs_owner_info *oinfo, + bool skip_discard) { struct xfs_mount *mp = tp->t_mountp; uint next_extent; @@ -79,9 +80,8 @@ xfs_trans_free_extent( trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); - error = xfs_free_extent(tp, start_block, ext_len, oinfo, - XFS_AG_RESV_NONE); - + error = __xfs_free_extent(tp, start_block, ext_len, + oinfo, XFS_AG_RESV_NONE, skip_discard); /* * Mark the transaction dirty, even on error. This ensures the * transaction is aborted, which: @@ -90,7 +90,7 @@ xfs_trans_free_extent( * 2.) shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; - efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); @@ -155,7 +155,7 @@ xfs_extent_free_log_item( free = container_of(item, struct xfs_extent_free_item, xefi_list); tp->t_flags |= XFS_TRANS_DIRTY; - efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); /* * atomic_inc_return gives us the value after the increment; @@ -195,7 +195,7 @@ xfs_extent_free_finish_item( error = xfs_trans_free_extent(tp, done_item, free->xefi_startblock, free->xefi_blockcount, - &free->xefi_oinfo); + &free->xefi_oinfo, free->xefi_skip_discard); kmem_free(free); return error; } @@ -231,9 +231,79 @@ static const struct xfs_defer_op_type xfs_extent_free_defer_type = { .cancel_item = xfs_extent_free_cancel_item, }; +/* + * AGFL blocks are accounted differently in the reserve pools and are not + * inserted into the busy extent list. + */ +STATIC int +xfs_agfl_free_finish_item( + struct xfs_trans *tp, + struct xfs_defer_ops *dop, + struct list_head *item, + void *done_item, + void **state) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_efd_log_item *efdp = done_item; + struct xfs_extent_free_item *free; + struct xfs_extent *extp; + struct xfs_buf *agbp; + int error; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + uint next_extent; + + free = container_of(item, struct xfs_extent_free_item, xefi_list); + ASSERT(free->xefi_blockcount == 1); + agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); + + trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); + + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + if (!error) + error = xfs_free_agfl_block(tp, agno, agbno, agbp, + &free->xefi_oinfo); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the EFI and frees the EFD + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); + + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); + extp->ext_start = free->xefi_startblock; + extp->ext_len = free->xefi_blockcount; + efdp->efd_next_extent++; + + kmem_free(free); + return error; +} + + +/* sub-type with special handling for AGFL deferred frees */ +static const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .type = XFS_DEFER_OPS_TYPE_AGFL_FREE, + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .diff_items = xfs_extent_free_diff_items, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .log_item = xfs_extent_free_log_item, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_agfl_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, +}; + /* Register the deferred op type. */ void xfs_extent_free_init_defer_op(void) { xfs_defer_init_op_type(&xfs_extent_free_defer_type); + xfs_defer_init_op_type(&xfs_agfl_free_defer_type); } diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 07cea592dc01..f7bd7960a90f 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -133,14 +133,13 @@ xfs_trans_log_inode( * set however, then go ahead and bump the i_version counter * unconditionally. */ - if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && + if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) && IS_I_VERSION(VFS_I(ip))) { if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE)) flags |= XFS_ILOG_CORE; } tp->t_flags |= XFS_TRANS_DIRTY; - ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY; /* * Always OR in the bits from the ili_last_fields field. diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index be24b0c8a332..9717ae74b36d 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -19,7 +19,6 @@ #define __XFS_TRANS_PRIV_H__ struct xfs_log_item; -struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; struct xfs_ail; @@ -119,7 +118,7 @@ xfs_trans_ail_remove( spin_lock(&ailp->ail_lock); /* xfs_trans_ail_delete() drops the AIL lock */ - if (lip->li_flags & XFS_LI_IN_AIL) + if (test_bit(XFS_LI_IN_AIL, &lip->li_flags)) xfs_trans_ail_delete(ailp, lip, shutdown_type); else spin_unlock(&ailp->ail_lock); @@ -171,11 +170,10 @@ xfs_clear_li_failed( { struct xfs_buf *bp = lip->li_buf; - ASSERT(lip->li_flags & XFS_LI_IN_AIL); + ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags)); lockdep_assert_held(&lip->li_ailp->ail_lock); - if (lip->li_flags & XFS_LI_FAILED) { - lip->li_flags &= ~XFS_LI_FAILED; + if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) { lip->li_buf = NULL; xfs_buf_rele(bp); } @@ -188,9 +186,8 @@ xfs_set_li_failed( { lockdep_assert_held(&lip->li_ailp->ail_lock); - if (!(lip->li_flags & XFS_LI_FAILED)) { + if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) { xfs_buf_hold(bp); - lip->li_flags |= XFS_LI_FAILED; lip->li_buf = bp; } } diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c index 94c1877af834..c7f8e82f5bda 100644 --- a/fs/xfs/xfs_trans_refcount.c +++ b/fs/xfs/xfs_trans_refcount.c @@ -77,7 +77,7 @@ xfs_trans_log_finish_refcount_update( * 2.) shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; - cudp->cud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); return error; } @@ -154,7 +154,7 @@ xfs_refcount_update_log_item( refc = container_of(item, struct xfs_refcount_intent, ri_list); tp->t_flags |= XFS_TRANS_DIRTY; - cuip->cui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); /* * atomic_inc_return gives us the value after the increment; diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c index 9b577beb43d7..5831ca0c270b 100644 --- a/fs/xfs/xfs_trans_rmap.c +++ b/fs/xfs/xfs_trans_rmap.c @@ -117,7 +117,7 @@ xfs_trans_log_finish_rmap_update( * 2.) shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; - rudp->rud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); return error; } @@ -175,7 +175,7 @@ xfs_rmap_update_log_item( rmap = container_of(item, struct xfs_rmap_intent, ri_list); tp->t_flags |= XFS_TRANS_DIRTY; - ruip->rui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); /* * atomic_inc_return gives us the value after the increment; |