summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c29
-rw-r--r--fs/Kconfig5
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/addr_list.c3
-rw-r--r--fs/afs/rotate.c4
-rw-r--r--fs/aio.c18
-rw-r--r--fs/autofs/Kconfig20
-rw-r--r--fs/autofs/Makefile7
-rw-r--r--fs/autofs/autofs_i.h (renamed from fs/autofs4/autofs_i.h)92
-rw-r--r--fs/autofs/dev-ioctl.c (renamed from fs/autofs4/dev-ioctl.c)31
-rw-r--r--fs/autofs/expire.c (renamed from fs/autofs4/expire.c)133
-rw-r--r--fs/autofs/init.c (renamed from fs/autofs4/init.c)12
-rw-r--r--fs/autofs/inode.c (renamed from fs/autofs4/inode.c)54
-rw-r--r--fs/autofs/root.c (renamed from fs/autofs4/root.c)277
-rw-r--r--fs/autofs/symlink.c (renamed from fs/autofs4/symlink.c)16
-rw-r--r--fs/autofs/waitq.c (renamed from fs/autofs4/waitq.c)59
-rw-r--r--fs/autofs4/Kconfig46
-rw-r--r--fs/autofs4/Makefile4
-rw-r--r--fs/binfmt_misc.c12
-rw-r--r--fs/block_dev.c2
-rw-r--r--fs/buffer.c114
-rw-r--r--fs/compat.c72
-rw-r--r--fs/compat_ioctl.c2
-rw-r--r--fs/crypto/crypto.c47
-rw-r--r--fs/crypto/fname.c32
-rw-r--r--fs/crypto/fscrypt_private.h23
-rw-r--r--fs/crypto/hooks.c5
-rw-r--r--fs/crypto/keyinfo.c286
-rw-r--r--fs/dax.c214
-rw-r--r--fs/debugfs/file.c10
-rw-r--r--fs/debugfs/inode.c4
-rw-r--r--fs/exec.c38
-rw-r--r--fs/ext2/super.c3
-rw-r--r--fs/ext4/balloc.c33
-rw-r--r--fs/ext4/ext4.h7
-rw-r--r--fs/ext4/extents_status.c3
-rw-r--r--fs/ext4/file.c93
-rw-r--r--fs/ext4/fsmap.c4
-rw-r--r--fs/ext4/ialloc.c35
-rw-r--r--fs/ext4/indirect.c14
-rw-r--r--fs/ext4/inline.c10
-rw-r--r--fs/ext4/inode.c77
-rw-r--r--fs/ext4/mballoc.c23
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/ext4/super.c79
-rw-r--r--fs/ext4/xattr.c2
-rw-r--r--fs/ext4/xattr_security.c2
-rw-r--r--fs/f2fs/super.c8
-rw-r--r--fs/fcntl.c2
-rw-r--r--fs/fuse/acl.c4
-rw-r--r--fs/fuse/control.c15
-rw-r--r--fs/fuse/cuse.c11
-rw-r--r--fs/fuse/dev.c43
-rw-r--r--fs/fuse/dir.c45
-rw-r--r--fs/fuse/fuse_i.h15
-rw-r--r--fs/fuse/inode.c50
-rw-r--r--fs/fuse/xattr.c43
-rw-r--r--fs/gfs2/bmap.c11
-rw-r--r--fs/hpfs/hpfs_fn.h3
-rw-r--r--fs/inode.c164
-rw-r--r--fs/iomap.c409
-rw-r--r--fs/jbd2/journal.c20
-rw-r--r--fs/jbd2/revoke.c12
-rw-r--r--fs/jbd2/transaction.c6
-rw-r--r--fs/kernfs/file.c8
-rw-r--r--fs/ocfs2/dlmglue.c119
-rw-r--r--fs/ocfs2/dlmglue.h1
-rw-r--r--fs/ocfs2/file.c10
-rw-r--r--fs/ocfs2/file.h2
-rw-r--r--fs/ocfs2/ioctl.c2
-rw-r--r--fs/ocfs2/mmap.c44
-rw-r--r--fs/ocfs2/namei.c3
-rw-r--r--fs/ocfs2/ocfs2_fs.h8
-rw-r--r--fs/orangefs/devorangefs-req.c17
-rw-r--r--fs/orangefs/file.c4
-rw-r--r--fs/orangefs/inode.c57
-rw-r--r--fs/orangefs/namei.c7
-rw-r--r--fs/orangefs/orangefs-bufmap.c20
-rw-r--r--fs/orangefs/orangefs-debugfs.c2
-rw-r--r--fs/orangefs/orangefs-kernel.h1
-rw-r--r--fs/orangefs/orangefs-utils.c30
-rw-r--r--fs/orangefs/protocol.h2
-rw-r--r--fs/orangefs/super.c13
-rw-r--r--fs/orangefs/waitqueue.c14
-rw-r--r--fs/overlayfs/Kconfig6
-rw-r--r--fs/overlayfs/copy_up.c83
-rw-r--r--fs/overlayfs/dir.c208
-rw-r--r--fs/overlayfs/export.c8
-rw-r--r--fs/overlayfs/inode.c28
-rw-r--r--fs/overlayfs/namei.c10
-rw-r--r--fs/overlayfs/overlayfs.h65
-rw-r--r--fs/overlayfs/super.c9
-rw-r--r--fs/proc/Kconfig15
-rw-r--r--fs/proc/array.c35
-rw-r--r--fs/proc/base.c270
-rw-r--r--fs/proc/fd.c2
-rw-r--r--fs/proc/internal.h4
-rw-r--r--fs/proc/page.c2
-rw-r--r--fs/proc/task_mmu.c39
-rw-r--r--fs/proc/vmcore.c386
-rw-r--r--fs/ubifs/crypto.c10
-rw-r--r--fs/udf/Kconfig6
-rw-r--r--fs/udf/super.c12
-rw-r--r--fs/udf/udfdecl.h3
-rw-r--r--fs/udf/unicode.c260
-rw-r--r--fs/userfaultfd.c22
-rw-r--r--fs/xfs/Kconfig18
-rw-r--r--fs/xfs/Makefile9
-rw-r--r--fs/xfs/libxfs/xfs_ag.c464
-rw-r--r--fs/xfs/libxfs/xfs_ag.h30
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c129
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h23
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c9
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_attr.c4
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c92
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h30
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c9
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_btree.c25
-rw-r--r--fs/xfs/libxfs/xfs_btree.h3
-rw-r--r--fs/xfs/libxfs/xfs_defer.c24
-rw-r--r--fs/xfs/libxfs/xfs_defer.h1
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c94
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h4
-rw-r--r--fs/xfs/libxfs/xfs_format.h7
-rw-r--r--fs/xfs/libxfs/xfs_fs.h9
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h3
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c11
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_quota_defs.h9
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c24
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h7
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c83
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h4
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c26
-rw-r--r--fs/xfs/libxfs/xfs_sb.c147
-rw-r--r--fs/xfs/libxfs/xfs_sb.h16
-rw-r--r--fs/xfs/libxfs/xfs_shared.h16
-rw-r--r--fs/xfs/libxfs/xfs_types.h2
-rw-r--r--fs/xfs/scrub/agheader.c85
-rw-r--r--fs/xfs/scrub/agheader_repair.c70
-rw-r--r--fs/xfs/scrub/alloc.c4
-rw-r--r--fs/xfs/scrub/attr.c3
-rw-r--r--fs/xfs/scrub/bmap.c8
-rw-r--r--fs/xfs/scrub/btree.c42
-rw-r--r--fs/xfs/scrub/common.c131
-rw-r--r--fs/xfs/scrub/common.h32
-rw-r--r--fs/xfs/scrub/dir.c35
-rw-r--r--fs/xfs/scrub/ialloc.c7
-rw-r--r--fs/xfs/scrub/inode.c10
-rw-r--r--fs/xfs/scrub/parent.c19
-rw-r--r--fs/xfs/scrub/quota.c181
-rw-r--r--fs/xfs/scrub/refcount.c10
-rw-r--r--fs/xfs/scrub/repair.c1089
-rw-r--r--fs/xfs/scrub/repair.h132
-rw-r--r--fs/xfs/scrub/rmap.c6
-rw-r--r--fs/xfs/scrub/rtbitmap.c60
-rw-r--r--fs/xfs/scrub/scrub.c142
-rw-r--r--fs/xfs/scrub/scrub.h5
-rw-r--r--fs/xfs/scrub/trace.h258
-rw-r--r--fs/xfs/xfs_aops.c21
-rw-r--r--fs/xfs/xfs_bmap_item.c4
-rw-r--r--fs/xfs/xfs_bmap_util.c10
-rw-r--r--fs/xfs/xfs_buf.c100
-rw-r--r--fs/xfs/xfs_buf.h29
-rw-r--r--fs/xfs/xfs_buf_item.c10
-rw-r--r--fs/xfs/xfs_dquot.c780
-rw-r--r--fs/xfs/xfs_dquot.h22
-rw-r--r--fs/xfs/xfs_dquot_item.c7
-rw-r--r--fs/xfs/xfs_error.c3
-rw-r--r--fs/xfs/xfs_extfree_item.c6
-rw-r--r--fs/xfs/xfs_file.c125
-rw-r--r--fs/xfs/xfs_fsmap.c14
-rw-r--r--fs/xfs/xfs_fsops.c587
-rw-r--r--fs/xfs/xfs_globals.c1
-rw-r--r--fs/xfs/xfs_icache.c94
-rw-r--r--fs/xfs/xfs_icache.h3
-rw-r--r--fs/xfs/xfs_icreate_item.c4
-rw-r--r--fs/xfs/xfs_inode.c47
-rw-r--r--fs/xfs/xfs_inode.h30
-rw-r--r--fs/xfs/xfs_inode_item.c8
-rw-r--r--fs/xfs/xfs_ioctl.c97
-rw-r--r--fs/xfs/xfs_iomap.c186
-rw-r--r--fs/xfs/xfs_iops.c48
-rw-r--r--fs/xfs/xfs_log.c12
-rw-r--r--fs/xfs/xfs_log_cil.c22
-rw-r--r--fs/xfs/xfs_log_recover.c4
-rw-r--r--fs/xfs/xfs_mount.c4
-rw-r--r--fs/xfs/xfs_pnfs.c15
-rw-r--r--fs/xfs/xfs_pnfs.h5
-rw-r--r--fs/xfs/xfs_qm.c254
-rw-r--r--fs/xfs/xfs_qm.h6
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c97
-rw-r--r--fs/xfs/xfs_quota.h22
-rw-r--r--fs/xfs/xfs_quotaops.c8
-rw-r--r--fs/xfs/xfs_refcount_item.c4
-rw-r--r--fs/xfs/xfs_reflink.c18
-rw-r--r--fs/xfs/xfs_rmap_item.c4
-rw-r--r--fs/xfs/xfs_rtalloc.h9
-rw-r--r--fs/xfs/xfs_super.c59
-rw-r--r--fs/xfs/xfs_symlink.c10
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_sysfs.c31
-rw-r--r--fs/xfs/xfs_trace.h72
-rw-r--r--fs/xfs/xfs_trans.c89
-rw-r--r--fs/xfs/xfs_trans.h29
-rw-r--r--fs/xfs/xfs_trans_ail.c50
-rw-r--r--fs/xfs/xfs_trans_bmap.c4
-rw-r--r--fs/xfs/xfs_trans_buf.c24
-rw-r--r--fs/xfs/xfs_trans_dquot.c4
-rw-r--r--fs/xfs/xfs_trans_extfree.c84
-rw-r--r--fs/xfs/xfs_trans_inode.c3
-rw-r--r--fs/xfs/xfs_trans_priv.h11
-rw-r--r--fs/xfs/xfs_trans_refcount.c4
-rw-r--r--fs/xfs/xfs_trans_rmap.c4
219 files changed, 7986 insertions, 3654 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index e622f0f10502..0429c8ee58f1 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -210,12 +210,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
p9_debug(P9_DEBUG_ERROR,
"integer field, but no integer?\n");
ret = r;
- continue;
- }
- v9ses->debug = option;
+ } else {
+ v9ses->debug = option;
#ifdef CONFIG_NET_9P_DEBUG
- p9_debug_level = option;
+ p9_debug_level = option;
#endif
+ }
break;
case Opt_dfltuid:
@@ -231,7 +231,6 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
p9_debug(P9_DEBUG_ERROR,
"uid field, but not a uid?\n");
ret = -EINVAL;
- continue;
}
break;
case Opt_dfltgid:
@@ -247,7 +246,6 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
p9_debug(P9_DEBUG_ERROR,
"gid field, but not a gid?\n");
ret = -EINVAL;
- continue;
}
break;
case Opt_afid:
@@ -256,9 +254,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
p9_debug(P9_DEBUG_ERROR,
"integer field, but no integer?\n");
ret = r;
- continue;
+ } else {
+ v9ses->afid = option;
}
- v9ses->afid = option;
break;
case Opt_uname:
kfree(v9ses->uname);
@@ -306,13 +304,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
"problem allocating copy of cache arg\n");
goto free_and_return;
}
- ret = get_cache_mode(s);
- if (ret == -EINVAL) {
- kfree(s);
- goto free_and_return;
- }
+ r = get_cache_mode(s);
+ if (r < 0)
+ ret = r;
+ else
+ v9ses->cache = r;
- v9ses->cache = ret;
kfree(s);
break;
@@ -341,14 +338,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
pr_info("Unknown access argument %s\n",
s);
kfree(s);
- goto free_and_return;
+ continue;
}
v9ses->uid = make_kuid(current_user_ns(), uid);
if (!uid_valid(v9ses->uid)) {
ret = -EINVAL;
pr_info("Uknown uid %s\n", s);
- kfree(s);
- goto free_and_return;
}
}
diff --git a/fs/Kconfig b/fs/Kconfig
index ac4ac908f001..ab2d96d1abee 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -38,6 +38,7 @@ config FS_DAX
bool "Direct Access (DAX) support"
depends on MMU
depends on !(ARM || MIPS || SPARC)
+ select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
select FS_IOMAP
select DAX
help
@@ -108,6 +109,7 @@ source "fs/notify/Kconfig"
source "fs/quota/Kconfig"
+source "fs/autofs/Kconfig"
source "fs/autofs4/Kconfig"
source "fs/fuse/Kconfig"
source "fs/overlayfs/Kconfig"
@@ -203,6 +205,9 @@ config HUGETLBFS
config HUGETLB_PAGE
def_bool HUGETLBFS
+config MEMFD_CREATE
+ def_bool TMPFS || HUGETLBFS
+
config ARCH_HAS_GIGANTIC_PAGE
bool
diff --git a/fs/Makefile b/fs/Makefile
index c9375fd2c8c4..2e005525cc19 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -102,6 +102,7 @@ obj-$(CONFIG_AFFS_FS) += affs/
obj-$(CONFIG_ROMFS_FS) += romfs/
obj-$(CONFIG_QNX4FS_FS) += qnx4/
obj-$(CONFIG_QNX6FS_FS) += qnx6/
+obj-$(CONFIG_AUTOFS_FS) += autofs/
obj-$(CONFIG_AUTOFS4_FS) += autofs4/
obj-$(CONFIG_ADFS_FS) += adfs/
obj-$(CONFIG_FUSE_FS) += fuse/
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index 7587fb665ff1..2c46c46f3a6d 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -43,8 +43,7 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
_enter("%u,%u,%u", nr, service, port);
- alist = kzalloc(sizeof(*alist) + sizeof(alist->addrs[0]) * nr,
- GFP_KERNEL);
+ alist = kzalloc(struct_size(alist, addrs, nr), GFP_KERNEL);
if (!alist)
return NULL;
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index e065bc0768e6..1faef56b12bd 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -310,6 +310,10 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
case -ETIME:
_debug("no conn");
goto iterate_address;
+
+ case -ECONNRESET:
+ _debug("call reset");
+ goto failed;
}
restart_from_beginning:
diff --git a/fs/aio.c b/fs/aio.c
index b850e92ee0d5..134e5b635d64 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1434,7 +1434,23 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
req->ki_flags = iocb_flags(req->ki_filp);
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
- req->ki_hint = file_write_hint(req->ki_filp);
+ req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp));
+ if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
+ /*
+ * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
+ * aio_reqprio is interpreted as an I/O scheduling
+ * class and priority.
+ */
+ ret = ioprio_check_cap(iocb->aio_reqprio);
+ if (ret) {
+ pr_debug("aio ioprio check cap error: %d\n", ret);
+ return ret;
+ }
+
+ req->ki_ioprio = iocb->aio_reqprio;
+ } else
+ req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+
ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
if (unlikely(ret))
fput(req->ki_filp);
diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig
new file mode 100644
index 000000000000..6a2064eb3b27
--- /dev/null
+++ b/fs/autofs/Kconfig
@@ -0,0 +1,20 @@
+config AUTOFS_FS
+ tristate "Kernel automounter support (supports v3, v4 and v5)"
+ default n
+ help
+ The automounter is a tool to automatically mount remote file systems
+ on demand. This implementation is partially kernel-based to reduce
+ overhead in the already-mounted case; this is unlike the BSD
+ automounter (amd), which is a pure user space daemon.
+
+ To use the automounter you need the user-space tools from
+ <https://www.kernel.org/pub/linux/daemons/autofs/>; you also want
+ to answer Y to "NFS file system support", below.
+
+ To compile this support as a module, choose M here: the module will be
+ called autofs.
+
+ If you are not a part of a fairly large, distributed network or
+ don't have a laptop which needs to dynamically reconfigure to the
+ local network, you probably do not need an automounter, and can say
+ N here.
diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile
new file mode 100644
index 000000000000..43fedde15c26
--- /dev/null
+++ b/fs/autofs/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the linux autofs-filesystem routines.
+#
+
+obj-$(CONFIG_AUTOFS_FS) += autofs.o
+
+autofs-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs/autofs_i.h
index 4737615f0eaa..9400a9f6318a 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -9,7 +9,7 @@
/* Internal header file for autofs */
-#include <linux/auto_fs4.h>
+#include <linux/auto_fs.h>
#include <linux/auto_dev-ioctl.h>
#include <linux/kernel.h>
@@ -25,7 +25,7 @@
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/completion.h>
-#include <asm/current.h>
+#include <linux/file.h>
/* This is the range of ioctl() numbers we claim as ours */
#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY
@@ -122,44 +122,44 @@ struct autofs_sb_info {
struct rcu_head rcu;
};
-static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
+static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
{
return (struct autofs_sb_info *)(sb->s_fs_info);
}
-static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
+static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry)
{
return (struct autofs_info *)(dentry->d_fsdata);
}
-/* autofs4_oz_mode(): do we see the man behind the curtain? (The
+/* autofs_oz_mode(): do we see the man behind the curtain? (The
* processes which do manipulations for us in user space sees the raw
* filesystem without "magic".)
*/
-static inline int autofs4_oz_mode(struct autofs_sb_info *sbi)
+static inline int autofs_oz_mode(struct autofs_sb_info *sbi)
{
return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
}
-struct inode *autofs4_get_inode(struct super_block *, umode_t);
-void autofs4_free_ino(struct autofs_info *);
+struct inode *autofs_get_inode(struct super_block *, umode_t);
+void autofs_free_ino(struct autofs_info *);
/* Expiration */
-int is_autofs4_dentry(struct dentry *);
-int autofs4_expire_wait(const struct path *path, int rcu_walk);
-int autofs4_expire_run(struct super_block *, struct vfsmount *,
- struct autofs_sb_info *,
- struct autofs_packet_expire __user *);
-int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
- struct autofs_sb_info *sbi, int when);
-int autofs4_expire_multi(struct super_block *, struct vfsmount *,
- struct autofs_sb_info *, int __user *);
-struct dentry *autofs4_expire_direct(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi, int how);
-struct dentry *autofs4_expire_indirect(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi, int how);
+int is_autofs_dentry(struct dentry *);
+int autofs_expire_wait(const struct path *path, int rcu_walk);
+int autofs_expire_run(struct super_block *, struct vfsmount *,
+ struct autofs_sb_info *,
+ struct autofs_packet_expire __user *);
+int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+ struct autofs_sb_info *sbi, int when);
+int autofs_expire_multi(struct super_block *, struct vfsmount *,
+ struct autofs_sb_info *, int __user *);
+struct dentry *autofs_expire_direct(struct super_block *sb,
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi, int how);
+struct dentry *autofs_expire_indirect(struct super_block *sb,
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi, int how);
/* Device node initialization */
@@ -168,11 +168,11 @@ void autofs_dev_ioctl_exit(void);
/* Operations structures */
-extern const struct inode_operations autofs4_symlink_inode_operations;
-extern const struct inode_operations autofs4_dir_inode_operations;
-extern const struct file_operations autofs4_dir_operations;
-extern const struct file_operations autofs4_root_operations;
-extern const struct dentry_operations autofs4_dentry_operations;
+extern const struct inode_operations autofs_symlink_inode_operations;
+extern const struct inode_operations autofs_dir_inode_operations;
+extern const struct file_operations autofs_dir_operations;
+extern const struct file_operations autofs_root_operations;
+extern const struct dentry_operations autofs_dentry_operations;
/* VFS automount flags management functions */
static inline void __managed_dentry_set_managed(struct dentry *dentry)
@@ -201,9 +201,9 @@ static inline void managed_dentry_clear_managed(struct dentry *dentry)
/* Initializing function */
-int autofs4_fill_super(struct super_block *, void *, int);
-struct autofs_info *autofs4_new_ino(struct autofs_sb_info *);
-void autofs4_clean_ino(struct autofs_info *);
+int autofs_fill_super(struct super_block *, void *, int);
+struct autofs_info *autofs_new_ino(struct autofs_sb_info *);
+void autofs_clean_ino(struct autofs_info *);
static inline int autofs_prepare_pipe(struct file *pipe)
{
@@ -218,25 +218,25 @@ static inline int autofs_prepare_pipe(struct file *pipe)
/* Queue management functions */
-int autofs4_wait(struct autofs_sb_info *,
+int autofs_wait(struct autofs_sb_info *,
const struct path *, enum autofs_notify);
-int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
-void autofs4_catatonic_mode(struct autofs_sb_info *);
+int autofs_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
+void autofs_catatonic_mode(struct autofs_sb_info *);
-static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
+static inline u32 autofs_get_dev(struct autofs_sb_info *sbi)
{
return new_encode_dev(sbi->sb->s_dev);
}
-static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi)
+static inline u64 autofs_get_ino(struct autofs_sb_info *sbi)
{
return d_inode(sbi->sb->s_root)->i_ino;
}
-static inline void __autofs4_add_expiring(struct dentry *dentry)
+static inline void __autofs_add_expiring(struct dentry *dentry)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
if (ino) {
if (list_empty(&ino->expiring))
@@ -244,10 +244,10 @@ static inline void __autofs4_add_expiring(struct dentry *dentry)
}
}
-static inline void autofs4_add_expiring(struct dentry *dentry)
+static inline void autofs_add_expiring(struct dentry *dentry)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
if (ino) {
spin_lock(&sbi->lookup_lock);
@@ -257,10 +257,10 @@ static inline void autofs4_add_expiring(struct dentry *dentry)
}
}
-static inline void autofs4_del_expiring(struct dentry *dentry)
+static inline void autofs_del_expiring(struct dentry *dentry)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
if (ino) {
spin_lock(&sbi->lookup_lock);
@@ -270,4 +270,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry)
}
}
-void autofs4_kill_sb(struct super_block *);
+void autofs_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 26f6b4f41ce6..ea4ca1445ab7 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -7,23 +7,10 @@
* option, any later version, incorporated herein by reference.
*/
-#include <linux/module.h>
-#include <linux/vmalloc.h>
#include <linux/miscdevice.h>
-#include <linux/init.h>
-#include <linux/wait.h>
-#include <linux/namei.h>
-#include <linux/fcntl.h>
-#include <linux/file.h>
-#include <linux/fdtable.h>
-#include <linux/sched.h>
-#include <linux/cred.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
#include <linux/magic.h>
-#include <linux/dcache.h>
-#include <linux/uaccess.h>
-#include <linux/slab.h>
#include "autofs_i.h"
@@ -166,7 +153,7 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f)
if (f) {
inode = file_inode(f);
- sbi = autofs4_sbi(inode->i_sb);
+ sbi = autofs_sbi(inode->i_sb);
}
return sbi;
}
@@ -236,7 +223,7 @@ static int test_by_dev(const struct path *path, void *p)
static int test_by_type(const struct path *path, void *p)
{
- struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
+ struct autofs_info *ino = autofs_dentry_ino(path->dentry);
return ino && ino->sbi->type & *(unsigned *)p;
}
@@ -324,7 +311,7 @@ static int autofs_dev_ioctl_ready(struct file *fp,
autofs_wqt_t token;
token = (autofs_wqt_t) param->ready.token;
- return autofs4_wait_release(sbi, token, 0);
+ return autofs_wait_release(sbi, token, 0);
}
/*
@@ -340,7 +327,7 @@ static int autofs_dev_ioctl_fail(struct file *fp,
token = (autofs_wqt_t) param->fail.token;
status = param->fail.status < 0 ? param->fail.status : -ENOENT;
- return autofs4_wait_release(sbi, token, status);
+ return autofs_wait_release(sbi, token, status);
}
/*
@@ -412,7 +399,7 @@ static int autofs_dev_ioctl_catatonic(struct file *fp,
struct autofs_sb_info *sbi,
struct autofs_dev_ioctl *param)
{
- autofs4_catatonic_mode(sbi);
+ autofs_catatonic_mode(sbi);
return 0;
}
@@ -459,10 +446,10 @@ static int autofs_dev_ioctl_requester(struct file *fp,
if (err)
goto out;
- ino = autofs4_dentry_ino(path.dentry);
+ ino = autofs_dentry_ino(path.dentry);
if (ino) {
err = 0;
- autofs4_expire_wait(&path, 0);
+ autofs_expire_wait(&path, 0);
spin_lock(&sbi->fs_lock);
param->requester.uid =
from_kuid_munged(current_user_ns(), ino->uid);
@@ -489,7 +476,7 @@ static int autofs_dev_ioctl_expire(struct file *fp,
how = param->expire.how;
mnt = fp->f_path.mnt;
- return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how);
+ return autofs_do_expire_multi(sbi->sb, mnt, sbi, how);
}
/* Check if autofs mount point is in use */
@@ -686,7 +673,7 @@ static int _autofs_dev_ioctl(unsigned int command,
* Admin needs to be able to set the mount catatonic in
* order to be able to perform the re-open.
*/
- if (!autofs4_oz_mode(sbi) &&
+ if (!autofs_oz_mode(sbi) &&
cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) {
err = -EACCES;
fput(fp);
diff --git a/fs/autofs4/expire.c b/fs/autofs/expire.c
index 57725d4a8c59..b332d3f6e730 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs/expire.c
@@ -13,10 +13,10 @@
static unsigned long now;
/* Check if a dentry can be expired */
-static inline int autofs4_can_expire(struct dentry *dentry,
- unsigned long timeout, int do_now)
+static inline int autofs_can_expire(struct dentry *dentry,
+ unsigned long timeout, int do_now)
{
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
/* dentry in the process of being deleted */
if (ino == NULL)
@@ -31,7 +31,7 @@ static inline int autofs4_can_expire(struct dentry *dentry,
}
/* Check a mount point for busyness */
-static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
+static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
{
struct dentry *top = dentry;
struct path path = {.mnt = mnt, .dentry = dentry};
@@ -44,8 +44,8 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
if (!follow_down_one(&path))
goto done;
- if (is_autofs4_dentry(path.dentry)) {
- struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb);
+ if (is_autofs_dentry(path.dentry)) {
+ struct autofs_sb_info *sbi = autofs_sbi(path.dentry->d_sb);
/* This is an autofs submount, we can't expire it */
if (autofs_type_indirect(sbi->type))
@@ -56,7 +56,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
if (!may_umount_tree(path.mnt)) {
struct autofs_info *ino;
- ino = autofs4_dentry_ino(top);
+ ino = autofs_dentry_ino(top);
ino->last_used = jiffies;
goto done;
}
@@ -74,7 +74,7 @@ done:
static struct dentry *get_next_positive_subdir(struct dentry *prev,
struct dentry *root)
{
- struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
+ struct autofs_sb_info *sbi = autofs_sbi(root->d_sb);
struct list_head *next;
struct dentry *q;
@@ -121,7 +121,7 @@ cont:
static struct dentry *get_next_positive_dentry(struct dentry *prev,
struct dentry *root)
{
- struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
+ struct autofs_sb_info *sbi = autofs_sbi(root->d_sb);
struct list_head *next;
struct dentry *p, *ret;
@@ -184,10 +184,10 @@ again:
* The tree is not busy iff no mountpoints are busy and there are no
* autofs submounts.
*/
-static int autofs4_direct_busy(struct vfsmount *mnt,
- struct dentry *top,
- unsigned long timeout,
- int do_now)
+static int autofs_direct_busy(struct vfsmount *mnt,
+ struct dentry *top,
+ unsigned long timeout,
+ int do_now)
{
pr_debug("top %p %pd\n", top, top);
@@ -195,14 +195,14 @@ static int autofs4_direct_busy(struct vfsmount *mnt,
if (!may_umount_tree(mnt)) {
struct autofs_info *ino;
- ino = autofs4_dentry_ino(top);
+ ino = autofs_dentry_ino(top);
if (ino)
ino->last_used = jiffies;
return 1;
}
/* Timeout of a direct mount is determined by its top dentry */
- if (!autofs4_can_expire(top, timeout, do_now))
+ if (!autofs_can_expire(top, timeout, do_now))
return 1;
return 0;
@@ -212,12 +212,12 @@ static int autofs4_direct_busy(struct vfsmount *mnt,
* Check a directory tree of mount points for busyness
* The tree is not busy iff no mountpoints are busy
*/
-static int autofs4_tree_busy(struct vfsmount *mnt,
- struct dentry *top,
- unsigned long timeout,
- int do_now)
+static int autofs_tree_busy(struct vfsmount *mnt,
+ struct dentry *top,
+ unsigned long timeout,
+ int do_now)
{
- struct autofs_info *top_ino = autofs4_dentry_ino(top);
+ struct autofs_info *top_ino = autofs_dentry_ino(top);
struct dentry *p;
pr_debug("top %p %pd\n", top, top);
@@ -237,13 +237,13 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
* If the fs is busy update the expiry counter.
*/
if (d_mountpoint(p)) {
- if (autofs4_mount_busy(mnt, p)) {
+ if (autofs_mount_busy(mnt, p)) {
top_ino->last_used = jiffies;
dput(p);
return 1;
}
} else {
- struct autofs_info *ino = autofs4_dentry_ino(p);
+ struct autofs_info *ino = autofs_dentry_ino(p);
unsigned int ino_count = atomic_read(&ino->count);
/* allow for dget above and top is already dgot */
@@ -261,16 +261,16 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
}
/* Timeout of a tree mount is ultimately determined by its top dentry */
- if (!autofs4_can_expire(top, timeout, do_now))
+ if (!autofs_can_expire(top, timeout, do_now))
return 1;
return 0;
}
-static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
- struct dentry *parent,
- unsigned long timeout,
- int do_now)
+static struct dentry *autofs_check_leaves(struct vfsmount *mnt,
+ struct dentry *parent,
+ unsigned long timeout,
+ int do_now)
{
struct dentry *p;
@@ -282,11 +282,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
if (d_mountpoint(p)) {
/* Can we umount this guy */
- if (autofs4_mount_busy(mnt, p))
+ if (autofs_mount_busy(mnt, p))
continue;
/* Can we expire this guy */
- if (autofs4_can_expire(p, timeout, do_now))
+ if (autofs_can_expire(p, timeout, do_now))
return p;
}
}
@@ -294,10 +294,10 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt,
}
/* Check if we can expire a direct mount (possibly a tree) */
-struct dentry *autofs4_expire_direct(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi,
- int how)
+struct dentry *autofs_expire_direct(struct super_block *sb,
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi,
+ int how)
{
unsigned long timeout;
struct dentry *root = dget(sb->s_root);
@@ -310,9 +310,9 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
now = jiffies;
timeout = sbi->exp_timeout;
- if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
+ if (!autofs_direct_busy(mnt, root, timeout, do_now)) {
spin_lock(&sbi->fs_lock);
- ino = autofs4_dentry_ino(root);
+ ino = autofs_dentry_ino(root);
/* No point expiring a pending mount */
if (ino->flags & AUTOFS_INF_PENDING) {
spin_unlock(&sbi->fs_lock);
@@ -321,7 +321,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
ino->flags |= AUTOFS_INF_WANT_EXPIRE;
spin_unlock(&sbi->fs_lock);
synchronize_rcu();
- if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
+ if (!autofs_direct_busy(mnt, root, timeout, do_now)) {
spin_lock(&sbi->fs_lock);
ino->flags |= AUTOFS_INF_EXPIRING;
init_completion(&ino->expire_complete);
@@ -350,7 +350,7 @@ static struct dentry *should_expire(struct dentry *dentry,
{
int do_now = how & AUTOFS_EXP_IMMEDIATE;
int exp_leaves = how & AUTOFS_EXP_LEAVES;
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
unsigned int ino_count;
/* No point expiring a pending mount */
@@ -367,11 +367,11 @@ static struct dentry *should_expire(struct dentry *dentry,
pr_debug("checking mountpoint %p %pd\n", dentry, dentry);
/* Can we umount this guy */
- if (autofs4_mount_busy(mnt, dentry))
+ if (autofs_mount_busy(mnt, dentry))
return NULL;
/* Can we expire this guy */
- if (autofs4_can_expire(dentry, timeout, do_now))
+ if (autofs_can_expire(dentry, timeout, do_now))
return dentry;
return NULL;
}
@@ -382,7 +382,7 @@ static struct dentry *should_expire(struct dentry *dentry,
* A symlink can't be "busy" in the usual sense so
* just check last used for expire timeout.
*/
- if (autofs4_can_expire(dentry, timeout, do_now))
+ if (autofs_can_expire(dentry, timeout, do_now))
return dentry;
return NULL;
}
@@ -397,7 +397,7 @@ static struct dentry *should_expire(struct dentry *dentry,
if (d_count(dentry) > ino_count)
return NULL;
- if (!autofs4_tree_busy(mnt, dentry, timeout, do_now))
+ if (!autofs_tree_busy(mnt, dentry, timeout, do_now))
return dentry;
/*
* Case 3: pseudo direct mount, expire individual leaves
@@ -411,7 +411,7 @@ static struct dentry *should_expire(struct dentry *dentry,
if (d_count(dentry) > ino_count)
return NULL;
- expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
+ expired = autofs_check_leaves(mnt, dentry, timeout, do_now);
if (expired) {
if (expired == dentry)
dput(dentry);
@@ -427,10 +427,10 @@ static struct dentry *should_expire(struct dentry *dentry,
* - it is unused by any user process
* - it has been unused for exp_timeout time
*/
-struct dentry *autofs4_expire_indirect(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi,
- int how)
+struct dentry *autofs_expire_indirect(struct super_block *sb,
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi,
+ int how)
{
unsigned long timeout;
struct dentry *root = sb->s_root;
@@ -450,7 +450,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
int flags = how;
spin_lock(&sbi->fs_lock);
- ino = autofs4_dentry_ino(dentry);
+ ino = autofs_dentry_ino(dentry);
if (ino->flags & AUTOFS_INF_WANT_EXPIRE) {
spin_unlock(&sbi->fs_lock);
continue;
@@ -462,7 +462,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
continue;
spin_lock(&sbi->fs_lock);
- ino = autofs4_dentry_ino(expired);
+ ino = autofs_dentry_ino(expired);
ino->flags |= AUTOFS_INF_WANT_EXPIRE;
spin_unlock(&sbi->fs_lock);
synchronize_rcu();
@@ -498,11 +498,11 @@ found:
return expired;
}
-int autofs4_expire_wait(const struct path *path, int rcu_walk)
+int autofs_expire_wait(const struct path *path, int rcu_walk)
{
struct dentry *dentry = path->dentry;
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
int status;
int state;
@@ -529,7 +529,7 @@ retry:
pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
- status = autofs4_wait(sbi, path, NFY_NONE);
+ status = autofs_wait(sbi, path, NFY_NONE);
wait_for_completion(&ino->expire_complete);
pr_debug("expire done status=%d\n", status);
@@ -545,10 +545,10 @@ retry:
}
/* Perform an expiry operation */
-int autofs4_expire_run(struct super_block *sb,
- struct vfsmount *mnt,
- struct autofs_sb_info *sbi,
- struct autofs_packet_expire __user *pkt_p)
+int autofs_expire_run(struct super_block *sb,
+ struct vfsmount *mnt,
+ struct autofs_sb_info *sbi,
+ struct autofs_packet_expire __user *pkt_p)
{
struct autofs_packet_expire pkt;
struct autofs_info *ino;
@@ -560,7 +560,7 @@ int autofs4_expire_run(struct super_block *sb,
pkt.hdr.proto_version = sbi->version;
pkt.hdr.type = autofs_ptype_expire;
- dentry = autofs4_expire_indirect(sb, mnt, sbi, 0);
+ dentry = autofs_expire_indirect(sb, mnt, sbi, 0);
if (!dentry)
return -EAGAIN;
@@ -573,7 +573,7 @@ int autofs4_expire_run(struct super_block *sb,
ret = -EFAULT;
spin_lock(&sbi->fs_lock);
- ino = autofs4_dentry_ino(dentry);
+ ino = autofs_dentry_ino(dentry);
/* avoid rapid-fire expire attempts if expiry fails */
ino->last_used = now;
ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE);
@@ -583,25 +583,25 @@ int autofs4_expire_run(struct super_block *sb,
return ret;
}
-int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
- struct autofs_sb_info *sbi, int when)
+int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+ struct autofs_sb_info *sbi, int when)
{
struct dentry *dentry;
int ret = -EAGAIN;
if (autofs_type_trigger(sbi->type))
- dentry = autofs4_expire_direct(sb, mnt, sbi, when);
+ dentry = autofs_expire_direct(sb, mnt, sbi, when);
else
- dentry = autofs4_expire_indirect(sb, mnt, sbi, when);
+ dentry = autofs_expire_indirect(sb, mnt, sbi, when);
if (dentry) {
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
const struct path path = { .mnt = mnt, .dentry = dentry };
/* This is synchronous because it makes the daemon a
* little easier
*/
- ret = autofs4_wait(sbi, &path, NFY_EXPIRE);
+ ret = autofs_wait(sbi, &path, NFY_EXPIRE);
spin_lock(&sbi->fs_lock);
/* avoid rapid-fire expire attempts if expiry fails */
@@ -619,7 +619,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
* Call repeatedly until it returns -EAGAIN, meaning there's nothing
* more to be done.
*/
-int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+int autofs_expire_multi(struct super_block *sb, struct vfsmount *mnt,
struct autofs_sb_info *sbi, int __user *arg)
{
int do_now = 0;
@@ -627,6 +627,5 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
if (arg && get_user(do_now, arg))
return -EFAULT;
- return autofs4_do_expire_multi(sb, mnt, sbi, do_now);
+ return autofs_do_expire_multi(sb, mnt, sbi, do_now);
}
-
diff --git a/fs/autofs4/init.c b/fs/autofs/init.c
index 8cf0e63389ae..16fb61315843 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs/init.c
@@ -13,18 +13,18 @@
static struct dentry *autofs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_nodev(fs_type, flags, data, autofs4_fill_super);
+ return mount_nodev(fs_type, flags, data, autofs_fill_super);
}
static struct file_system_type autofs_fs_type = {
.owner = THIS_MODULE,
.name = "autofs",
.mount = autofs_mount,
- .kill_sb = autofs4_kill_sb,
+ .kill_sb = autofs_kill_sb,
};
MODULE_ALIAS_FS("autofs");
-static int __init init_autofs4_fs(void)
+static int __init init_autofs_fs(void)
{
int err;
@@ -37,12 +37,12 @@ static int __init init_autofs4_fs(void)
return err;
}
-static void __exit exit_autofs4_fs(void)
+static void __exit exit_autofs_fs(void)
{
autofs_dev_ioctl_exit();
unregister_filesystem(&autofs_fs_type);
}
-module_init(init_autofs4_fs)
-module_exit(exit_autofs4_fs)
+module_init(init_autofs_fs)
+module_exit(exit_autofs_fs)
MODULE_LICENSE("GPL");
diff --git a/fs/autofs4/inode.c b/fs/autofs/inode.c
index 09e7d68dff02..b51980fc274e 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs/inode.c
@@ -7,18 +7,14 @@
* option, any later version, incorporated herein by reference.
*/
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/file.h>
#include <linux/seq_file.h>
#include <linux/pagemap.h>
#include <linux/parser.h>
-#include <linux/bitops.h>
#include <linux/magic.h>
+
#include "autofs_i.h"
-#include <linux/module.h>
-struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
+struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi)
{
struct autofs_info *ino;
@@ -32,21 +28,21 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
return ino;
}
-void autofs4_clean_ino(struct autofs_info *ino)
+void autofs_clean_ino(struct autofs_info *ino)
{
ino->uid = GLOBAL_ROOT_UID;
ino->gid = GLOBAL_ROOT_GID;
ino->last_used = jiffies;
}
-void autofs4_free_ino(struct autofs_info *ino)
+void autofs_free_ino(struct autofs_info *ino)
{
kfree(ino);
}
-void autofs4_kill_sb(struct super_block *sb)
+void autofs_kill_sb(struct super_block *sb)
{
- struct autofs_sb_info *sbi = autofs4_sbi(sb);
+ struct autofs_sb_info *sbi = autofs_sbi(sb);
/*
* In the event of a failure in get_sb_nodev the superblock
@@ -56,7 +52,7 @@ void autofs4_kill_sb(struct super_block *sb)
*/
if (sbi) {
/* Free wait queues, close pipe */
- autofs4_catatonic_mode(sbi);
+ autofs_catatonic_mode(sbi);
put_pid(sbi->oz_pgrp);
}
@@ -66,9 +62,9 @@ void autofs4_kill_sb(struct super_block *sb)
kfree_rcu(sbi, rcu);
}
-static int autofs4_show_options(struct seq_file *m, struct dentry *root)
+static int autofs_show_options(struct seq_file *m, struct dentry *root)
{
- struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
+ struct autofs_sb_info *sbi = autofs_sbi(root->d_sb);
struct inode *root_inode = d_inode(root->d_sb->s_root);
if (!sbi)
@@ -101,16 +97,16 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
-static void autofs4_evict_inode(struct inode *inode)
+static void autofs_evict_inode(struct inode *inode)
{
clear_inode(inode);
kfree(inode->i_private);
}
-static const struct super_operations autofs4_sops = {
+static const struct super_operations autofs_sops = {
.statfs = simple_statfs,
- .show_options = autofs4_show_options,
- .evict_inode = autofs4_evict_inode,
+ .show_options = autofs_show_options,
+ .evict_inode = autofs_evict_inode,
};
enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
@@ -206,7 +202,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
return (*pipefd < 0);
}
-int autofs4_fill_super(struct super_block *s, void *data, int silent)
+int autofs_fill_super(struct super_block *s, void *data, int silent)
{
struct inode *root_inode;
struct dentry *root;
@@ -246,19 +242,19 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
s->s_magic = AUTOFS_SUPER_MAGIC;
- s->s_op = &autofs4_sops;
- s->s_d_op = &autofs4_dentry_operations;
+ s->s_op = &autofs_sops;
+ s->s_d_op = &autofs_dentry_operations;
s->s_time_gran = 1;
/*
* Get the root inode and dentry, but defer checking for errors.
*/
- ino = autofs4_new_ino(sbi);
+ ino = autofs_new_ino(sbi);
if (!ino) {
ret = -ENOMEM;
goto fail_free;
}
- root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
+ root_inode = autofs_get_inode(s, S_IFDIR | 0755);
root = d_make_root(root_inode);
if (!root)
goto fail_ino;
@@ -305,8 +301,8 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
if (autofs_type_trigger(sbi->type))
__managed_dentry_set_managed(root);
- root_inode->i_fop = &autofs4_root_operations;
- root_inode->i_op = &autofs4_dir_inode_operations;
+ root_inode->i_fop = &autofs_root_operations;
+ root_inode->i_op = &autofs_dir_inode_operations;
pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp));
pipe = fget(pipefd);
@@ -340,14 +336,14 @@ fail_dput:
dput(root);
goto fail_free;
fail_ino:
- autofs4_free_ino(ino);
+ autofs_free_ino(ino);
fail_free:
kfree(sbi);
s->s_fs_info = NULL;
return ret;
}
-struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode)
+struct inode *autofs_get_inode(struct super_block *sb, umode_t mode)
{
struct inode *inode = new_inode(sb);
@@ -364,10 +360,10 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode)
if (S_ISDIR(mode)) {
set_nlink(inode, 2);
- inode->i_op = &autofs4_dir_inode_operations;
- inode->i_fop = &autofs4_dir_operations;
+ inode->i_op = &autofs_dir_inode_operations;
+ inode->i_fop = &autofs_dir_operations;
} else if (S_ISLNK(mode)) {
- inode->i_op = &autofs4_symlink_inode_operations;
+ inode->i_op = &autofs_symlink_inode_operations;
} else
WARN_ON(1);
diff --git a/fs/autofs4/root.c b/fs/autofs/root.c
index b12e37f27530..a3d414150578 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs/root.c
@@ -9,72 +9,66 @@
*/
#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/stat.h>
-#include <linux/slab.h>
-#include <linux/param.h>
-#include <linux/time.h>
#include <linux/compat.h>
-#include <linux/mutex.h>
#include "autofs_i.h"
-static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *);
-static int autofs4_dir_unlink(struct inode *, struct dentry *);
-static int autofs4_dir_rmdir(struct inode *, struct dentry *);
-static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t);
-static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long);
+static int autofs_dir_symlink(struct inode *, struct dentry *, const char *);
+static int autofs_dir_unlink(struct inode *, struct dentry *);
+static int autofs_dir_rmdir(struct inode *, struct dentry *);
+static int autofs_dir_mkdir(struct inode *, struct dentry *, umode_t);
+static long autofs_root_ioctl(struct file *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
-static long autofs4_root_compat_ioctl(struct file *,
- unsigned int, unsigned long);
+static long autofs_root_compat_ioctl(struct file *,
+ unsigned int, unsigned long);
#endif
-static int autofs4_dir_open(struct inode *inode, struct file *file);
-static struct dentry *autofs4_lookup(struct inode *,
- struct dentry *, unsigned int);
-static struct vfsmount *autofs4_d_automount(struct path *);
-static int autofs4_d_manage(const struct path *, bool);
-static void autofs4_dentry_release(struct dentry *);
-
-const struct file_operations autofs4_root_operations = {
+static int autofs_dir_open(struct inode *inode, struct file *file);
+static struct dentry *autofs_lookup(struct inode *,
+ struct dentry *, unsigned int);
+static struct vfsmount *autofs_d_automount(struct path *);
+static int autofs_d_manage(const struct path *, bool);
+static void autofs_dentry_release(struct dentry *);
+
+const struct file_operations autofs_root_operations = {
.open = dcache_dir_open,
.release = dcache_dir_close,
.read = generic_read_dir,
.iterate_shared = dcache_readdir,
.llseek = dcache_dir_lseek,
- .unlocked_ioctl = autofs4_root_ioctl,
+ .unlocked_ioctl = autofs_root_ioctl,
#ifdef CONFIG_COMPAT
- .compat_ioctl = autofs4_root_compat_ioctl,
+ .compat_ioctl = autofs_root_compat_ioctl,
#endif
};
-const struct file_operations autofs4_dir_operations = {
- .open = autofs4_dir_open,
+const struct file_operations autofs_dir_operations = {
+ .open = autofs_dir_open,
.release = dcache_dir_close,
.read = generic_read_dir,
.iterate_shared = dcache_readdir,
.llseek = dcache_dir_lseek,
};
-const struct inode_operations autofs4_dir_inode_operations = {
- .lookup = autofs4_lookup,
- .unlink = autofs4_dir_unlink,
- .symlink = autofs4_dir_symlink,
- .mkdir = autofs4_dir_mkdir,
- .rmdir = autofs4_dir_rmdir,
+const struct inode_operations autofs_dir_inode_operations = {
+ .lookup = autofs_lookup,
+ .unlink = autofs_dir_unlink,
+ .symlink = autofs_dir_symlink,
+ .mkdir = autofs_dir_mkdir,
+ .rmdir = autofs_dir_rmdir,
};
-const struct dentry_operations autofs4_dentry_operations = {
- .d_automount = autofs4_d_automount,
- .d_manage = autofs4_d_manage,
- .d_release = autofs4_dentry_release,
+const struct dentry_operations autofs_dentry_operations = {
+ .d_automount = autofs_d_automount,
+ .d_manage = autofs_d_manage,
+ .d_release = autofs_dentry_release,
};
-static void autofs4_add_active(struct dentry *dentry)
+static void autofs_add_active(struct dentry *dentry)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+ struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
struct autofs_info *ino;
- ino = autofs4_dentry_ino(dentry);
+ ino = autofs_dentry_ino(dentry);
if (ino) {
spin_lock(&sbi->lookup_lock);
if (!ino->active_count) {
@@ -86,12 +80,12 @@ static void autofs4_add_active(struct dentry *dentry)
}
}
-static void autofs4_del_active(struct dentry *dentry)
+static void autofs_del_active(struct dentry *dentry)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+ struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
struct autofs_info *ino;
- ino = autofs4_dentry_ino(dentry);
+ ino = autofs_dentry_ino(dentry);
if (ino) {
spin_lock(&sbi->lookup_lock);
ino->active_count--;
@@ -103,14 +97,14 @@ static void autofs4_del_active(struct dentry *dentry)
}
}
-static int autofs4_dir_open(struct inode *inode, struct file *file)
+static int autofs_dir_open(struct inode *inode, struct file *file)
{
struct dentry *dentry = file->f_path.dentry;
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+ struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry);
- if (autofs4_oz_mode(sbi))
+ if (autofs_oz_mode(sbi))
goto out;
/*
@@ -133,10 +127,10 @@ out:
return dcache_dir_open(inode, file);
}
-static void autofs4_dentry_release(struct dentry *de)
+static void autofs_dentry_release(struct dentry *de)
{
- struct autofs_info *ino = autofs4_dentry_ino(de);
- struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
+ struct autofs_info *ino = autofs_dentry_ino(de);
+ struct autofs_sb_info *sbi = autofs_sbi(de->d_sb);
pr_debug("releasing %p\n", de);
@@ -152,12 +146,12 @@ static void autofs4_dentry_release(struct dentry *de)
spin_unlock(&sbi->lookup_lock);
}
- autofs4_free_ino(ino);
+ autofs_free_ino(ino);
}
-static struct dentry *autofs4_lookup_active(struct dentry *dentry)
+static struct dentry *autofs_lookup_active(struct dentry *dentry)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+ struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
struct dentry *parent = dentry->d_parent;
const struct qstr *name = &dentry->d_name;
unsigned int len = name->len;
@@ -209,10 +203,10 @@ next:
return NULL;
}
-static struct dentry *autofs4_lookup_expiring(struct dentry *dentry,
- bool rcu_walk)
+static struct dentry *autofs_lookup_expiring(struct dentry *dentry,
+ bool rcu_walk)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+ struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
struct dentry *parent = dentry->d_parent;
const struct qstr *name = &dentry->d_name;
unsigned int len = name->len;
@@ -269,17 +263,17 @@ next:
return NULL;
}
-static int autofs4_mount_wait(const struct path *path, bool rcu_walk)
+static int autofs_mount_wait(const struct path *path, bool rcu_walk)
{
- struct autofs_sb_info *sbi = autofs4_sbi(path->dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
+ struct autofs_sb_info *sbi = autofs_sbi(path->dentry->d_sb);
+ struct autofs_info *ino = autofs_dentry_ino(path->dentry);
int status = 0;
if (ino->flags & AUTOFS_INF_PENDING) {
if (rcu_walk)
return -ECHILD;
pr_debug("waiting for mount name=%pd\n", path->dentry);
- status = autofs4_wait(sbi, path, NFY_MOUNT);
+ status = autofs_wait(sbi, path, NFY_MOUNT);
pr_debug("mount wait done status=%d\n", status);
}
ino->last_used = jiffies;
@@ -291,11 +285,11 @@ static int do_expire_wait(const struct path *path, bool rcu_walk)
struct dentry *dentry = path->dentry;
struct dentry *expiring;
- expiring = autofs4_lookup_expiring(dentry, rcu_walk);
+ expiring = autofs_lookup_expiring(dentry, rcu_walk);
if (IS_ERR(expiring))
return PTR_ERR(expiring);
if (!expiring)
- return autofs4_expire_wait(path, rcu_walk);
+ return autofs_expire_wait(path, rcu_walk);
else {
const struct path this = { .mnt = path->mnt, .dentry = expiring };
/*
@@ -303,17 +297,17 @@ static int do_expire_wait(const struct path *path, bool rcu_walk)
* be quite complete, but the directory has been removed
* so it must have been successful, just wait for it.
*/
- autofs4_expire_wait(&this, 0);
- autofs4_del_expiring(expiring);
+ autofs_expire_wait(&this, 0);
+ autofs_del_expiring(expiring);
dput(expiring);
}
return 0;
}
-static struct dentry *autofs4_mountpoint_changed(struct path *path)
+static struct dentry *autofs_mountpoint_changed(struct path *path)
{
struct dentry *dentry = path->dentry;
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+ struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
/*
* If this is an indirect mount the dentry could have gone away
@@ -327,7 +321,7 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
new = d_lookup(parent, &dentry->d_name);
if (!new)
return NULL;
- ino = autofs4_dentry_ino(new);
+ ino = autofs_dentry_ino(new);
ino->last_used = jiffies;
dput(path->dentry);
path->dentry = new;
@@ -335,17 +329,17 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
return path->dentry;
}
-static struct vfsmount *autofs4_d_automount(struct path *path)
+static struct vfsmount *autofs_d_automount(struct path *path)
{
struct dentry *dentry = path->dentry;
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
int status;
pr_debug("dentry=%p %pd\n", dentry, dentry);
/* The daemon never triggers a mount. */
- if (autofs4_oz_mode(sbi))
+ if (autofs_oz_mode(sbi))
return NULL;
/*
@@ -364,7 +358,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
spin_lock(&sbi->fs_lock);
if (ino->flags & AUTOFS_INF_PENDING) {
spin_unlock(&sbi->fs_lock);
- status = autofs4_mount_wait(path, 0);
+ status = autofs_mount_wait(path, 0);
if (status)
return ERR_PTR(status);
goto done;
@@ -405,7 +399,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
}
ino->flags |= AUTOFS_INF_PENDING;
spin_unlock(&sbi->fs_lock);
- status = autofs4_mount_wait(path, 0);
+ status = autofs_mount_wait(path, 0);
spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_PENDING;
if (status) {
@@ -416,24 +410,24 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
spin_unlock(&sbi->fs_lock);
done:
/* Mount succeeded, check if we ended up with a new dentry */
- dentry = autofs4_mountpoint_changed(path);
+ dentry = autofs_mountpoint_changed(path);
if (!dentry)
return ERR_PTR(-ENOENT);
return NULL;
}
-static int autofs4_d_manage(const struct path *path, bool rcu_walk)
+static int autofs_d_manage(const struct path *path, bool rcu_walk)
{
struct dentry *dentry = path->dentry;
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
int status;
pr_debug("dentry=%p %pd\n", dentry, dentry);
/* The daemon never waits. */
- if (autofs4_oz_mode(sbi)) {
+ if (autofs_oz_mode(sbi)) {
if (!path_is_mountpoint(path))
return -EISDIR;
return 0;
@@ -447,7 +441,7 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk)
* This dentry may be under construction so wait on mount
* completion.
*/
- status = autofs4_mount_wait(path, rcu_walk);
+ status = autofs_mount_wait(path, rcu_walk);
if (status)
return status;
@@ -500,8 +494,8 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk)
}
/* Lookups in the root directory */
-static struct dentry *autofs4_lookup(struct inode *dir,
- struct dentry *dentry, unsigned int flags)
+static struct dentry *autofs_lookup(struct inode *dir,
+ struct dentry *dentry, unsigned int flags)
{
struct autofs_sb_info *sbi;
struct autofs_info *ino;
@@ -513,13 +507,13 @@ static struct dentry *autofs4_lookup(struct inode *dir,
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
- sbi = autofs4_sbi(dir->i_sb);
+ sbi = autofs_sbi(dir->i_sb);
pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n",
current->pid, task_pgrp_nr(current), sbi->catatonic,
- autofs4_oz_mode(sbi));
+ autofs_oz_mode(sbi));
- active = autofs4_lookup_active(dentry);
+ active = autofs_lookup_active(dentry);
if (active)
return active;
else {
@@ -529,7 +523,7 @@ static struct dentry *autofs4_lookup(struct inode *dir,
* can return fail immediately. The daemon however does need
* to create directories within the file system.
*/
- if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent))
+ if (!autofs_oz_mode(sbi) && !IS_ROOT(dentry->d_parent))
return ERR_PTR(-ENOENT);
/* Mark entries in the root as mount triggers */
@@ -537,24 +531,24 @@ static struct dentry *autofs4_lookup(struct inode *dir,
autofs_type_indirect(sbi->type))
__managed_dentry_set_managed(dentry);
- ino = autofs4_new_ino(sbi);
+ ino = autofs_new_ino(sbi);
if (!ino)
return ERR_PTR(-ENOMEM);
dentry->d_fsdata = ino;
ino->dentry = dentry;
- autofs4_add_active(dentry);
+ autofs_add_active(dentry);
}
return NULL;
}
-static int autofs4_dir_symlink(struct inode *dir,
+static int autofs_dir_symlink(struct inode *dir,
struct dentry *dentry,
const char *symname)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
struct autofs_info *p_ino;
struct inode *inode;
size_t size = strlen(symname);
@@ -562,14 +556,14 @@ static int autofs4_dir_symlink(struct inode *dir,
pr_debug("%s <- %pd\n", symname, dentry);
- if (!autofs4_oz_mode(sbi))
+ if (!autofs_oz_mode(sbi))
return -EACCES;
BUG_ON(!ino);
- autofs4_clean_ino(ino);
+ autofs_clean_ino(ino);
- autofs4_del_active(dentry);
+ autofs_del_active(dentry);
cp = kmalloc(size + 1, GFP_KERNEL);
if (!cp)
@@ -577,7 +571,7 @@ static int autofs4_dir_symlink(struct inode *dir,
strcpy(cp, symname);
- inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555);
+ inode = autofs_get_inode(dir->i_sb, S_IFLNK | 0555);
if (!inode) {
kfree(cp);
return -ENOMEM;
@@ -588,7 +582,7 @@ static int autofs4_dir_symlink(struct inode *dir,
dget(dentry);
atomic_inc(&ino->count);
- p_ino = autofs4_dentry_ino(dentry->d_parent);
+ p_ino = autofs_dentry_ino(dentry->d_parent);
if (p_ino && !IS_ROOT(dentry))
atomic_inc(&p_ino->count);
@@ -610,20 +604,20 @@ static int autofs4_dir_symlink(struct inode *dir,
* If a process is blocked on the dentry waiting for the expire to finish,
* it will invalidate the dentry and try to mount with a new one.
*
- * Also see autofs4_dir_rmdir()..
+ * Also see autofs_dir_rmdir()..
*/
-static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
+static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
struct autofs_info *p_ino;
/* This allows root to remove symlinks */
- if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+ if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EPERM;
if (atomic_dec_and_test(&ino->count)) {
- p_ino = autofs4_dentry_ino(dentry->d_parent);
+ p_ino = autofs_dentry_ino(dentry->d_parent);
if (p_ino && !IS_ROOT(dentry))
atomic_dec(&p_ino->count);
}
@@ -635,7 +629,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
dir->i_mtime = current_time(dir);
spin_lock(&sbi->lookup_lock);
- __autofs4_add_expiring(dentry);
+ __autofs_add_expiring(dentry);
d_drop(dentry);
spin_unlock(&sbi->lookup_lock);
@@ -692,15 +686,15 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
managed_dentry_set_managed(parent);
}
-static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
+static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
struct autofs_info *p_ino;
pr_debug("dentry %p, removing %pd\n", dentry, dentry);
- if (!autofs4_oz_mode(sbi))
+ if (!autofs_oz_mode(sbi))
return -EACCES;
spin_lock(&sbi->lookup_lock);
@@ -708,7 +702,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
spin_unlock(&sbi->lookup_lock);
return -ENOTEMPTY;
}
- __autofs4_add_expiring(dentry);
+ __autofs_add_expiring(dentry);
d_drop(dentry);
spin_unlock(&sbi->lookup_lock);
@@ -716,7 +710,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
autofs_clear_leaf_automount_flags(dentry);
if (atomic_dec_and_test(&ino->count)) {
- p_ino = autofs4_dentry_ino(dentry->d_parent);
+ p_ino = autofs_dentry_ino(dentry->d_parent);
if (p_ino && dentry->d_parent != dentry)
atomic_dec(&p_ino->count);
}
@@ -730,26 +724,26 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
return 0;
}
-static int autofs4_dir_mkdir(struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static int autofs_dir_mkdir(struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
- struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
+ struct autofs_info *ino = autofs_dentry_ino(dentry);
struct autofs_info *p_ino;
struct inode *inode;
- if (!autofs4_oz_mode(sbi))
+ if (!autofs_oz_mode(sbi))
return -EACCES;
pr_debug("dentry %p, creating %pd\n", dentry, dentry);
BUG_ON(!ino);
- autofs4_clean_ino(ino);
+ autofs_clean_ino(ino);
- autofs4_del_active(dentry);
+ autofs_del_active(dentry);
- inode = autofs4_get_inode(dir->i_sb, S_IFDIR | mode);
+ inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode);
if (!inode)
return -ENOMEM;
d_add(dentry, inode);
@@ -759,7 +753,7 @@ static int autofs4_dir_mkdir(struct inode *dir,
dget(dentry);
atomic_inc(&ino->count);
- p_ino = autofs4_dentry_ino(dentry->d_parent);
+ p_ino = autofs_dentry_ino(dentry->d_parent);
if (p_ino && !IS_ROOT(dentry))
atomic_inc(&p_ino->count);
inc_nlink(dir);
@@ -770,7 +764,7 @@ static int autofs4_dir_mkdir(struct inode *dir,
/* Get/set timeout ioctl() operation */
#ifdef CONFIG_COMPAT
-static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
+static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi,
compat_ulong_t __user *p)
{
unsigned long ntimeout;
@@ -795,7 +789,7 @@ error:
}
#endif
-static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
+static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi,
unsigned long __user *p)
{
unsigned long ntimeout;
@@ -820,14 +814,14 @@ error:
}
/* Return protocol version */
-static inline int autofs4_get_protover(struct autofs_sb_info *sbi,
+static inline int autofs_get_protover(struct autofs_sb_info *sbi,
int __user *p)
{
return put_user(sbi->version, p);
}
/* Return protocol sub version */
-static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi,
+static inline int autofs_get_protosubver(struct autofs_sb_info *sbi,
int __user *p)
{
return put_user(sbi->sub_version, p);
@@ -836,7 +830,7 @@ static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi,
/*
* Tells the daemon whether it can umount the autofs mount.
*/
-static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
+static inline int autofs_ask_umount(struct vfsmount *mnt, int __user *p)
{
int status = 0;
@@ -850,14 +844,14 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
return status;
}
-/* Identify autofs4_dentries - this is so we can tell if there's
+/* Identify autofs_dentries - this is so we can tell if there's
* an extra dentry refcount or not. We only hold a refcount on the
* dentry if its non-negative (ie, d_inode != NULL)
*/
-int is_autofs4_dentry(struct dentry *dentry)
+int is_autofs_dentry(struct dentry *dentry)
{
return dentry && d_really_is_positive(dentry) &&
- dentry->d_op == &autofs4_dentry_operations &&
+ dentry->d_op == &autofs_dentry_operations &&
dentry->d_fsdata != NULL;
}
@@ -865,10 +859,10 @@ int is_autofs4_dentry(struct dentry *dentry)
* ioctl()'s on the root directory is the chief method for the daemon to
* generate kernel reactions
*/
-static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
+static int autofs_root_ioctl_unlocked(struct inode *inode, struct file *filp,
unsigned int cmd, unsigned long arg)
{
- struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
+ struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb);
void __user *p = (void __user *)arg;
pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n",
@@ -878,64 +872,63 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
_IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
return -ENOTTY;
- if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+ if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
return -EPERM;
switch (cmd) {
case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */
- return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0);
+ return autofs_wait_release(sbi, (autofs_wqt_t) arg, 0);
case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */
- return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT);
+ return autofs_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT);
case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
- autofs4_catatonic_mode(sbi);
+ autofs_catatonic_mode(sbi);
return 0;
case AUTOFS_IOC_PROTOVER: /* Get protocol version */
- return autofs4_get_protover(sbi, p);
+ return autofs_get_protover(sbi, p);
case AUTOFS_IOC_PROTOSUBVER: /* Get protocol sub version */
- return autofs4_get_protosubver(sbi, p);
+ return autofs_get_protosubver(sbi, p);
case AUTOFS_IOC_SETTIMEOUT:
- return autofs4_get_set_timeout(sbi, p);
+ return autofs_get_set_timeout(sbi, p);
#ifdef CONFIG_COMPAT
case AUTOFS_IOC_SETTIMEOUT32:
- return autofs4_compat_get_set_timeout(sbi, p);
+ return autofs_compat_get_set_timeout(sbi, p);
#endif
case AUTOFS_IOC_ASKUMOUNT:
- return autofs4_ask_umount(filp->f_path.mnt, p);
+ return autofs_ask_umount(filp->f_path.mnt, p);
/* return a single thing to expire */
case AUTOFS_IOC_EXPIRE:
- return autofs4_expire_run(inode->i_sb,
- filp->f_path.mnt, sbi, p);
+ return autofs_expire_run(inode->i_sb, filp->f_path.mnt, sbi, p);
/* same as above, but can send multiple expires through pipe */
case AUTOFS_IOC_EXPIRE_MULTI:
- return autofs4_expire_multi(inode->i_sb,
- filp->f_path.mnt, sbi, p);
+ return autofs_expire_multi(inode->i_sb,
+ filp->f_path.mnt, sbi, p);
default:
return -EINVAL;
}
}
-static long autofs4_root_ioctl(struct file *filp,
+static long autofs_root_ioctl(struct file *filp,
unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
- return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
+ return autofs_root_ioctl_unlocked(inode, filp, cmd, arg);
}
#ifdef CONFIG_COMPAT
-static long autofs4_root_compat_ioctl(struct file *filp,
+static long autofs_root_compat_ioctl(struct file *filp,
unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
int ret;
if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL)
- ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
+ ret = autofs_root_ioctl_unlocked(inode, filp, cmd, arg);
else
- ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
+ ret = autofs_root_ioctl_unlocked(inode, filp, cmd,
(unsigned long) compat_ptr(arg));
return ret;
diff --git a/fs/autofs4/symlink.c b/fs/autofs/symlink.c
index ab0b4285a202..aad3902c0cc1 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs/symlink.c
@@ -8,22 +8,22 @@
#include "autofs_i.h"
-static const char *autofs4_get_link(struct dentry *dentry,
- struct inode *inode,
- struct delayed_call *done)
+static const char *autofs_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
struct autofs_sb_info *sbi;
struct autofs_info *ino;
if (!dentry)
return ERR_PTR(-ECHILD);
- sbi = autofs4_sbi(dentry->d_sb);
- ino = autofs4_dentry_ino(dentry);
- if (ino && !autofs4_oz_mode(sbi))
+ sbi = autofs_sbi(dentry->d_sb);
+ ino = autofs_dentry_ino(dentry);
+ if (ino && !autofs_oz_mode(sbi))
ino->last_used = jiffies;
return d_inode(dentry)->i_private;
}
-const struct inode_operations autofs4_symlink_inode_operations = {
- .get_link = autofs4_get_link
+const struct inode_operations autofs_symlink_inode_operations = {
+ .get_link = autofs_get_link
};
diff --git a/fs/autofs4/waitq.c b/fs/autofs/waitq.c
index be9c3dc048ab..f6385c6ef0a5 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs/waitq.c
@@ -7,19 +7,15 @@
* option, any later version, incorporated herein by reference.
*/
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/signal.h>
#include <linux/sched/signal.h>
-#include <linux/file.h>
#include "autofs_i.h"
/* We make this a static variable rather than a part of the superblock; it
* is better if we don't reassign numbers easily even across filesystems
*/
-static autofs_wqt_t autofs4_next_wait_queue = 1;
+static autofs_wqt_t autofs_next_wait_queue = 1;
-void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
+void autofs_catatonic_mode(struct autofs_sb_info *sbi)
{
struct autofs_wait_queue *wq, *nwq;
@@ -49,8 +45,8 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
mutex_unlock(&sbi->wq_mutex);
}
-static int autofs4_write(struct autofs_sb_info *sbi,
- struct file *file, const void *addr, int bytes)
+static int autofs_write(struct autofs_sb_info *sbi,
+ struct file *file, const void *addr, int bytes)
{
unsigned long sigpipe, flags;
const char *data = (const char *)addr;
@@ -82,7 +78,7 @@ static int autofs4_write(struct autofs_sb_info *sbi,
return bytes == 0 ? 0 : wr < 0 ? wr : -EIO;
}
-static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
+static void autofs_notify_daemon(struct autofs_sb_info *sbi,
struct autofs_wait_queue *wq,
int type)
{
@@ -167,23 +163,23 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
mutex_unlock(&sbi->wq_mutex);
- switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) {
+ switch (ret = autofs_write(sbi, pipe, &pkt, pktsz)) {
case 0:
break;
case -ENOMEM:
case -ERESTARTSYS:
/* Just fail this one */
- autofs4_wait_release(sbi, wq->wait_queue_token, ret);
+ autofs_wait_release(sbi, wq->wait_queue_token, ret);
break;
default:
- autofs4_catatonic_mode(sbi);
+ autofs_catatonic_mode(sbi);
break;
}
fput(pipe);
}
-static int autofs4_getpath(struct autofs_sb_info *sbi,
- struct dentry *dentry, char **name)
+static int autofs_getpath(struct autofs_sb_info *sbi,
+ struct dentry *dentry, char *name)
{
struct dentry *root = sbi->sb->s_root;
struct dentry *tmp;
@@ -193,7 +189,7 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
unsigned seq;
rename_retry:
- buf = *name;
+ buf = name;
len = 0;
seq = read_seqbegin(&rename_lock);
@@ -228,7 +224,7 @@ rename_retry:
}
static struct autofs_wait_queue *
-autofs4_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr)
+autofs_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr)
{
struct autofs_wait_queue *wq;
@@ -263,7 +259,7 @@ static int validate_request(struct autofs_wait_queue **wait,
return -ENOENT;
/* Wait in progress, continue; */
- wq = autofs4_find_wait(sbi, qstr);
+ wq = autofs_find_wait(sbi, qstr);
if (wq) {
*wait = wq;
return 1;
@@ -272,7 +268,7 @@ static int validate_request(struct autofs_wait_queue **wait,
*wait = NULL;
/* If we don't yet have any info this is a new request */
- ino = autofs4_dentry_ino(dentry);
+ ino = autofs_dentry_ino(dentry);
if (!ino)
return 1;
@@ -297,7 +293,7 @@ static int validate_request(struct autofs_wait_queue **wait,
if (sbi->catatonic)
return -ENOENT;
- wq = autofs4_find_wait(sbi, qstr);
+ wq = autofs_find_wait(sbi, qstr);
if (wq) {
*wait = wq;
return 1;
@@ -351,7 +347,7 @@ static int validate_request(struct autofs_wait_queue **wait,
return 1;
}
-int autofs4_wait(struct autofs_sb_info *sbi,
+int autofs_wait(struct autofs_sb_info *sbi,
const struct path *path, enum autofs_notify notify)
{
struct dentry *dentry = path->dentry;
@@ -399,7 +395,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type))
qstr.len = sprintf(name, "%p", dentry);
else {
- qstr.len = autofs4_getpath(sbi, dentry, &name);
+ qstr.len = autofs_getpath(sbi, dentry, name);
if (!qstr.len) {
kfree(name);
return -ENOENT;
@@ -430,15 +426,15 @@ int autofs4_wait(struct autofs_sb_info *sbi,
return -ENOMEM;
}
- wq->wait_queue_token = autofs4_next_wait_queue;
- if (++autofs4_next_wait_queue == 0)
- autofs4_next_wait_queue = 1;
+ wq->wait_queue_token = autofs_next_wait_queue;
+ if (++autofs_next_wait_queue == 0)
+ autofs_next_wait_queue = 1;
wq->next = sbi->queues;
sbi->queues = wq;
init_waitqueue_head(&wq->queue);
memcpy(&wq->name, &qstr, sizeof(struct qstr));
- wq->dev = autofs4_get_dev(sbi);
- wq->ino = autofs4_get_ino(sbi);
+ wq->dev = autofs_get_dev(sbi);
+ wq->ino = autofs_get_ino(sbi);
wq->uid = current_uid();
wq->gid = current_gid();
wq->pid = pid;
@@ -467,9 +463,9 @@ int autofs4_wait(struct autofs_sb_info *sbi,
wq->name.name, notify);
/*
- * autofs4_notify_daemon() may block; it will unlock ->wq_mutex
+ * autofs_notify_daemon() may block; it will unlock ->wq_mutex
*/
- autofs4_notify_daemon(sbi, wq, type);
+ autofs_notify_daemon(sbi, wq, type);
} else {
wq->wait_ctr++;
pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n",
@@ -500,12 +496,12 @@ int autofs4_wait(struct autofs_sb_info *sbi,
struct dentry *de = NULL;
/* direct mount or browsable map */
- ino = autofs4_dentry_ino(dentry);
+ ino = autofs_dentry_ino(dentry);
if (!ino) {
/* If not lookup actual dentry used */
de = d_lookup(dentry->d_parent, &dentry->d_name);
if (de)
- ino = autofs4_dentry_ino(de);
+ ino = autofs_dentry_ino(de);
}
/* Set mount requester */
@@ -530,7 +526,8 @@ int autofs4_wait(struct autofs_sb_info *sbi,
}
-int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status)
+int autofs_wait_release(struct autofs_sb_info *sbi,
+ autofs_wqt_t wait_queue_token, int status)
{
struct autofs_wait_queue *wq, **wql;
diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig
index 44727bf18297..99fda4d6da25 100644
--- a/fs/autofs4/Kconfig
+++ b/fs/autofs4/Kconfig
@@ -1,5 +1,7 @@
config AUTOFS4_FS
- tristate "Kernel automounter version 4 support (also supports v3)"
+ tristate "Kernel automounter version 4 support (also supports v3 and v5)"
+ default n
+ depends on AUTOFS_FS = n
help
The automounter is a tool to automatically mount remote file systems
on demand. This implementation is partially kernel-based to reduce
@@ -7,14 +9,38 @@ config AUTOFS4_FS
automounter (amd), which is a pure user space daemon.
To use the automounter you need the user-space tools from
- <https://www.kernel.org/pub/linux/daemons/autofs/v4/>; you also
- want to answer Y to "NFS file system support", below.
+ <https://www.kernel.org/pub/linux/daemons/autofs/>; you also want
+ to answer Y to "NFS file system support", below.
- To compile this support as a module, choose M here: the module will be
- called autofs4. You will need to add "alias autofs autofs4" to your
- modules configuration file.
+ This module is in the process of being renamed from autofs4 to
+ autofs. Since autofs is now the only module that provides the
+ autofs file system, the module is not version 4 specific.
- If you are not a part of a fairly large, distributed network or
- don't have a laptop which needs to dynamically reconfigure to the
- local network, you probably do not need an automounter, and can say
- N here.
+ The autofs4 module is now built from the source located in
+ fs/autofs. The autofs4 directory and its configuration entry
+ will be removed two kernel versions from the inclusion of this
+ change.
+
+ Changes that will need to be made should be limited to:
+ - source include statements should be changed from auto_fs4.h to
+ auto_fs.h since these two header files have been merged.
+ - user space scripts that manually load autofs4.ko should be
+ changed to load autofs.ko. However, since the module directory
+ name and the module name are the same as the file system name,
+ there is no need to load the module manually.
+ - any "alias autofs autofs4" will need to be removed.
+ - because the autofs4 module directory name was not the same as
+ its file system name, autoloading didn't work properly. Because
+ of this, kernel configurations would often build the module into
+ the kernel. This may have resulted in SELinux policies that
+ prevent the autofs module from autoloading; such policies will
+ need to be updated.
+
+ Please configure AUTOFS_FS instead of AUTOFS4_FS from now on.
+
+ NOTE: Since the modules autofs and autofs4 use the same file system
+ type name of "autofs", only one can be built. The "depends"
+ above will result in AUTOFS4_FS not appearing in .config for
+ any setting of AUTOFS_FS other than n. Otherwise, AUTOFS4_FS
+ will appear under the AUTOFS_FS entry, which is intended to
+ draw attention to the module rename change.
diff --git a/fs/autofs4/Makefile b/fs/autofs4/Makefile
index a811c1f7d9ab..417dd726d9ef 100644
--- a/fs/autofs4/Makefile
+++ b/fs/autofs4/Makefile
@@ -4,4 +4,6 @@
obj-$(CONFIG_AUTOFS4_FS) += autofs4.o
-autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o
+autofs4-objs := ../autofs/init.o ../autofs/inode.o ../autofs/root.o \
+ ../autofs/symlink.o ../autofs/waitq.o ../autofs/expire.o \
+ ../autofs/dev-ioctl.o
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index a41b48f82a70..4de191563261 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -387,8 +387,13 @@ static Node *create_entry(const char __user *buffer, size_t count)
s = strchr(p, del);
if (!s)
goto einval;
- *s++ = '\0';
- e->offset = simple_strtoul(p, &p, 10);
+ *s = '\0';
+ if (p != s) {
+ int r = kstrtoint(p, 10, &e->offset);
+ if (r != 0 || e->offset < 0)
+ goto einval;
+ }
+ p = s;
if (*p++)
goto einval;
pr_debug("register: offset: %#x\n", e->offset);
@@ -428,7 +433,8 @@ static Node *create_entry(const char __user *buffer, size_t count)
if (e->mask &&
string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size)
goto einval;
- if (e->size + e->offset > BINPRM_BUF_SIZE)
+ if (e->size > BINPRM_BUF_SIZE ||
+ BINPRM_BUF_SIZE - e->size < e->offset)
goto einval;
pr_debug("register: magic/mask length: %i\n", e->size);
if (USE_DEBUG) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index bef6934b6189..05e12aea2404 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -216,6 +216,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
bio.bi_write_hint = iocb->ki_hint;
bio.bi_private = current;
bio.bi_end_io = blkdev_bio_end_io_simple;
+ bio.bi_ioprio = iocb->ki_ioprio;
ret = bio_iov_iter_get_pages(&bio, iter);
if (unlikely(ret))
@@ -355,6 +356,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
bio->bi_write_hint = iocb->ki_hint;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
+ bio->bi_ioprio = iocb->ki_ioprio;
ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(ret)) {
diff --git a/fs/buffer.c b/fs/buffer.c
index 249b83fafe48..cabc045f483d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3427,120 +3427,6 @@ int bh_submit_read(struct buffer_head *bh)
}
EXPORT_SYMBOL(bh_submit_read);
-/*
- * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
- *
- * Returns the offset within the file on success, and -ENOENT otherwise.
- */
-static loff_t
-page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
-{
- loff_t offset = page_offset(page);
- struct buffer_head *bh, *head;
- bool seek_data = whence == SEEK_DATA;
-
- if (lastoff < offset)
- lastoff = offset;
-
- bh = head = page_buffers(page);
- do {
- offset += bh->b_size;
- if (lastoff >= offset)
- continue;
-
- /*
- * Unwritten extents that have data in the page cache covering
- * them can be identified by the BH_Unwritten state flag.
- * Pages with multiple buffers might have a mix of holes, data
- * and unwritten extents - any buffer with valid data in it
- * should have BH_Uptodate flag set on it.
- */
-
- if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data)
- return lastoff;
-
- lastoff = offset;
- } while ((bh = bh->b_this_page) != head);
- return -ENOENT;
-}
-
-/*
- * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
- *
- * Within unwritten extents, the page cache determines which parts are holes
- * and which are data: unwritten and uptodate buffer heads count as data;
- * everything else counts as a hole.
- *
- * Returns the resulting offset on successs, and -ENOENT otherwise.
- */
-loff_t
-page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
- int whence)
-{
- pgoff_t index = offset >> PAGE_SHIFT;
- pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
- loff_t lastoff = offset;
- struct pagevec pvec;
-
- if (length <= 0)
- return -ENOENT;
-
- pagevec_init(&pvec);
-
- do {
- unsigned nr_pages, i;
-
- nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
- end - 1);
- if (nr_pages == 0)
- break;
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
-
- /*
- * At this point, the page may be truncated or
- * invalidated (changing page->mapping to NULL), or
- * even swizzled back from swapper_space to tmpfs file
- * mapping. However, page->index will not change
- * because we have a reference on the page.
- *
- * If current page offset is beyond where we've ended,
- * we've found a hole.
- */
- if (whence == SEEK_HOLE &&
- lastoff < page_offset(page))
- goto check_range;
-
- lock_page(page);
- if (likely(page->mapping == inode->i_mapping) &&
- page_has_buffers(page)) {
- lastoff = page_seek_hole_data(page, lastoff, whence);
- if (lastoff >= 0) {
- unlock_page(page);
- goto check_range;
- }
- }
- unlock_page(page);
- lastoff = page_offset(page) + PAGE_SIZE;
- }
- pagevec_release(&pvec);
- } while (index < end);
-
- /* When no page at lastoff and we are not done, we found a hole. */
- if (whence != SEEK_HOLE)
- goto not_found;
-
-check_range:
- if (lastoff < offset + length)
- goto out;
-not_found:
- lastoff = -ENOENT;
-out:
- pagevec_release(&pvec);
- return lastoff;
-}
-
void __init buffer_init(void)
{
unsigned long nrpages;
diff --git a/fs/compat.c b/fs/compat.c
index 190b38b39d9e..4a0aaaf53217 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -16,79 +16,12 @@
*/
#include <linux/compat.h>
-#include <linux/ncp_mount.h>
#include <linux/nfs4_mount.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include "internal.h"
-struct compat_ncp_mount_data {
- compat_int_t version;
- compat_uint_t ncp_fd;
- __compat_uid_t mounted_uid;
- compat_pid_t wdog_pid;
- unsigned char mounted_vol[NCP_VOLNAME_LEN + 1];
- compat_uint_t time_out;
- compat_uint_t retry_count;
- compat_uint_t flags;
- __compat_uid_t uid;
- __compat_gid_t gid;
- compat_mode_t file_mode;
- compat_mode_t dir_mode;
-};
-
-struct compat_ncp_mount_data_v4 {
- compat_int_t version;
- compat_ulong_t flags;
- compat_ulong_t mounted_uid;
- compat_long_t wdog_pid;
- compat_uint_t ncp_fd;
- compat_uint_t time_out;
- compat_uint_t retry_count;
- compat_ulong_t uid;
- compat_ulong_t gid;
- compat_ulong_t file_mode;
- compat_ulong_t dir_mode;
-};
-
-static void *do_ncp_super_data_conv(void *raw_data)
-{
- int version = *(unsigned int *)raw_data;
-
- if (version == 3) {
- struct compat_ncp_mount_data *c_n = raw_data;
- struct ncp_mount_data *n = raw_data;
-
- n->dir_mode = c_n->dir_mode;
- n->file_mode = c_n->file_mode;
- n->gid = c_n->gid;
- n->uid = c_n->uid;
- memmove (n->mounted_vol, c_n->mounted_vol, (sizeof (c_n->mounted_vol) + 3 * sizeof (unsigned int)));
- n->wdog_pid = c_n->wdog_pid;
- n->mounted_uid = c_n->mounted_uid;
- } else if (version == 4) {
- struct compat_ncp_mount_data_v4 *c_n = raw_data;
- struct ncp_mount_data_v4 *n = raw_data;
-
- n->dir_mode = c_n->dir_mode;
- n->file_mode = c_n->file_mode;
- n->gid = c_n->gid;
- n->uid = c_n->uid;
- n->retry_count = c_n->retry_count;
- n->time_out = c_n->time_out;
- n->ncp_fd = c_n->ncp_fd;
- n->wdog_pid = c_n->wdog_pid;
- n->mounted_uid = c_n->mounted_uid;
- n->flags = c_n->flags;
- } else if (version != 5) {
- return NULL;
- }
-
- return raw_data;
-}
-
-
struct compat_nfs_string {
compat_uint_t len;
compat_uptr_t data;
@@ -154,7 +87,6 @@ static int do_nfs4_super_data_conv(void *raw_data)
return 0;
}
-#define NCPFS_NAME "ncpfs"
#define NFS4_NAME "nfs4"
COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
@@ -183,9 +115,7 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
goto out2;
if (kernel_type && options) {
- if (!strcmp(kernel_type, NCPFS_NAME)) {
- do_ncp_super_data_conv(options);
- } else if (!strcmp(kernel_type, NFS4_NAME)) {
+ if (!strcmp(kernel_type, NFS4_NAME)) {
retval = -EINVAL;
if (do_nfs4_super_data_conv(options))
goto out3;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index ef80085ed564..9907475b4226 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -38,8 +38,6 @@
#include <linux/ppp-ioctl.h>
#include <linux/if_pppox.h>
#include <linux/mtio.h>
-#include <linux/auto_fs.h>
-#include <linux/auto_fs4.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>
#include <linux/fb.h>
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index ce654526c0fb..243a269e6c5f 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -156,12 +156,8 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
}
req = skcipher_request_alloc(tfm, gfp_flags);
- if (!req) {
- printk_ratelimited(KERN_ERR
- "%s: crypto_request_alloc() failed\n",
- __func__);
+ if (!req)
return -ENOMEM;
- }
skcipher_request_set_callback(
req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
@@ -178,9 +174,10 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
skcipher_request_free(req);
if (res) {
- printk_ratelimited(KERN_ERR
- "%s: crypto_skcipher_encrypt() returned %d\n",
- __func__, res);
+ fscrypt_err(inode->i_sb,
+ "%scryption failed for inode %lu, block %llu: %d",
+ (rw == FS_DECRYPT ? "de" : "en"),
+ inode->i_ino, lblk_num, res);
return res;
}
return 0;
@@ -326,7 +323,6 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
return 0;
}
- /* this should eventually be an flag in d_flags */
spin_lock(&dentry->d_lock);
cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY;
spin_unlock(&dentry->d_lock);
@@ -353,7 +349,6 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
const struct dentry_operations fscrypt_d_ops = {
.d_revalidate = fscrypt_d_revalidate,
};
-EXPORT_SYMBOL(fscrypt_d_ops);
void fscrypt_restore_control_page(struct page *page)
{
@@ -422,13 +417,43 @@ fail:
return res;
}
+void fscrypt_msg(struct super_block *sb, const char *level,
+ const char *fmt, ...)
+{
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ struct va_format vaf;
+ va_list args;
+
+ if (!__ratelimit(&rs))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (sb)
+ printk("%sfscrypt (%s): %pV\n", level, sb->s_id, &vaf);
+ else
+ printk("%sfscrypt: %pV\n", level, &vaf);
+ va_end(args);
+}
+
/**
* fscrypt_init() - Set up for fs encryption.
*/
static int __init fscrypt_init(void)
{
+ /*
+ * Use an unbound workqueue to allow bios to be decrypted in parallel
+ * even when they happen to complete on the same CPU. This sacrifices
+ * locality, but it's worthwhile since decryption is CPU-intensive.
+ *
+ * Also use a high-priority workqueue to prioritize decryption work,
+ * which blocks reads from completing, over regular application tasks.
+ */
fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue",
- WQ_HIGHPRI, 0);
+ WQ_UNBOUND | WQ_HIGHPRI,
+ num_online_cpus());
if (!fscrypt_read_workqueue)
goto fail;
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index e33f3d3c5ade..d7a0f682ca12 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -59,11 +59,8 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname,
/* Set up the encryption request */
req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- printk_ratelimited(KERN_ERR
- "%s: skcipher_request_alloc() failed\n", __func__);
+ if (!req)
return -ENOMEM;
- }
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
crypto_req_done, &wait);
@@ -74,8 +71,9 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname,
res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
skcipher_request_free(req);
if (res < 0) {
- printk_ratelimited(KERN_ERR
- "%s: Error (error code %d)\n", __func__, res);
+ fscrypt_err(inode->i_sb,
+ "Filename encryption failed for inode %lu: %d",
+ inode->i_ino, res);
return res;
}
@@ -96,23 +94,14 @@ static int fname_decrypt(struct inode *inode,
struct skcipher_request *req = NULL;
DECLARE_CRYPTO_WAIT(wait);
struct scatterlist src_sg, dst_sg;
- struct fscrypt_info *ci = inode->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_ctfm;
+ struct crypto_skcipher *tfm = inode->i_crypt_info->ci_ctfm;
int res = 0;
char iv[FS_CRYPTO_BLOCK_SIZE];
- unsigned lim;
-
- lim = inode->i_sb->s_cop->max_namelen(inode);
- if (iname->len <= 0 || iname->len > lim)
- return -EIO;
/* Allocate request */
req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req) {
- printk_ratelimited(KERN_ERR
- "%s: crypto_request_alloc() failed\n", __func__);
+ if (!req)
return -ENOMEM;
- }
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
crypto_req_done, &wait);
@@ -127,8 +116,9 @@ static int fname_decrypt(struct inode *inode,
res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
skcipher_request_free(req);
if (res < 0) {
- printk_ratelimited(KERN_ERR
- "%s: Error (error code %d)\n", __func__, res);
+ fscrypt_err(inode->i_sb,
+ "Filename decryption failed for inode %lu: %d",
+ inode->i_ino, res);
return res;
}
@@ -341,12 +331,12 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
return 0;
}
ret = fscrypt_get_encryption_info(dir);
- if (ret && ret != -EOPNOTSUPP)
+ if (ret)
return ret;
if (dir->i_crypt_info) {
if (!fscrypt_fname_encrypted_size(dir, iname->len,
- dir->i_sb->s_cop->max_namelen(dir),
+ dir->i_sb->s_cop->max_namelen,
&fname->crypto_buf.len))
return -ENAMETOOLONG;
fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index ad6722bae8b7..37562394c5de 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -18,15 +18,7 @@
/* Encryption parameters */
#define FS_IV_SIZE 16
-#define FS_AES_128_ECB_KEY_SIZE 16
-#define FS_AES_128_CBC_KEY_SIZE 16
-#define FS_AES_128_CTS_KEY_SIZE 16
-#define FS_AES_256_GCM_KEY_SIZE 32
-#define FS_AES_256_CBC_KEY_SIZE 32
-#define FS_AES_256_CTS_KEY_SIZE 32
-#define FS_AES_256_XTS_KEY_SIZE 64
-
-#define FS_KEY_DERIVATION_NONCE_SIZE 16
+#define FS_KEY_DERIVATION_NONCE_SIZE 16
/**
* Encryption context for inode
@@ -91,6 +83,10 @@ static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS)
return true;
+ if (contents_mode == FS_ENCRYPTION_MODE_SPECK128_256_XTS &&
+ filenames_mode == FS_ENCRYPTION_MODE_SPECK128_256_CTS)
+ return true;
+
return false;
}
@@ -106,6 +102,15 @@ extern int fscrypt_do_page_crypto(const struct inode *inode,
gfp_t gfp_flags);
extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
gfp_t gfp_flags);
+extern const struct dentry_operations fscrypt_d_ops;
+
+extern void __printf(3, 4) __cold
+fscrypt_msg(struct super_block *sb, const char *level, const char *fmt, ...);
+
+#define fscrypt_warn(sb, fmt, ...) \
+ fscrypt_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define fscrypt_err(sb, fmt, ...) \
+ fscrypt_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__)
/* fname.c */
extern int fname_encrypt(struct inode *inode, const struct qstr *iname,
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index bec06490fb13..926e5df20ec3 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -39,8 +39,9 @@ int fscrypt_file_open(struct inode *inode, struct file *filp)
dir = dget_parent(file_dentry(filp));
if (IS_ENCRYPTED(d_inode(dir)) &&
!fscrypt_has_permitted_context(d_inode(dir), inode)) {
- pr_warn_ratelimited("fscrypt: inconsistent encryption contexts: %lu/%lu",
- d_inode(dir)->i_ino, inode->i_ino);
+ fscrypt_warn(inode->i_sb,
+ "inconsistent encryption contexts: %lu/%lu",
+ d_inode(dir)->i_ino, inode->i_ino);
err = -EPERM;
}
dput(dir);
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 05f5ee1f0705..e997ca51192f 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -19,17 +19,16 @@
static struct crypto_shash *essiv_hash_tfm;
-/**
- * derive_key_aes() - Derive a key using AES-128-ECB
- * @deriving_key: Encryption key used for derivation.
- * @source_key: Source key to which to apply derivation.
- * @derived_raw_key: Derived raw key.
+/*
+ * Key derivation function. This generates the derived key by encrypting the
+ * master key with AES-128-ECB using the inode's nonce as the AES key.
*
- * Return: Zero on success; non-zero otherwise.
+ * The master key must be at least as long as the derived key. If the master
+ * key is longer, then only the first 'derived_keysize' bytes are used.
*/
-static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
- const struct fscrypt_key *source_key,
- u8 derived_raw_key[FS_MAX_KEY_SIZE])
+static int derive_key_aes(const u8 *master_key,
+ const struct fscrypt_context *ctx,
+ u8 *derived_key, unsigned int derived_keysize)
{
int res = 0;
struct skcipher_request *req = NULL;
@@ -51,14 +50,13 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
skcipher_request_set_callback(req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
crypto_req_done, &wait);
- res = crypto_skcipher_setkey(tfm, deriving_key,
- FS_AES_128_ECB_KEY_SIZE);
+ res = crypto_skcipher_setkey(tfm, ctx->nonce, sizeof(ctx->nonce));
if (res < 0)
goto out;
- sg_init_one(&src_sg, source_key->raw, source_key->size);
- sg_init_one(&dst_sg, derived_raw_key, source_key->size);
- skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size,
+ sg_init_one(&src_sg, master_key, derived_keysize);
+ sg_init_one(&dst_sg, derived_key, derived_keysize);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize,
NULL);
res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
out:
@@ -67,101 +65,147 @@ out:
return res;
}
-static int validate_user_key(struct fscrypt_info *crypt_info,
- struct fscrypt_context *ctx, u8 *raw_key,
- const char *prefix, int min_keysize)
+/*
+ * Search the current task's subscribed keyrings for a "logon" key with
+ * description prefix:descriptor, and if found acquire a read lock on it and
+ * return a pointer to its validated payload in *payload_ret.
+ */
+static struct key *
+find_and_lock_process_key(const char *prefix,
+ const u8 descriptor[FS_KEY_DESCRIPTOR_SIZE],
+ unsigned int min_keysize,
+ const struct fscrypt_key **payload_ret)
{
char *description;
- struct key *keyring_key;
- struct fscrypt_key *master_key;
+ struct key *key;
const struct user_key_payload *ukp;
- int res;
+ const struct fscrypt_key *payload;
description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
- FS_KEY_DESCRIPTOR_SIZE,
- ctx->master_key_descriptor);
+ FS_KEY_DESCRIPTOR_SIZE, descriptor);
if (!description)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- keyring_key = request_key(&key_type_logon, description, NULL);
+ key = request_key(&key_type_logon, description, NULL);
kfree(description);
- if (IS_ERR(keyring_key))
- return PTR_ERR(keyring_key);
- down_read(&keyring_key->sem);
-
- if (keyring_key->type != &key_type_logon) {
- printk_once(KERN_WARNING
- "%s: key type must be logon\n", __func__);
- res = -ENOKEY;
- goto out;
- }
- ukp = user_key_payload_locked(keyring_key);
- if (!ukp) {
- /* key was revoked before we acquired its semaphore */
- res = -EKEYREVOKED;
- goto out;
+ if (IS_ERR(key))
+ return key;
+
+ down_read(&key->sem);
+ ukp = user_key_payload_locked(key);
+
+ if (!ukp) /* was the key revoked before we acquired its semaphore? */
+ goto invalid;
+
+ payload = (const struct fscrypt_key *)ukp->data;
+
+ if (ukp->datalen != sizeof(struct fscrypt_key) ||
+ payload->size < 1 || payload->size > FS_MAX_KEY_SIZE) {
+ fscrypt_warn(NULL,
+ "key with description '%s' has invalid payload",
+ key->description);
+ goto invalid;
}
- if (ukp->datalen != sizeof(struct fscrypt_key)) {
- res = -EINVAL;
- goto out;
+
+ if (payload->size < min_keysize) {
+ fscrypt_warn(NULL,
+ "key with description '%s' is too short (got %u bytes, need %u+ bytes)",
+ key->description, payload->size, min_keysize);
+ goto invalid;
}
- master_key = (struct fscrypt_key *)ukp->data;
- BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
-
- if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE
- || master_key->size % AES_BLOCK_SIZE != 0) {
- printk_once(KERN_WARNING
- "%s: key size incorrect: %d\n",
- __func__, master_key->size);
- res = -ENOKEY;
- goto out;
+
+ *payload_ret = payload;
+ return key;
+
+invalid:
+ up_read(&key->sem);
+ key_put(key);
+ return ERR_PTR(-ENOKEY);
+}
+
+/* Find the master key, then derive the inode's actual encryption key */
+static int find_and_derive_key(const struct inode *inode,
+ const struct fscrypt_context *ctx,
+ u8 *derived_key, unsigned int derived_keysize)
+{
+ struct key *key;
+ const struct fscrypt_key *payload;
+ int err;
+
+ key = find_and_lock_process_key(FS_KEY_DESC_PREFIX,
+ ctx->master_key_descriptor,
+ derived_keysize, &payload);
+ if (key == ERR_PTR(-ENOKEY) && inode->i_sb->s_cop->key_prefix) {
+ key = find_and_lock_process_key(inode->i_sb->s_cop->key_prefix,
+ ctx->master_key_descriptor,
+ derived_keysize, &payload);
}
- res = derive_key_aes(ctx->nonce, master_key, raw_key);
-out:
- up_read(&keyring_key->sem);
- key_put(keyring_key);
- return res;
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+ err = derive_key_aes(payload->raw, ctx, derived_key, derived_keysize);
+ up_read(&key->sem);
+ key_put(key);
+ return err;
}
-static const struct {
+static struct fscrypt_mode {
+ const char *friendly_name;
const char *cipher_str;
int keysize;
+ bool logged_impl_name;
} available_modes[] = {
- [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)",
- FS_AES_256_XTS_KEY_SIZE },
- [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))",
- FS_AES_256_CTS_KEY_SIZE },
- [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)",
- FS_AES_128_CBC_KEY_SIZE },
- [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))",
- FS_AES_128_CTS_KEY_SIZE },
+ [FS_ENCRYPTION_MODE_AES_256_XTS] = {
+ .friendly_name = "AES-256-XTS",
+ .cipher_str = "xts(aes)",
+ .keysize = 64,
+ },
+ [FS_ENCRYPTION_MODE_AES_256_CTS] = {
+ .friendly_name = "AES-256-CTS-CBC",
+ .cipher_str = "cts(cbc(aes))",
+ .keysize = 32,
+ },
+ [FS_ENCRYPTION_MODE_AES_128_CBC] = {
+ .friendly_name = "AES-128-CBC",
+ .cipher_str = "cbc(aes)",
+ .keysize = 16,
+ },
+ [FS_ENCRYPTION_MODE_AES_128_CTS] = {
+ .friendly_name = "AES-128-CTS-CBC",
+ .cipher_str = "cts(cbc(aes))",
+ .keysize = 16,
+ },
+ [FS_ENCRYPTION_MODE_SPECK128_256_XTS] = {
+ .friendly_name = "Speck128/256-XTS",
+ .cipher_str = "xts(speck128)",
+ .keysize = 64,
+ },
+ [FS_ENCRYPTION_MODE_SPECK128_256_CTS] = {
+ .friendly_name = "Speck128/256-CTS-CBC",
+ .cipher_str = "cts(cbc(speck128))",
+ .keysize = 32,
+ },
};
-static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode,
- const char **cipher_str_ret, int *keysize_ret)
+static struct fscrypt_mode *
+select_encryption_mode(const struct fscrypt_info *ci, const struct inode *inode)
{
- u32 mode;
-
if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) {
- pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n",
- inode->i_ino,
- ci->ci_data_mode, ci->ci_filename_mode);
- return -EINVAL;
+ fscrypt_warn(inode->i_sb,
+ "inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)",
+ inode->i_ino, ci->ci_data_mode,
+ ci->ci_filename_mode);
+ return ERR_PTR(-EINVAL);
}
- if (S_ISREG(inode->i_mode)) {
- mode = ci->ci_data_mode;
- } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) {
- mode = ci->ci_filename_mode;
- } else {
- WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n",
- inode->i_ino, (inode->i_mode & S_IFMT));
- return -EINVAL;
- }
+ if (S_ISREG(inode->i_mode))
+ return &available_modes[ci->ci_data_mode];
+
+ if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+ return &available_modes[ci->ci_filename_mode];
- *cipher_str_ret = available_modes[mode].cipher_str;
- *keysize_ret = available_modes[mode].keysize;
- return 0;
+ WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n",
+ inode->i_ino, (inode->i_mode & S_IFMT));
+ return ERR_PTR(-EINVAL);
}
static void put_crypt_info(struct fscrypt_info *ci)
@@ -184,8 +228,9 @@ static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt)
tfm = crypto_alloc_shash("sha256", 0, 0);
if (IS_ERR(tfm)) {
- pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n",
- PTR_ERR(tfm));
+ fscrypt_warn(NULL,
+ "error allocating SHA-256 transform: %ld",
+ PTR_ERR(tfm));
return PTR_ERR(tfm);
}
prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm);
@@ -245,8 +290,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
struct fscrypt_info *crypt_info;
struct fscrypt_context ctx;
struct crypto_skcipher *ctfm;
- const char *cipher_str;
- int keysize;
+ struct fscrypt_mode *mode;
u8 *raw_key = NULL;
int res;
@@ -290,57 +334,59 @@ int fscrypt_get_encryption_info(struct inode *inode)
memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
sizeof(crypt_info->ci_master_key));
- res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize);
- if (res)
+ mode = select_encryption_mode(crypt_info, inode);
+ if (IS_ERR(mode)) {
+ res = PTR_ERR(mode);
goto out;
+ }
/*
* This cannot be a stack buffer because it is passed to the scatterlist
* crypto API as part of key derivation.
*/
res = -ENOMEM;
- raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS);
+ raw_key = kmalloc(mode->keysize, GFP_NOFS);
if (!raw_key)
goto out;
- res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX,
- keysize);
- if (res && inode->i_sb->s_cop->key_prefix) {
- int res2 = validate_user_key(crypt_info, &ctx, raw_key,
- inode->i_sb->s_cop->key_prefix,
- keysize);
- if (res2) {
- if (res2 == -ENOKEY)
- res = -ENOKEY;
- goto out;
- }
- } else if (res) {
+ res = find_and_derive_key(inode, &ctx, raw_key, mode->keysize);
+ if (res)
goto out;
- }
- ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
- if (!ctfm || IS_ERR(ctfm)) {
- res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
- pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n",
- __func__, res, inode->i_ino);
+
+ ctfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
+ if (IS_ERR(ctfm)) {
+ res = PTR_ERR(ctfm);
+ fscrypt_warn(inode->i_sb,
+ "error allocating '%s' transform for inode %lu: %d",
+ mode->cipher_str, inode->i_ino, res);
goto out;
}
+ if (unlikely(!mode->logged_impl_name)) {
+ /*
+ * fscrypt performance can vary greatly depending on which
+ * crypto algorithm implementation is used. Help people debug
+ * performance problems by logging the ->cra_driver_name the
+ * first time a mode is used. Note that multiple threads can
+ * race here, but it doesn't really matter.
+ */
+ mode->logged_impl_name = true;
+ pr_info("fscrypt: %s using implementation \"%s\"\n",
+ mode->friendly_name,
+ crypto_skcipher_alg(ctfm)->base.cra_driver_name);
+ }
crypt_info->ci_ctfm = ctfm;
- crypto_skcipher_clear_flags(ctfm, ~0);
crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY);
- /*
- * if the provided key is longer than keysize, we use the first
- * keysize bytes of the derived key only
- */
- res = crypto_skcipher_setkey(ctfm, raw_key, keysize);
+ res = crypto_skcipher_setkey(ctfm, raw_key, mode->keysize);
if (res)
goto out;
if (S_ISREG(inode->i_mode) &&
crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) {
- res = init_essiv_generator(crypt_info, raw_key, keysize);
+ res = init_essiv_generator(crypt_info, raw_key, mode->keysize);
if (res) {
- pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n",
- __func__, res, inode->i_ino);
+ fscrypt_warn(inode->i_sb,
+ "error initializing ESSIV generator for inode %lu: %d",
+ inode->i_ino, res);
goto out;
}
}
diff --git a/fs/dax.c b/fs/dax.c
index aa86d9f971a4..641192808bb6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -351,6 +351,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
}
}
+static struct page *dax_busy_page(void *entry)
+{
+ unsigned long pfn;
+
+ for_each_mapped_pfn(entry, pfn) {
+ struct page *page = pfn_to_page(pfn);
+
+ if (page_ref_count(page) > 1)
+ return page;
+ }
+ return NULL;
+}
+
/*
* Find radix tree entry at given index. If it points to an exceptional entry,
* return it with the radix tree entry locked. If the radix tree doesn't
@@ -492,6 +505,90 @@ restart:
return entry;
}
+/**
+ * dax_layout_busy_page - find first pinned page in @mapping
+ * @mapping: address space to scan for a page with ref count > 1
+ *
+ * DAX requires ZONE_DEVICE mapped pages. These pages are never
+ * 'onlined' to the page allocator so they are considered idle when
+ * page->count == 1. A filesystem uses this interface to determine if
+ * any page in the mapping is busy, i.e. for DMA, or other
+ * get_user_pages() usages.
+ *
+ * It is expected that the filesystem is holding locks to block the
+ * establishment of new mappings in this address_space. I.e. it expects
+ * to be able to run unmap_mapping_range() and subsequently not race
+ * mapping_mapped() becoming true.
+ */
+struct page *dax_layout_busy_page(struct address_space *mapping)
+{
+ pgoff_t indices[PAGEVEC_SIZE];
+ struct page *page = NULL;
+ struct pagevec pvec;
+ pgoff_t index, end;
+ unsigned i;
+
+ /*
+ * In the 'limited' case get_user_pages() for dax is disabled.
+ */
+ if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+ return NULL;
+
+ if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+ return NULL;
+
+ pagevec_init(&pvec);
+ index = 0;
+ end = -1;
+
+ /*
+ * If we race get_user_pages_fast() here either we'll see the
+ * elevated page count in the pagevec_lookup and wait, or
+ * get_user_pages_fast() will see that the page it took a reference
+ * against is no longer mapped in the page tables and bail to the
+ * get_user_pages() slow path. The slow path is protected by
+ * pte_lock() and pmd_lock(). New references are not taken without
+ * holding those locks, and unmap_mapping_range() will not zero the
+ * pte or pmd without holding the respective lock, so we are
+ * guaranteed to either see new references or prevent new
+ * references from being established.
+ */
+ unmap_mapping_range(mapping, 0, 0, 1);
+
+ while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE),
+ indices)) {
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *pvec_ent = pvec.pages[i];
+ void *entry;
+
+ index = indices[i];
+ if (index >= end)
+ break;
+
+ if (!radix_tree_exceptional_entry(pvec_ent))
+ continue;
+
+ xa_lock_irq(&mapping->i_pages);
+ entry = get_unlocked_mapping_entry(mapping, index, NULL);
+ if (entry)
+ page = dax_busy_page(entry);
+ put_unlocked_mapping_entry(mapping, index, entry);
+ xa_unlock_irq(&mapping->i_pages);
+ if (page)
+ break;
+ }
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
+ index++;
+
+ if (page)
+ break;
+ }
+ return page;
+}
+EXPORT_SYMBOL_GPL(dax_layout_busy_page);
+
static int __dax_invalidate_mapping_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
@@ -905,14 +1002,13 @@ out:
* If this page is ever written to we will re-fault and change the mapping to
* point to real DAX storage instead.
*/
-static int dax_load_hole(struct address_space *mapping, void *entry,
+static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
struct vm_fault *vmf)
{
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
- int ret = VM_FAULT_NOPAGE;
+ vm_fault_t ret = VM_FAULT_NOPAGE;
struct page *zero_page;
- void *entry2;
pfn_t pfn;
zero_page = ZERO_PAGE(0);
@@ -922,14 +1018,9 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
}
pfn = page_to_pfn_t(zero_page);
- entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
- RADIX_DAX_ZERO_PAGE, false);
- if (IS_ERR(entry2)) {
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
-
- vm_insert_mixed(vmf->vma, vaddr, pfn);
+ dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
+ false);
+ ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
out:
trace_dax_load_hole(inode, vmf, ret);
return ret;
@@ -991,6 +1082,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iov_iter *iter = data;
loff_t end = pos + length, done = 0;
ssize_t ret = 0;
+ size_t xfer;
int id;
if (iov_iter_rw(iter) == READ) {
@@ -1054,18 +1146,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
* vfs_write(), depending on which operation we are doing.
*/
if (iov_iter_rw(iter) == WRITE)
- map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
+ xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else
- map_len = copy_to_iter(kaddr, map_len, iter);
- if (map_len <= 0) {
- ret = map_len ? map_len : -EFAULT;
- break;
- }
+ xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
+ map_len, iter);
+
+ pos += xfer;
+ length -= xfer;
+ done += xfer;
- pos += map_len;
- length -= map_len;
- done += map_len;
+ if (xfer == 0)
+ ret = -EFAULT;
+ if (xfer < map_len)
+ break;
}
dax_read_unlock(id);
@@ -1112,7 +1206,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);
-static int dax_fault_return(int error)
+static vm_fault_t dax_fault_return(int error)
{
if (error == 0)
return VM_FAULT_NOPAGE;
@@ -1132,7 +1226,7 @@ static bool dax_fault_is_synchronous(unsigned long flags,
&& (iomap->flags & IOMAP_F_DIRTY);
}
-static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
int *iomap_errp, const struct iomap_ops *ops)
{
struct vm_area_struct *vma = vmf->vma;
@@ -1145,18 +1239,18 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
int error, major = 0;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync;
- int vmf_ret = 0;
+ vm_fault_t ret = 0;
void *entry;
pfn_t pfn;
- trace_dax_pte_fault(inode, vmf, vmf_ret);
+ trace_dax_pte_fault(inode, vmf, ret);
/*
* Check whether offset isn't beyond end of file now. Caller is supposed
* to hold locks serializing us with truncate / punch hole so this is
* a reliable test.
*/
if (pos >= i_size_read(inode)) {
- vmf_ret = VM_FAULT_SIGBUS;
+ ret = VM_FAULT_SIGBUS;
goto out;
}
@@ -1165,7 +1259,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
if (IS_ERR(entry)) {
- vmf_ret = dax_fault_return(PTR_ERR(entry));
+ ret = dax_fault_return(PTR_ERR(entry));
goto out;
}
@@ -1176,7 +1270,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
* retried.
*/
if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
- vmf_ret = VM_FAULT_NOPAGE;
+ ret = VM_FAULT_NOPAGE;
goto unlock_entry;
}
@@ -1189,7 +1283,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
if (iomap_errp)
*iomap_errp = error;
if (error) {
- vmf_ret = dax_fault_return(error);
+ ret = dax_fault_return(error);
goto unlock_entry;
}
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
@@ -1219,9 +1313,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
goto error_finish_iomap;
__SetPageUptodate(vmf->cow_page);
- vmf_ret = finish_fault(vmf);
- if (!vmf_ret)
- vmf_ret = VM_FAULT_DONE_COW;
+ ret = finish_fault(vmf);
+ if (!ret)
+ ret = VM_FAULT_DONE_COW;
goto finish_iomap;
}
@@ -1240,10 +1334,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
0, write && !sync);
- if (IS_ERR(entry)) {
- error = PTR_ERR(entry);
- goto error_finish_iomap;
- }
/*
* If we are doing synchronous page fault and inode needs fsync,
@@ -1257,23 +1347,20 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
goto error_finish_iomap;
}
*pfnp = pfn;
- vmf_ret = VM_FAULT_NEEDDSYNC | major;
+ ret = VM_FAULT_NEEDDSYNC | major;
goto finish_iomap;
}
trace_dax_insert_mapping(inode, vmf, entry);
if (write)
- error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
+ ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn);
else
- error = vm_insert_mixed(vma, vaddr, pfn);
+ ret = vmf_insert_mixed(vma, vaddr, pfn);
- /* -EBUSY is fine, somebody else faulted on the same PTE */
- if (error == -EBUSY)
- error = 0;
- break;
+ goto finish_iomap;
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (!write) {
- vmf_ret = dax_load_hole(mapping, entry, vmf);
+ ret = dax_load_hole(mapping, entry, vmf);
goto finish_iomap;
}
/*FALLTHRU*/
@@ -1284,12 +1371,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
}
error_finish_iomap:
- vmf_ret = dax_fault_return(error) | major;
+ ret = dax_fault_return(error);
finish_iomap:
if (ops->iomap_end) {
int copied = PAGE_SIZE;
- if (vmf_ret & VM_FAULT_ERROR)
+ if (ret & VM_FAULT_ERROR)
copied = 0;
/*
* The fault is done by now and there's no way back (other
@@ -1302,12 +1389,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
unlock_entry:
put_locked_mapping_entry(mapping, vmf->pgoff);
out:
- trace_dax_pte_fault_done(inode, vmf, vmf_ret);
- return vmf_ret;
+ trace_dax_pte_fault_done(inode, vmf, ret);
+ return ret | major;
}
#ifdef CONFIG_FS_DAX_PMD
-static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
+static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
void *entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -1327,8 +1414,6 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
pfn = page_to_pfn_t(zero_page);
ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
- if (IS_ERR(ret))
- goto fallback;
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
@@ -1348,7 +1433,7 @@ fallback:
return VM_FAULT_FALLBACK;
}
-static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
struct vm_area_struct *vma = vmf->vma;
@@ -1358,7 +1443,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
bool sync;
unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
struct inode *inode = mapping->host;
- int result = VM_FAULT_FALLBACK;
+ vm_fault_t result = VM_FAULT_FALLBACK;
struct iomap iomap = { 0 };
pgoff_t max_pgoff, pgoff;
void *entry;
@@ -1450,8 +1535,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_PMD, write && !sync);
- if (IS_ERR(entry))
- goto finish_iomap;
/*
* If we are doing synchronous page fault and inode needs fsync,
@@ -1509,7 +1592,7 @@ out:
return result;
}
#else
-static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops)
{
return VM_FAULT_FALLBACK;
@@ -1529,7 +1612,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* has done all the necessary locking for page fault to proceed
* successfully.
*/
-int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{
switch (pe_size) {
@@ -1553,14 +1636,14 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
* DAX file. It takes care of marking corresponding radix tree entry as dirty
* as well.
*/
-static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
enum page_entry_size pe_size,
pfn_t pfn)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
void *entry, **slot;
pgoff_t index = vmf->pgoff;
- int vmf_ret, error;
+ vm_fault_t ret;
xa_lock_irq(&mapping->i_pages);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
@@ -1579,21 +1662,20 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
xa_unlock_irq(&mapping->i_pages);
switch (pe_size) {
case PE_SIZE_PTE:
- error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
- vmf_ret = dax_fault_return(error);
+ ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
break;
#ifdef CONFIG_FS_DAX_PMD
case PE_SIZE_PMD:
- vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+ ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
pfn, true);
break;
#endif
default:
- vmf_ret = VM_FAULT_FALLBACK;
+ ret = VM_FAULT_FALLBACK;
}
put_locked_mapping_entry(mapping, index);
- trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
- return vmf_ret;
+ trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
+ return ret;
}
/**
@@ -1606,8 +1688,8 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
* stored persistently on the media and handles inserting of appropriate page
* table entry.
*/
-int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
- pfn_t pfn)
+vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size, pfn_t pfn)
{
int err;
loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 1f99678ff5d3..4fce1da7db23 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -796,19 +796,13 @@ EXPORT_SYMBOL_GPL(debugfs_read_file_bool);
ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
{
- char buf[32];
- size_t buf_size;
bool bv;
int r;
bool *val = file->private_data;
struct dentry *dentry = F_DENTRY(file);
- buf_size = min(count, (sizeof(buf)-1));
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
- if (strtobool(buf, &bv) == 0) {
+ r = kstrtobool_from_user(user_buf, count, &bv);
+ if (!r) {
r = debugfs_file_get(dentry);
if (unlikely(r))
return r;
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 13b01351dd1c..a913b12fc7f8 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -512,7 +512,9 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
if (unlikely(!inode))
return failed_creating(dentry);
- inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ if (!parent)
+ parent = debugfs_mount->mnt_root;
+ inode->i_mode = S_IFDIR | ((d_inode(parent)->i_mode & 0770));
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
diff --git a/fs/exec.c b/fs/exec.c
index 2c3911612b22..2d4e0075bd24 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1706,14 +1706,13 @@ static int exec_binprm(struct linux_binprm *bprm)
/*
* sys_execve() executes a new program.
*/
-static int do_execveat_common(int fd, struct filename *filename,
- struct user_arg_ptr argv,
- struct user_arg_ptr envp,
- int flags)
+static int __do_execve_file(int fd, struct filename *filename,
+ struct user_arg_ptr argv,
+ struct user_arg_ptr envp,
+ int flags, struct file *file)
{
char *pathbuf = NULL;
struct linux_binprm *bprm;
- struct file *file;
struct files_struct *displaced;
int retval;
@@ -1752,7 +1751,8 @@ static int do_execveat_common(int fd, struct filename *filename,
check_unsafe_exec(bprm);
current->in_execve = 1;
- file = do_open_execat(fd, filename, flags);
+ if (!file)
+ file = do_open_execat(fd, filename, flags);
retval = PTR_ERR(file);
if (IS_ERR(file))
goto out_unmark;
@@ -1760,7 +1760,9 @@ static int do_execveat_common(int fd, struct filename *filename,
sched_exec();
bprm->file = file;
- if (fd == AT_FDCWD || filename->name[0] == '/') {
+ if (!filename) {
+ bprm->filename = "none";
+ } else if (fd == AT_FDCWD || filename->name[0] == '/') {
bprm->filename = filename->name;
} else {
if (filename->name[0] == '\0')
@@ -1827,7 +1829,8 @@ static int do_execveat_common(int fd, struct filename *filename,
task_numa_free(current);
free_bprm(bprm);
kfree(pathbuf);
- putname(filename);
+ if (filename)
+ putname(filename);
if (displaced)
put_files_struct(displaced);
return retval;
@@ -1850,10 +1853,27 @@ out_files:
if (displaced)
reset_files_struct(displaced);
out_ret:
- putname(filename);
+ if (filename)
+ putname(filename);
return retval;
}
+static int do_execveat_common(int fd, struct filename *filename,
+ struct user_arg_ptr argv,
+ struct user_arg_ptr envp,
+ int flags)
+{
+ return __do_execve_file(fd, filename, argv, envp, flags, NULL);
+}
+
+int do_execve_file(struct file *file, void *__argv, void *__envp)
+{
+ struct user_arg_ptr argv = { .ptr.native = __argv };
+ struct user_arg_ptr envp = { .ptr.native = __envp };
+
+ return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file);
+}
+
int do_execve(struct filename *filename,
const char __user *const __user *__argv,
const char __user *const __user *__envp)
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index de1694512f1f..c09289a42dc5 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -961,8 +961,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
- err = bdev_dax_supported(sb, blocksize);
- if (err) {
+ if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
ext2_msg(sb, KERN_ERR,
"DAX unsupported by block device. Turning off DAX.");
sbi->s_mount_opt &= ~EXT2_MOUNT_DAX;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 508b905d744d..b00481c475cb 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -185,25 +185,15 @@ static int ext4_init_block_bitmap(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t start, tmp;
int flex_bg = 0;
- struct ext4_group_info *grp;
J_ASSERT_BH(bh, buffer_locked(bh));
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
- grp = ext4_get_group_info(sb, block_group);
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
- if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
- int count;
- count = ext4_free_inodes_count(sb, gdp);
- percpu_counter_sub(&sbi->s_freeinodes_counter,
- count);
- }
- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT |
+ EXT4_GROUP_INFO_IBITMAP_CORRUPT);
return -EFSBADCRC;
}
memset(bh->b_data, 0, sb->s_blocksize);
@@ -375,7 +365,6 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
{
ext4_fsblk_t blk;
struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
- struct ext4_sb_info *sbi = EXT4_SB(sb);
if (buffer_verified(bh))
return 0;
@@ -387,10 +376,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
desc, bh))) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return -EFSBADCRC;
}
blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
@@ -398,10 +385,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
block_group, blk);
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return -EFSCORRUPTED;
}
set_buffer_verified(bh);
@@ -436,6 +421,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
(bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
ext4_error(sb, "Invalid block bitmap block %llu in "
"block_group %u", bitmap_blk, block_group);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return ERR_PTR(-EFSCORRUPTED);
}
bh = sb_getblk(sb, bitmap_blk);
@@ -514,6 +501,8 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
ext4_error(sb, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, (unsigned long long) bh->b_blocknr);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return -EIO;
}
clear_buffer_new(bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 229ea4da6785..df95412915ea 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2530,6 +2530,9 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
extern const char *ext4_decode_error(struct super_block *sb, int errno,
char nbuf[16]);
+extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
+ ext4_group_t block_group,
+ unsigned int flags);
extern __printf(4, 5)
void __ext4_error(struct super_block *, const char *, unsigned int,
@@ -2857,6 +2860,10 @@ struct ext4_group_info {
#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1
#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3
+#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \
+ (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
+#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \
+ (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
#define EXT4_MB_GRP_NEED_INIT(grp) \
(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 763ef185dd17..c4e6fb15101b 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -162,8 +162,7 @@ int __init ext4_init_es(void)
void ext4_exit_es(void)
{
- if (ext4_es_cachep)
- kmem_cache_destroy(ext4_es_cachep);
+ kmem_cache_destroy(ext4_es_cachep);
}
void ext4_es_init_tree(struct ext4_es_tree *tree)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index fb6f023622fe..7f8023340eb8 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -277,10 +277,11 @@ out:
}
#ifdef CONFIG_FS_DAX
-static int ext4_dax_huge_fault(struct vm_fault *vmf,
+static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
enum page_entry_size pe_size)
{
- int result, error = 0;
+ int error = 0;
+ vm_fault_t result;
int retries = 0;
handle_t *handle = NULL;
struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -335,7 +336,7 @@ retry:
return result;
}
-static int ext4_dax_fault(struct vm_fault *vmf)
+static vm_fault_t ext4_dax_fault(struct vm_fault *vmf)
{
return ext4_dax_huge_fault(vmf, PE_SIZE_PTE);
}
@@ -380,50 +381,64 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static int ext4_file_open(struct inode * inode, struct file * filp)
+static int ext4_sample_last_mounted(struct super_block *sb,
+ struct vfsmount *mnt)
{
- struct super_block *sb = inode->i_sb;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct vfsmount *mnt = filp->f_path.mnt;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct path path;
char buf[64], *cp;
+ handle_t *handle;
+ int err;
+
+ if (likely(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED))
+ return 0;
+
+ if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
+ return 0;
+
+ sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
+ /*
+ * Sample where the filesystem has been mounted and
+ * store it in the superblock for sysadmin convenience
+ * when trying to sort through large numbers of block
+ * devices or filesystem images.
+ */
+ memset(buf, 0, sizeof(buf));
+ path.mnt = mnt;
+ path.dentry = mnt->mnt_root;
+ cp = d_path(&path, buf, sizeof(buf));
+ err = 0;
+ if (IS_ERR(cp))
+ goto out;
+
+ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+ err = PTR_ERR(handle);
+ if (IS_ERR(handle))
+ goto out;
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
+ if (err)
+ goto out_journal;
+ strlcpy(sbi->s_es->s_last_mounted, cp,
+ sizeof(sbi->s_es->s_last_mounted));
+ ext4_handle_dirty_super(handle, sb);
+out_journal:
+ ext4_journal_stop(handle);
+out:
+ sb_end_intwrite(sb);
+ return err;
+}
+
+static int ext4_file_open(struct inode * inode, struct file * filp)
+{
int ret;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
- if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
- !sb_rdonly(sb))) {
- sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
- /*
- * Sample where the filesystem has been mounted and
- * store it in the superblock for sysadmin convenience
- * when trying to sort through large numbers of block
- * devices or filesystem images.
- */
- memset(buf, 0, sizeof(buf));
- path.mnt = mnt;
- path.dentry = mnt->mnt_root;
- cp = d_path(&path, buf, sizeof(buf));
- if (!IS_ERR(cp)) {
- handle_t *handle;
- int err;
-
- handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- BUFFER_TRACE(sbi->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err) {
- ext4_journal_stop(handle);
- return err;
- }
- strlcpy(sbi->s_es->s_last_mounted, cp,
- sizeof(sbi->s_es->s_last_mounted));
- ext4_handle_dirty_super(handle, sb);
- ext4_journal_stop(handle);
- }
- }
+ ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
+ if (ret)
+ return ret;
ret = fscrypt_file_open(inode, filp);
if (ret)
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index e871c4bf18e9..4b99e2db95b8 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -402,8 +402,8 @@ static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list)
}
/* Find all the fixed metadata in the filesystem. */
-int ext4_getfsmap_find_fixed_metadata(struct super_block *sb,
- struct list_head *meta_list)
+static int ext4_getfsmap_find_fixed_metadata(struct super_block *sb,
+ struct list_head *meta_list)
{
struct ext4_group_desc *gdp;
ext4_group_t agno;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index df92e3ec9913..4d6e007f3569 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -83,7 +83,6 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
{
ext4_fsblk_t blk;
struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
- struct ext4_sb_info *sbi = EXT4_SB(sb);
if (buffer_verified(bh))
return 0;
@@ -97,14 +96,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
ext4_unlock_group(sb, block_group);
ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
"inode_bitmap = %llu", block_group, blk);
- grp = ext4_get_group_info(sb, block_group);
- if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
- int count;
- count = ext4_free_inodes_count(sb, desc);
- percpu_counter_sub(&sbi->s_freeinodes_counter,
- count);
- }
- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_IBITMAP_CORRUPT);
return -EFSBADCRC;
}
set_buffer_verified(bh);
@@ -136,6 +129,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
(bitmap_blk >= ext4_blocks_count(sbi->s_es))) {
ext4_error(sb, "Invalid inode bitmap blk %llu in "
"block_group %u", bitmap_blk, block_group);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_IBITMAP_CORRUPT);
return ERR_PTR(-EFSCORRUPTED);
}
bh = sb_getblk(sb, bitmap_blk);
@@ -143,7 +138,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_error(sb, "Cannot read inode bitmap - "
"block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-ENOMEM);
}
if (bitmap_uptodate(bh))
goto verify;
@@ -190,6 +185,8 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_error(sb, "Cannot read inode bitmap - "
"block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_IBITMAP_CORRUPT);
return ERR_PTR(-EIO);
}
@@ -337,13 +334,8 @@ out:
fatal = err;
} else {
ext4_error(sb, "bit already cleared for inode %lu", ino);
- if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
- int count;
- count = ext4_free_inodes_count(sb, gdp);
- percpu_counter_sub(&sbi->s_freeinodes_counter,
- count);
- }
- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_IBITMAP_CORRUPT);
}
error_return:
@@ -914,6 +906,8 @@ repeat_in_this_group:
if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
ext4_error(sb, "reserved inode found cleared - "
"inode=%lu", ino + 1);
+ ext4_mark_group_bitmap_corrupted(sb, group,
+ EXT4_GROUP_INFO_IBITMAP_CORRUPT);
goto next_group;
}
@@ -1105,6 +1099,8 @@ got:
err = -EIO;
ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
inode->i_ino);
+ ext4_mark_group_bitmap_corrupted(sb, group,
+ EXT4_GROUP_INFO_IBITMAP_CORRUPT);
goto out;
}
inode->i_generation = prandom_u32();
@@ -1206,11 +1202,8 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
- if (IS_ERR(bitmap_bh)) {
- ext4_error(sb, "inode bitmap error %ld for orphan %lu",
- ino, PTR_ERR(bitmap_bh));
+ if (IS_ERR(bitmap_bh))
return (struct inode *) bitmap_bh;
- }
/* Having the inode bit set should be a 100% indicator that this
* is a valid orphan (no e2fsck run on fs). Orphans also include
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index c32802c956d5..bf7fa1507e81 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -561,10 +561,16 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
unsigned epb = inode->i_sb->s_blocksize / sizeof(u32);
int i;
- /* Count number blocks in a subtree under 'partial' */
- count = 1;
- for (i = 0; partial + i != chain + depth - 1; i++)
- count *= epb;
+ /*
+ * Count the number of blocks in the subtree under 'partial'.
+ * At each level we count the number of complete empty subtrees
+ * beyond the current offset and then descend into the subtree
+ * that lies only partially beyond the current offset.
+ */
+ count = 0;
+ for (i = partial - chain + 1; i < depth; i++)
+ count = count * epb + (epb - offsets[i] - 1);
+ count++;
/* Fill in size of a hole we found */
map->m_pblk = 0;
map->m_len = min_t(unsigned int, map->m_len, count);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 70cf4c7b268a..285ed1588730 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -144,6 +144,12 @@ int ext4_find_inline_data_nolock(struct inode *inode)
goto out;
if (!is.s.not_found) {
+ if (is.s.here->e_value_inum) {
+ EXT4_ERROR_INODE(inode, "inline data xattr refers "
+ "to an external xattr inode");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
(void *)ext4_raw_inode(&is.iloc));
EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
@@ -1835,8 +1841,8 @@ int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap)
iomap->offset = 0;
iomap->length = min_t(loff_t, ext4_get_inline_size(inode),
i_size_read(inode));
- iomap->type = 0;
- iomap->flags = IOMAP_F_DATA_INLINE;
+ iomap->type = IOMAP_INLINE;
+ iomap->flags = 0;
out:
up_read(&EXT4_I(inode)->xattr_sem);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1e50c5efae67..2ea07efbe016 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4298,28 +4298,28 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
EXT4_BLOCK_SIZE_BITS(sb);
stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
- /* If there are no blocks to remove, return now */
- if (first_block >= stop_block)
- goto out_stop;
+ /* If there are blocks to remove, do it */
+ if (stop_block > first_block) {
- down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode);
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
- ret = ext4_es_remove_extent(inode, first_block,
- stop_block - first_block);
- if (ret) {
- up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
- }
+ ret = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_remove_space(inode, first_block,
- stop_block - 1);
- else
- ret = ext4_ind_remove_space(handle, inode, first_block,
- stop_block);
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ ret = ext4_ext_remove_space(inode, first_block,
+ stop_block - 1);
+ else
+ ret = ext4_ind_remove_space(handle, inode, first_block,
+ stop_block);
- up_write(&EXT4_I(inode)->i_data_sem);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ }
if (IS_SYNC(inode))
ext4_handle_sync(handle);
@@ -4701,19 +4701,21 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
}
}
-static inline void ext4_iget_extra_inode(struct inode *inode,
+static inline int ext4_iget_extra_inode(struct inode *inode,
struct ext4_inode *raw_inode,
struct ext4_inode_info *ei)
{
__le32 *magic = (void *)raw_inode +
EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
+
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
EXT4_INODE_SIZE(inode->i_sb) &&
*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
- ext4_find_inline_data_nolock(inode);
+ return ext4_find_inline_data_nolock(inode);
} else
EXT4_I(inode)->i_inline_off = 0;
+ return 0;
}
int ext4_get_projid(struct inode *inode, kprojid_t *projid)
@@ -4724,6 +4726,26 @@ int ext4_get_projid(struct inode *inode, kprojid_t *projid)
return 0;
}
+/*
+ * ext4 manages i_version itself for EA inodes: it stores the lower 32 bits of
+ * the refcount in i_version, so use the raw values if the inode has the
+ * EXT4_EA_INODE_FL flag set.
+ */
+static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
+{
+ if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+ inode_set_iversion_raw(inode, val);
+ else
+ inode_set_iversion_queried(inode, val);
+}
+static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
+{
+ if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+ return inode_peek_iversion_raw(inode);
+ else
+ return inode_peek_iversion(inode);
+}
+
struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
{
struct ext4_iloc iloc;
@@ -4893,7 +4915,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
} else {
- ext4_iget_extra_inode(inode, raw_inode, ei);
+ ret = ext4_iget_extra_inode(inode, raw_inode, ei);
+ if (ret)
+ goto bad_inode;
}
}
@@ -4910,7 +4934,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ivers |=
(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
}
- inode_set_iversion_queried(inode, ivers);
+ ext4_inode_set_iversion_queried(inode, ivers);
}
ret = 0;
@@ -4945,6 +4969,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext4_dir_inode_operations;
inode->i_fop = &ext4_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
+ /* VFS does not allow setting these so must be corruption */
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
+ EXT4_ERROR_INODE(inode,
+ "immutable or append flags not allowed on symlinks");
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
if (ext4_encrypted_inode(inode)) {
inode->i_op = &ext4_encrypted_symlink_inode_operations;
ext4_set_aops(inode);
@@ -5196,7 +5227,7 @@ static int ext4_do_update_inode(handle_t *handle,
}
if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
- u64 ivers = inode_peek_iversion(inode);
+ u64 ivers = ext4_inode_peek_iversion(inode);
raw_inode->i_disk_version = cpu_to_le32(ivers);
if (ei->i_extra_isize) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 6884e81c1465..6eae2b91aafa 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -470,6 +470,8 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
"freeing block already freed "
"(bit %u)",
first + i);
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
}
mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
}
@@ -747,10 +749,8 @@ void ext4_mb_generate_buddy(struct super_block *sb,
* corrupt and update bb_free using bitmap value
*/
grp->bb_free = free;
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- grp->bb_free);
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ ext4_mark_group_bitmap_corrupted(sb, group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
}
mb_set_largest_free_order(sb, grp);
@@ -1454,12 +1454,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
"freeing already freed block "
"(bit %u); block bitmap corrupt.",
block);
- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
- percpu_counter_sub(&sbi->s_freeclusters_counter,
- e4b->bd_info->bb_free);
- /* Mark the block group as corrupt. */
- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
- &e4b->bd_info->bb_state);
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
mb_regenerate_buddy(e4b);
goto done;
}
@@ -1956,6 +1952,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
"%d free clusters as per "
"group info. But bitmap says 0",
free);
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
break;
}
@@ -1966,6 +1964,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
"%d free clusters as per "
"group info. But got %d blocks",
free, ex.fe_len);
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
/*
* The number of free blocks differs. This mostly
* indicate that the bitmap is corrupt. So exit
@@ -2516,8 +2516,7 @@ static void ext4_groupinfo_destroy_slabs(void)
int i;
for (i = 0; i < NR_GRPINFO_CACHES; i++) {
- if (ext4_groupinfo_caches[i])
- kmem_cache_destroy(ext4_groupinfo_caches[i]);
+ kmem_cache_destroy(ext4_groupinfo_caches[i]);
ext4_groupinfo_caches[i] = NULL;
}
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b6bec270a8e4..d792b7689d92 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1933,7 +1933,7 @@ retry:
return 0;
n_group = ext4_get_group_number(sb, n_blocks_count - 1);
- if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
+ if (n_group >= (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
ext4_warning(sb, "resize would cause inodes_count overflow");
return -EINVAL;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index eb104e8476f0..00fe75a71c4b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -763,6 +763,36 @@ __acquires(bitlock)
return;
}
+void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
+ ext4_group_t group,
+ unsigned int flags)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+
+ if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) &&
+ !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) {
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
+ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
+ &grp->bb_state);
+ }
+
+ if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) &&
+ !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ if (gdp) {
+ int count;
+
+ count = ext4_free_inodes_count(sb, gdp);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
+ &grp->bb_state);
+ }
+}
+
void ext4_update_dynamic_rev(struct super_block *sb)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -1237,19 +1267,13 @@ static bool ext4_dummy_context(struct inode *inode)
return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
}
-static unsigned ext4_max_namelen(struct inode *inode)
-{
- return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
- EXT4_NAME_LEN;
-}
-
static const struct fscrypt_operations ext4_cryptops = {
.key_prefix = "ext4:",
.get_context = ext4_get_context,
.set_context = ext4_set_context,
.dummy_context = ext4_dummy_context,
.empty_dir = ext4_empty_dir,
- .max_namelen = ext4_max_namelen,
+ .max_namelen = EXT4_NAME_LEN,
};
#endif
@@ -2116,12 +2140,12 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
int read_only)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int res = 0;
+ int err = 0;
if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
ext4_msg(sb, KERN_ERR, "revision level too high, "
"forcing read-only mode");
- res = SB_RDONLY;
+ err = -EROFS;
}
if (read_only)
goto done;
@@ -2154,7 +2178,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
if (sbi->s_journal)
ext4_set_feature_journal_needs_recovery(sb);
- ext4_commit_super(sb, 1);
+ err = ext4_commit_super(sb, 1);
done:
if (test_opt(sb, DEBUG))
printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
@@ -2166,7 +2190,7 @@ done:
sbi->s_mount_opt, sbi->s_mount_opt2);
cleancache_init_fs(sb);
- return res;
+ return err;
}
int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
@@ -3732,8 +3756,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
" that may contain inline data");
sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
}
- err = bdev_dax_supported(sb, blocksize);
- if (err) {
+ if (!bdev_dax_supported(sb->s_bdev, blocksize)) {
ext4_msg(sb, KERN_ERR,
"DAX unsupported by block device. Turning off DAX.");
sbi->s_mount_opt &= ~EXT4_MOUNT_DAX;
@@ -4224,8 +4247,12 @@ no_journal:
goto failed_mount4;
}
- if (ext4_setup_super(sb, es, sb_rdonly(sb)))
+ ret = ext4_setup_super(sb, es, sb_rdonly(sb));
+ if (ret == -EROFS) {
sb->s_flags |= SB_RDONLY;
+ ret = 0;
+ } else if (ret)
+ goto failed_mount4a;
/* determine the minimum size of new large inodes, if present */
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
@@ -4760,11 +4787,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
unlock_buffer(sbh);
error = __sync_dirty_buffer(sbh,
REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
- if (error)
- return error;
-
- error = buffer_write_io_error(sbh);
- if (error) {
+ if (buffer_write_io_error(sbh)) {
ext4_msg(sb, KERN_ERR, "I/O error while writing "
"superblock");
clear_buffer_write_io_error(sbh);
@@ -5165,8 +5188,12 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (sbi->s_journal)
ext4_clear_journal_err(sb, es);
sbi->s_mount_state = le16_to_cpu(es->s_state);
- if (!ext4_setup_super(sb, es, 0))
- sb->s_flags &= ~SB_RDONLY;
+
+ err = ext4_setup_super(sb, es, 0);
+ if (err)
+ goto restore_opts;
+
+ sb->s_flags &= ~SB_RDONLY;
if (ext4_has_feature_mmp(sb))
if (ext4_multi_mount_protect(sb,
le64_to_cpu(es->s_mmp_block))) {
@@ -5190,8 +5217,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
ext4_setup_system_zone(sb);
- if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY))
- ext4_commit_super(sb, 1);
+ if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
+ err = ext4_commit_super(sb, 1);
+ if (err)
+ goto restore_opts;
+ }
#ifdef CONFIG_QUOTA
/* Release old quota file names */
@@ -5252,7 +5282,8 @@ static int ext4_statfs_project(struct super_block *sb,
dquot->dq_dqb.dqb_bsoftlimit :
dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
if (limit && buf->f_blocks > limit) {
- curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
+ curblock = (dquot->dq_dqb.dqb_curspace +
+ dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
buf->f_blocks = limit;
buf->f_bfree = buf->f_bavail =
(buf->f_blocks > curblock) ?
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 499cb4b1fbd2..fc4ced59c565 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1688,7 +1688,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
/* No failures allowed past this point. */
- if (!s->not_found && here->e_value_offs) {
+ if (!s->not_found && here->e_value_size && here->e_value_offs) {
/* Remove the old value. */
void *first_val = s->base + min_offs;
size_t offs = le16_to_cpu(here->e_value_offs);
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 629001b28632..197a9d8a15ef 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -43,7 +43,7 @@ ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
err = ext4_xattr_set_handle(handle, inode,
EXT4_XATTR_INDEX_SECURITY,
xattr->name, xattr->value,
- xattr->value_len, 0);
+ xattr->value_len, XATTR_CREATE);
if (err < 0)
break;
}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 42d564c5ccd0..970ae27f401c 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1930,19 +1930,13 @@ static bool f2fs_dummy_context(struct inode *inode)
return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode));
}
-static unsigned f2fs_max_namelen(struct inode *inode)
-{
- return S_ISLNK(inode->i_mode) ?
- inode->i_sb->s_blocksize : F2FS_NAME_LEN;
-}
-
static const struct fscrypt_operations f2fs_cryptops = {
.key_prefix = "f2fs:",
.get_context = f2fs_get_context,
.set_context = f2fs_set_context,
.dummy_context = f2fs_dummy_context,
.empty_dir = f2fs_empty_dir,
- .max_namelen = f2fs_max_namelen,
+ .max_namelen = F2FS_NAME_LEN,
};
#endif
diff --git a/fs/fcntl.c b/fs/fcntl.c
index c42169459298..12273b6ea56d 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -23,7 +23,7 @@
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
-#include <linux/shmem_fs.h>
+#include <linux/memfd.h>
#include <linux/compat.h>
#include <linux/poll.h>
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index ec85765502f1..5a48cee6d7d3 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -34,7 +34,7 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type)
return ERR_PTR(-ENOMEM);
size = fuse_getxattr(inode, name, value, PAGE_SIZE);
if (size > 0)
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ acl = posix_acl_from_xattr(fc->user_ns, value, size);
else if ((size == 0) || (size == -ENODATA) ||
(size == -EOPNOTSUPP && fc->no_getxattr))
acl = NULL;
@@ -81,7 +81,7 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type)
if (!value)
return -ENOMEM;
- ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ ret = posix_acl_to_xattr(fc->user_ns, acl, value, size);
if (ret < 0) {
kfree(value);
return ret;
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index b9ea99c5b5b3..0b694655d988 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -35,7 +35,7 @@ static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf,
{
struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
if (fc) {
- fuse_abort_conn(fc);
+ fuse_abort_conn(fc, true);
fuse_conn_put(fc);
}
return count;
@@ -211,10 +211,11 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
if (!dentry)
return NULL;
- fc->ctl_dentry[fc->ctl_ndents++] = dentry;
inode = new_inode(fuse_control_sb);
- if (!inode)
+ if (!inode) {
+ dput(dentry);
return NULL;
+ }
inode->i_ino = get_next_ino();
inode->i_mode = mode;
@@ -228,6 +229,9 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
set_nlink(inode, nlink);
inode->i_private = fc;
d_add(dentry, inode);
+
+ fc->ctl_dentry[fc->ctl_ndents++] = dentry;
+
return dentry;
}
@@ -284,7 +288,10 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
for (i = fc->ctl_ndents - 1; i >= 0; i--) {
struct dentry *dentry = fc->ctl_dentry[i];
d_inode(dentry)->i_private = NULL;
- d_drop(dentry);
+ if (!i) {
+ /* Get rid of submounts: */
+ d_invalidate(dentry);
+ }
dput(dentry);
}
drop_nlink(d_inode(fuse_control_sb->s_root));
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index e9e97803442a..8f68181256c0 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -48,6 +48,7 @@
#include <linux/stat.h>
#include <linux/module.h>
#include <linux/uio.h>
+#include <linux/user_namespace.h>
#include "fuse_i.h"
@@ -406,7 +407,7 @@ err_unlock:
err_region:
unregister_chrdev_region(devt, 1);
err:
- fuse_abort_conn(fc);
+ fuse_abort_conn(fc, false);
goto out;
}
@@ -498,7 +499,11 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
if (!cc)
return -ENOMEM;
- fuse_conn_init(&cc->fc);
+ /*
+ * Limit the cuse channel to requests that can
+ * be represented in file->f_cred->user_ns.
+ */
+ fuse_conn_init(&cc->fc, file->f_cred->user_ns);
fud = fuse_dev_alloc(&cc->fc);
if (!fud) {
@@ -581,7 +586,7 @@ static ssize_t cuse_class_abort_store(struct device *dev,
{
struct cuse_conn *cc = dev_get_drvdata(dev);
- fuse_abort_conn(&cc->fc);
+ fuse_abort_conn(&cc->fc, false);
return count;
}
static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 5d06384c2cae..e03ca14f40e9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -112,13 +112,6 @@ static void __fuse_put_request(struct fuse_req *req)
refcount_dec(&req->count);
}
-static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req)
-{
- req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
- req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
- req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
-}
-
void fuse_set_initialized(struct fuse_conn *fc)
{
/* Make sure stores before this are seen on another CPU */
@@ -163,11 +156,19 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
goto out;
}
- fuse_req_init_context(fc, req);
+ req->in.h.uid = from_kuid(fc->user_ns, current_fsuid());
+ req->in.h.gid = from_kgid(fc->user_ns, current_fsgid());
+ req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
+
__set_bit(FR_WAITING, &req->flags);
if (for_background)
__set_bit(FR_BACKGROUND, &req->flags);
+ if (unlikely(req->in.h.uid == ((uid_t)-1) ||
+ req->in.h.gid == ((gid_t)-1))) {
+ fuse_put_request(fc, req);
+ return ERR_PTR(-EOVERFLOW);
+ }
return req;
out:
@@ -256,7 +257,10 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
if (!req)
req = get_reserved_req(fc, file);
- fuse_req_init_context(fc, req);
+ req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
+ req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
+ req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
+
__set_bit(FR_WAITING, &req->flags);
__clear_bit(FR_BACKGROUND, &req->flags);
return req;
@@ -381,8 +385,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
if (!fc->blocked && waitqueue_active(&fc->blocked_waitq))
wake_up(&fc->blocked_waitq);
- if (fc->num_background == fc->congestion_threshold &&
- fc->connected && fc->sb) {
+ if (fc->num_background == fc->congestion_threshold && fc->sb) {
clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
}
@@ -1234,9 +1237,10 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
if (err)
goto err_unlock;
- err = -ENODEV;
- if (!fiq->connected)
+ if (!fiq->connected) {
+ err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV;
goto err_unlock;
+ }
if (!list_empty(&fiq->interrupts)) {
req = list_entry(fiq->interrupts.next, struct fuse_req,
@@ -1260,12 +1264,6 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
in = &req->in;
reqsize = in->h.len;
- if (task_active_pid_ns(current) != fc->pid_ns) {
- rcu_read_lock();
- in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns));
- rcu_read_unlock();
- }
-
/* If request is too large, reply with an error and restart the read */
if (nbytes < reqsize) {
req->out.h.error = -EIO;
@@ -1287,7 +1285,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
spin_lock(&fpq->lock);
clear_bit(FR_LOCKED, &req->flags);
if (!fpq->connected) {
- err = -ENODEV;
+ err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV;
goto out_end;
}
if (err) {
@@ -2076,7 +2074,7 @@ static void end_polls(struct fuse_conn *fc)
* is OK, the request will in that case be removed from the list before we touch
* it.
*/
-void fuse_abort_conn(struct fuse_conn *fc)
+void fuse_abort_conn(struct fuse_conn *fc, bool is_abort)
{
struct fuse_iqueue *fiq = &fc->iq;
@@ -2089,6 +2087,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
fc->connected = 0;
fc->blocked = 0;
+ fc->aborted = is_abort;
fuse_set_initialized(fc);
list_for_each_entry(fud, &fc->devices, entry) {
struct fuse_pqueue *fpq = &fud->pq;
@@ -2151,7 +2150,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
/* Are we the last open device? */
if (atomic_dec_and_test(&fc->dev_count)) {
WARN_ON(fc->iq.fasync != NULL);
- fuse_abort_conn(fc);
+ fuse_abort_conn(fc, false);
}
fuse_dev_free(fud);
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 24967382a7b1..56231b31f806 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -858,8 +858,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
stat->ino = attr->ino;
stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
stat->nlink = attr->nlink;
- stat->uid = make_kuid(&init_user_ns, attr->uid);
- stat->gid = make_kgid(&init_user_ns, attr->gid);
+ stat->uid = make_kuid(fc->user_ns, attr->uid);
+ stat->gid = make_kgid(fc->user_ns, attr->gid);
stat->rdev = inode->i_rdev;
stat->atime.tv_sec = attr->atime;
stat->atime.tv_nsec = attr->atimensec;
@@ -924,12 +924,20 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
}
static int fuse_update_get_attr(struct inode *inode, struct file *file,
- struct kstat *stat)
+ struct kstat *stat, unsigned int flags)
{
struct fuse_inode *fi = get_fuse_inode(inode);
int err = 0;
+ bool sync;
- if (time_before64(fi->i_time, get_jiffies_64())) {
+ if (flags & AT_STATX_FORCE_SYNC)
+ sync = true;
+ else if (flags & AT_STATX_DONT_SYNC)
+ sync = false;
+ else
+ sync = time_before64(fi->i_time, get_jiffies_64());
+
+ if (sync) {
forget_all_cached_acls(inode);
err = fuse_do_getattr(inode, stat, file);
} else if (stat) {
@@ -943,7 +951,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file,
int fuse_update_attributes(struct inode *inode, struct file *file)
{
- return fuse_update_get_attr(inode, file, NULL);
+ return fuse_update_get_attr(inode, file, NULL, 0);
}
int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
@@ -1030,7 +1038,7 @@ int fuse_allow_current_process(struct fuse_conn *fc)
const struct cred *cred;
if (fc->allow_other)
- return 1;
+ return current_in_userns(fc->user_ns);
cred = current_cred();
if (uid_eq(cred->euid, fc->user_id) &&
@@ -1475,17 +1483,17 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
return true;
}
-static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
- bool trust_local_cmtime)
+static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr,
+ struct fuse_setattr_in *arg, bool trust_local_cmtime)
{
unsigned ivalid = iattr->ia_valid;
if (ivalid & ATTR_MODE)
arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode;
if (ivalid & ATTR_UID)
- arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
+ arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid);
if (ivalid & ATTR_GID)
- arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
+ arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid);
if (ivalid & ATTR_SIZE)
arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size;
if (ivalid & ATTR_ATIME) {
@@ -1629,8 +1637,19 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
return err;
if (attr->ia_valid & ATTR_OPEN) {
- if (fc->atomic_o_trunc)
+ /* This is coming from open(..., ... | O_TRUNC); */
+ WARN_ON(!(attr->ia_valid & ATTR_SIZE));
+ WARN_ON(attr->ia_size != 0);
+ if (fc->atomic_o_trunc) {
+ /*
+ * No need to send request to userspace, since actual
+ * truncation has already been done by OPEN. But still
+ * need to truncate page cache.
+ */
+ i_size_write(inode, 0);
+ truncate_pagecache(inode, 0);
return 0;
+ }
file = NULL;
}
@@ -1646,7 +1665,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
- iattr_to_fattr(attr, &inarg, trust_local_cmtime);
+ iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime);
if (file) {
struct fuse_file *ff = file->private_data;
inarg.valid |= FATTR_FH;
@@ -1783,7 +1802,7 @@ static int fuse_getattr(const struct path *path, struct kstat *stat,
if (!fuse_allow_current_process(fc))
return -EACCES;
- return fuse_update_get_attr(inode, NULL, stat);
+ return fuse_update_get_attr(inode, NULL, stat, flags);
}
static const struct inode_operations fuse_dir_inode_operations = {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c4c093bbf456..5256ad333b05 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -26,6 +26,7 @@
#include <linux/xattr.h>
#include <linux/pid_namespace.h>
#include <linux/refcount.h>
+#include <linux/user_namespace.h>
/** Max number of pages that can be used in a single read request */
#define FUSE_MAX_PAGES_PER_REQ 32
@@ -466,6 +467,9 @@ struct fuse_conn {
/** The pid namespace for this mount */
struct pid_namespace *pid_ns;
+ /** The user namespace for this mount */
+ struct user_namespace *user_ns;
+
/** Maximum read size */
unsigned max_read;
@@ -515,6 +519,9 @@ struct fuse_conn {
abort and device release */
unsigned connected;
+ /** Connection aborted via sysfs */
+ bool aborted;
+
/** Connection failed (version mismatch). Cannot race with
setting other bitfields since it is only set once in INIT
reply, before any other request, and never cleared */
@@ -526,6 +533,9 @@ struct fuse_conn {
/** Do readpages asynchronously? Only set in INIT */
unsigned async_read:1;
+ /** Return a unique read error after abort. Only set in INIT */
+ unsigned abort_err:1;
+
/** Do not send separate SETATTR request before open(O_TRUNC) */
unsigned atomic_o_trunc:1;
@@ -851,7 +861,7 @@ void fuse_request_send_background_locked(struct fuse_conn *fc,
struct fuse_req *req);
/* Abort all requests */
-void fuse_abort_conn(struct fuse_conn *fc);
+void fuse_abort_conn(struct fuse_conn *fc, bool is_abort);
/**
* Invalidate inode attributes
@@ -870,7 +880,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
/**
* Initialize fuse_conn
*/
-void fuse_conn_init(struct fuse_conn *fc);
+void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns);
/**
* Release reference to fuse_conn
@@ -975,6 +985,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size);
int fuse_removexattr(struct inode *inode, const char *name);
extern const struct xattr_handler *fuse_xattr_handlers[];
extern const struct xattr_handler *fuse_acl_xattr_handlers[];
+extern const struct xattr_handler *fuse_no_acl_xattr_handlers[];
struct posix_acl;
struct posix_acl *fuse_get_acl(struct inode *inode, int type);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index ef309958e060..ffcaf98044b9 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -171,8 +171,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_ino = fuse_squash_ino(attr->ino);
inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
set_nlink(inode, attr->nlink);
- inode->i_uid = make_kuid(&init_user_ns, attr->uid);
- inode->i_gid = make_kgid(&init_user_ns, attr->gid);
+ inode->i_uid = make_kuid(fc->user_ns, attr->uid);
+ inode->i_gid = make_kgid(fc->user_ns, attr->gid);
inode->i_blocks = attr->blocks;
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
@@ -371,7 +371,7 @@ void fuse_unlock_inode(struct inode *inode)
static void fuse_umount_begin(struct super_block *sb)
{
- fuse_abort_conn(get_fuse_conn_super(sb));
+ fuse_abort_conn(get_fuse_conn_super(sb), false);
}
static void fuse_send_destroy(struct fuse_conn *fc)
@@ -393,7 +393,7 @@ static void fuse_put_super(struct super_block *sb)
fuse_send_destroy(fc);
- fuse_abort_conn(fc);
+ fuse_abort_conn(fc, false);
mutex_lock(&fuse_mutex);
list_del(&fc->entry);
fuse_ctl_remove_conn(fc);
@@ -477,7 +477,8 @@ static int fuse_match_uint(substring_t *s, unsigned int *res)
return err;
}
-static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
+static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev,
+ struct user_namespace *user_ns)
{
char *p;
memset(d, 0, sizeof(struct fuse_mount_data));
@@ -513,7 +514,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
case OPT_USER_ID:
if (fuse_match_uint(&args[0], &uv))
return 0;
- d->user_id = make_kuid(current_user_ns(), uv);
+ d->user_id = make_kuid(user_ns, uv);
if (!uid_valid(d->user_id))
return 0;
d->user_id_present = 1;
@@ -522,7 +523,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
case OPT_GROUP_ID:
if (fuse_match_uint(&args[0], &uv))
return 0;
- d->group_id = make_kgid(current_user_ns(), uv);
+ d->group_id = make_kgid(user_ns, uv);
if (!gid_valid(d->group_id))
return 0;
d->group_id_present = 1;
@@ -565,8 +566,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
struct super_block *sb = root->d_sb;
struct fuse_conn *fc = get_fuse_conn_super(sb);
- seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
- seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
+ seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id));
+ seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id));
if (fc->default_permissions)
seq_puts(m, ",default_permissions");
if (fc->allow_other)
@@ -597,7 +598,7 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq)
fpq->connected = 1;
}
-void fuse_conn_init(struct fuse_conn *fc)
+void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
{
memset(fc, 0, sizeof(*fc));
spin_lock_init(&fc->lock);
@@ -621,6 +622,7 @@ void fuse_conn_init(struct fuse_conn *fc)
fc->attr_version = 1;
get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
fc->pid_ns = get_pid_ns(task_active_pid_ns(current));
+ fc->user_ns = get_user_ns(user_ns);
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
@@ -630,6 +632,7 @@ void fuse_conn_put(struct fuse_conn *fc)
if (fc->destroy_req)
fuse_request_free(fc->destroy_req);
put_pid_ns(fc->pid_ns);
+ put_user_ns(fc->user_ns);
fc->release(fc);
}
}
@@ -918,6 +921,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->posix_acl = 1;
fc->sb->s_xattr = fuse_acl_xattr_handlers;
}
+ if (arg->flags & FUSE_ABORT_ERROR)
+ fc->abort_err = 1;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
@@ -948,7 +953,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
- FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL;
+ FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
+ FUSE_ABORT_ERROR;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);
@@ -1061,7 +1067,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
- if (!parse_fuse_opt(data, &d, is_bdev))
+ if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns))
goto err;
if (is_bdev) {
@@ -1089,16 +1095,27 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (!file)
goto err;
- if ((file->f_op != &fuse_dev_operations) ||
- (file->f_cred->user_ns != &init_user_ns))
+ /*
+ * Require mount to happen from the same user namespace which
+ * opened /dev/fuse to prevent potential attacks.
+ */
+ if (file->f_op != &fuse_dev_operations ||
+ file->f_cred->user_ns != sb->s_user_ns)
goto err_fput;
+ /*
+ * If we are not in the initial user namespace posix
+ * acls must be translated.
+ */
+ if (sb->s_user_ns != &init_user_ns)
+ sb->s_xattr = fuse_no_acl_xattr_handlers;
+
fc = kmalloc(sizeof(*fc), GFP_KERNEL);
err = -ENOMEM;
if (!fc)
goto err_fput;
- fuse_conn_init(fc);
+ fuse_conn_init(fc, sb->s_user_ns);
fc->release = fuse_free_conn;
fud = fuse_dev_alloc(fc);
@@ -1179,6 +1196,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
fuse_dev_free(fud);
err_put_conn:
fuse_conn_put(fc);
+ sb->s_fs_info = NULL;
err_fput:
fput(file);
err:
@@ -1208,7 +1226,7 @@ static void fuse_kill_sb_anon(struct super_block *sb)
static struct file_system_type fuse_fs_type = {
.owner = THIS_MODULE,
.name = "fuse",
- .fs_flags = FS_HAS_SUBTYPE,
+ .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT,
.mount = fuse_mount,
.kill_sb = fuse_kill_sb_anon,
};
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 3caac46b08b0..433717640f78 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -192,6 +192,26 @@ static int fuse_xattr_set(const struct xattr_handler *handler,
return fuse_setxattr(inode, name, value, size, flags);
}
+static bool no_xattr_list(struct dentry *dentry)
+{
+ return false;
+}
+
+static int no_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *value, size_t size)
+{
+ return -EOPNOTSUPP;
+}
+
+static int no_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *nodee,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
static const struct xattr_handler fuse_xattr_handler = {
.prefix = "",
.get = fuse_xattr_get,
@@ -209,3 +229,26 @@ const struct xattr_handler *fuse_acl_xattr_handlers[] = {
&fuse_xattr_handler,
NULL
};
+
+static const struct xattr_handler fuse_no_acl_access_xattr_handler = {
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
+ .flags = ACL_TYPE_ACCESS,
+ .list = no_xattr_list,
+ .get = no_xattr_get,
+ .set = no_xattr_set,
+};
+
+static const struct xattr_handler fuse_no_acl_default_xattr_handler = {
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
+ .flags = ACL_TYPE_DEFAULT, /* default ACL, matching .name (was ACL_TYPE_ACCESS) */
+ .list = no_xattr_list,
+ .get = no_xattr_get,
+ .set = no_xattr_set,
+};
+
+const struct xattr_handler *fuse_no_acl_xattr_handlers[] = {
+ &fuse_no_acl_access_xattr_handler,
+ &fuse_no_acl_default_xattr_handler,
+ &fuse_xattr_handler,
+ NULL
+};
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index a7b586e02693..ed6699705c13 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -767,10 +767,11 @@ static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
sizeof(struct gfs2_dinode);
iomap->offset = 0;
iomap->length = i_size_read(inode);
- iomap->type = IOMAP_MAPPED;
- iomap->flags = IOMAP_F_DATA_INLINE;
+ iomap->type = IOMAP_INLINE;
}
+#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
+
/**
* gfs2_iomap_get - Map blocks from an inode to disk blocks
* @inode: The inode
@@ -846,7 +847,7 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
iomap->type = IOMAP_MAPPED;
iomap->flags = IOMAP_F_MERGED;
if (eob)
- iomap->flags |= IOMAP_F_BOUNDARY;
+ iomap->flags |= IOMAP_F_GFS2_BOUNDARY;
out:
iomap->bdev = inode->i_sb->s_bdev;
@@ -952,12 +953,12 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
if (iomap.length > bh_map->b_size) {
iomap.length = bh_map->b_size;
- iomap.flags &= ~IOMAP_F_BOUNDARY;
+ iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
}
if (iomap.addr != IOMAP_NULL_ADDR)
map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
bh_map->b_size = iomap.length;
- if (iomap.flags & IOMAP_F_BOUNDARY)
+ if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
set_buffer_boundary(bh_map);
if (iomap.flags & IOMAP_F_NEW)
set_buffer_new(bh_map);
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 2577ef1034ef..2a153aed4c19 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -26,8 +26,7 @@
#include "hpfs.h"
#define EIOERROR EIO
-#define EFSERROR EPERM
-#define EMEMERROR ENOMEM
+#define EFSERROR EUCLEAN
#define ANODE_ALLOC_FWD 512
#define FNODE_ALLOC_FWD 0
diff --git a/fs/inode.c b/fs/inode.c
index 3b55391072f3..0df41bb77e0f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1004,6 +1004,70 @@ void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
EXPORT_SYMBOL(unlock_two_nondirectories);
/**
+ * inode_insert5 - obtain an inode from a mounted file system
+ * @inode: pre-allocated inode to use for insert to cache
+ * @hashval: hash value (usually inode number) to get
+ * @test: callback used for comparisons between inodes
+ * @set: callback used to initialize a new struct inode
+ * @data: opaque data pointer to pass to @test and @set
+ *
+ * Search for the inode specified by @hashval and @data in the inode cache,
+ * and if present return it with an increased reference count. This is
+ * a variant of iget5_locked() for callers that don't want to fail on memory
+ * allocation of inode.
+ *
+ * If the inode is not in cache, insert the pre-allocated inode to cache and
+ * return it locked, hashed, and with the I_NEW flag set. The file system gets
+ * to fill it in before unlocking it via unlock_new_inode().
+ *
+ * Note both @test and @set are called with the inode_hash_lock held, so can't
+ * sleep.
+ */
+struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
+ int (*test)(struct inode *, void *),
+ int (*set)(struct inode *, void *), void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
+ struct inode *old;
+
+again:
+ spin_lock(&inode_hash_lock);
+ old = find_inode(inode->i_sb, head, test, data);
+ if (unlikely(old)) {
+ /*
+ * Uhhuh, somebody else created the same inode under us.
+ * Use the old inode instead of the preallocated one.
+ */
+ spin_unlock(&inode_hash_lock);
+ wait_on_inode(old);
+ if (unlikely(inode_unhashed(old))) {
+ iput(old);
+ goto again;
+ }
+ return old;
+ }
+
+ if (set && unlikely(set(inode, data))) {
+ inode = NULL;
+ goto unlock;
+ }
+
+ /*
+ * Return the locked inode with I_NEW set, the
+ * caller is responsible for filling in the contents
+ */
+ spin_lock(&inode->i_lock);
+ inode->i_state |= I_NEW;
+ hlist_add_head(&inode->i_hash, head);
+ spin_unlock(&inode->i_lock);
+unlock:
+ spin_unlock(&inode_hash_lock);
+
+ return inode;
+}
+EXPORT_SYMBOL(inode_insert5);
+
+/**
* iget5_locked - obtain an inode from a mounted file system
* @sb: super block of file system
* @hashval: hash value (usually inode number) to get
@@ -1027,66 +1091,18 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *),
int (*set)(struct inode *, void *), void *data)
{
- struct hlist_head *head = inode_hashtable + hash(sb, hashval);
- struct inode *inode;
-again:
- spin_lock(&inode_hash_lock);
- inode = find_inode(sb, head, test, data);
- spin_unlock(&inode_hash_lock);
+ struct inode *inode = ilookup5(sb, hashval, test, data);
- if (inode) {
- wait_on_inode(inode);
- if (unlikely(inode_unhashed(inode))) {
- iput(inode);
- goto again;
- }
- return inode;
- }
+ if (!inode) {
+ struct inode *new = new_inode(sb);
- inode = alloc_inode(sb);
- if (inode) {
- struct inode *old;
-
- spin_lock(&inode_hash_lock);
- /* We released the lock, so.. */
- old = find_inode(sb, head, test, data);
- if (!old) {
- if (set(inode, data))
- goto set_failed;
-
- spin_lock(&inode->i_lock);
- inode->i_state = I_NEW;
- hlist_add_head(&inode->i_hash, head);
- spin_unlock(&inode->i_lock);
- inode_sb_list_add(inode);
- spin_unlock(&inode_hash_lock);
-
- /* Return the locked inode with I_NEW set, the
- * caller is responsible for filling in the contents
- */
- return inode;
- }
-
- /*
- * Uhhuh, somebody else created the same inode under
- * us. Use the old inode instead of the one we just
- * allocated.
- */
- spin_unlock(&inode_hash_lock);
- destroy_inode(inode);
- inode = old;
- wait_on_inode(inode);
- if (unlikely(inode_unhashed(inode))) {
- iput(inode);
- goto again;
+ if (new) {
+ inode = inode_insert5(new, hashval, test, set, data);
+ if (unlikely(inode != new))
+ iput(new);
}
}
return inode;
-
-set_failed:
- spin_unlock(&inode_hash_lock);
- destroy_inode(inode);
- return NULL;
}
EXPORT_SYMBOL(iget5_locked);
@@ -1427,43 +1443,13 @@ EXPORT_SYMBOL(insert_inode_locked);
int insert_inode_locked4(struct inode *inode, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
- struct super_block *sb = inode->i_sb;
- struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *old = inode_insert5(inode, hashval, test, NULL, data);
- while (1) {
- struct inode *old = NULL;
-
- spin_lock(&inode_hash_lock);
- hlist_for_each_entry(old, head, i_hash) {
- if (old->i_sb != sb)
- continue;
- if (!test(old, data))
- continue;
- spin_lock(&old->i_lock);
- if (old->i_state & (I_FREEING|I_WILL_FREE)) {
- spin_unlock(&old->i_lock);
- continue;
- }
- break;
- }
- if (likely(!old)) {
- spin_lock(&inode->i_lock);
- inode->i_state |= I_NEW;
- hlist_add_head(&inode->i_hash, head);
- spin_unlock(&inode->i_lock);
- spin_unlock(&inode_hash_lock);
- return 0;
- }
- __iget(old);
- spin_unlock(&old->i_lock);
- spin_unlock(&inode_hash_lock);
- wait_on_inode(old);
- if (unlikely(!inode_unhashed(old))) {
- iput(old);
- return -EBUSY;
- }
+ if (old != inode) {
iput(old);
+ return -EBUSY;
}
+ return 0;
}
EXPORT_SYMBOL(insert_inode_locked4);
diff --git a/fs/iomap.c b/fs/iomap.c
index afd163586aa0..7d1e9f45f098 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -20,6 +20,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
+#include <linux/pagevec.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/backing-dev.h>
@@ -27,6 +28,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/dax.h>
#include <linux/sched/signal.h>
+#include <linux/swap.h>
#include "internal.h"
@@ -95,6 +97,12 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
return written ? written : ret;
}
+static sector_t
+iomap_sector(struct iomap *iomap, loff_t pos)
+{
+ return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
+}
+
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
@@ -352,11 +360,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
struct iomap *iomap)
{
- sector_t sector = (iomap->addr +
- (pos & PAGE_MASK) - iomap->offset) >> 9;
-
- return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
- offset, bytes);
+ return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
+ iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
}
static loff_t
@@ -501,10 +506,13 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
case IOMAP_DELALLOC:
flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
break;
+ case IOMAP_MAPPED:
+ break;
case IOMAP_UNWRITTEN:
flags |= FIEMAP_EXTENT_UNWRITTEN;
break;
- case IOMAP_MAPPED:
+ case IOMAP_INLINE:
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
break;
}
@@ -512,8 +520,6 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
flags |= FIEMAP_EXTENT_MERGED;
if (iomap->flags & IOMAP_F_SHARED)
flags |= FIEMAP_EXTENT_SHARED;
- if (iomap->flags & IOMAP_F_DATA_INLINE)
- flags |= FIEMAP_EXTENT_DATA_INLINE;
return fiemap_fill_next_extent(fi, iomap->offset,
iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
@@ -587,6 +593,113 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
}
EXPORT_SYMBOL_GPL(iomap_fiemap);
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
+ * Returns true if found and updates @lastoff to the offset in file.
+ */
+static bool
+page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff,
+ int whence)
+{
+ const struct address_space_operations *ops = inode->i_mapping->a_ops;
+ unsigned int bsize = i_blocksize(inode), off;
+ bool seek_data = whence == SEEK_DATA;
+ loff_t poff = page_offset(page);
+
+ if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE))
+ return false;
+
+ if (*lastoff < poff) {
+ /*
+ * Last offset smaller than the start of the page means we found
+ * a hole:
+ */
+ if (whence == SEEK_HOLE)
+ return true;
+ *lastoff = poff;
+ }
+
+ /*
+ * Just check the page unless we can and should check block ranges:
+ */
+ if (bsize == PAGE_SIZE || !ops->is_partially_uptodate)
+ return PageUptodate(page) == seek_data;
+
+ lock_page(page);
+ if (unlikely(page->mapping != inode->i_mapping))
+ goto out_unlock_not_found;
+
+ for (off = 0; off < PAGE_SIZE; off += bsize) {
+ if ((*lastoff & ~PAGE_MASK) >= off + bsize)
+ continue;
+ if (ops->is_partially_uptodate(page, off, bsize) == seek_data) {
+ unlock_page(page);
+ return true;
+ }
+ *lastoff = poff + off + bsize;
+ }
+
+out_unlock_not_found:
+ unlock_page(page);
+ return false;
+}
+
+/*
+ * Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ *
+ * Within unwritten extents, the page cache determines which parts are holes
+ * and which are data: uptodate buffer heads count as data; everything else
+ * counts as a hole.
+ *
+ * Returns the resulting offset on success, and -ENOENT otherwise.
+ */
+static loff_t
+page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
+ int whence)
+{
+ pgoff_t index = offset >> PAGE_SHIFT;
+ pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
+ loff_t lastoff = offset;
+ struct pagevec pvec;
+
+ if (length <= 0)
+ return -ENOENT;
+
+ pagevec_init(&pvec);
+
+ do {
+ unsigned nr_pages, i;
+
+ nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
+ end - 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (page_seek_hole_data(inode, page, &lastoff, whence))
+ goto check_range;
+ lastoff = page_offset(page) + PAGE_SIZE;
+ }
+ pagevec_release(&pvec);
+ } while (index < end);
+
+ /* When no page at lastoff and we are not done, we found a hole. */
+ if (whence != SEEK_HOLE)
+ goto not_found;
+
+check_range:
+ if (lastoff < offset + length)
+ goto out;
+not_found:
+ lastoff = -ENOENT;
+out:
+ pagevec_release(&pvec);
+ return lastoff;
+}
+
+
static loff_t
iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
void *data, struct iomap *iomap)
@@ -685,6 +798,8 @@ EXPORT_SYMBOL_GPL(iomap_seek_data);
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
+#define IOMAP_DIO_WRITE_FUA (1 << 28)
+#define IOMAP_DIO_NEED_SYNC (1 << 29)
#define IOMAP_DIO_WRITE (1 << 30)
#define IOMAP_DIO_DIRTY (1 << 31)
@@ -759,6 +874,13 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
dio_warn_stale_pagecache(iocb->ki_filp);
}
+ /*
+ * If this is a DSYNC write, make sure we push it to stable storage now
+ * that we've written data.
+ */
+ if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
+ ret = generic_write_sync(iocb, ret);
+
inode_dio_end(file_inode(iocb->ki_filp));
kfree(dio);
@@ -769,13 +891,8 @@ static void iomap_dio_complete_work(struct work_struct *work)
{
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
struct kiocb *iocb = dio->iocb;
- bool is_write = (dio->flags & IOMAP_DIO_WRITE);
- ssize_t ret;
- ret = iomap_dio_complete(dio);
- if (is_write && ret > 0)
- ret = generic_write_sync(iocb, ret);
- iocb->ki_complete(iocb, ret, 0);
+ iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
}
/*
@@ -833,14 +950,12 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
bio = bio_alloc(GFP_KERNEL, 1);
bio_set_dev(bio, iomap->bdev);
- bio->bi_iter.bi_sector =
- (iomap->addr + pos - iomap->offset) >> 9;
+ bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
get_page(page);
- if (bio_add_page(bio, page, len, 0) != len)
- BUG();
+ __bio_add_page(bio, page, len, 0);
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
atomic_inc(&dio->ref);
@@ -858,6 +973,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
struct iov_iter iter;
struct bio *bio;
bool need_zeroout = false;
+ bool use_fua = false;
int nr_pages, ret;
size_t copied = 0;
@@ -881,8 +997,20 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
case IOMAP_MAPPED:
if (iomap->flags & IOMAP_F_SHARED)
dio->flags |= IOMAP_DIO_COW;
- if (iomap->flags & IOMAP_F_NEW)
+ if (iomap->flags & IOMAP_F_NEW) {
need_zeroout = true;
+ } else {
+ /*
+ * Use a FUA write if we need datasync semantics, this
+ * is a pure data IO that doesn't require any metadata
+ * updates and the underlying device supports FUA. This
+ * allows us to avoid cache flushes on IO completion.
+ */
+ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
+ (dio->flags & IOMAP_DIO_WRITE_FUA) &&
+ blk_queue_fua(bdev_get_queue(iomap->bdev)))
+ use_fua = true;
+ }
break;
default:
WARN_ON_ONCE(1);
@@ -916,9 +1044,9 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
bio = bio_alloc(GFP_KERNEL, nr_pages);
bio_set_dev(bio, iomap->bdev);
- bio->bi_iter.bi_sector =
- (iomap->addr + pos - iomap->offset) >> 9;
+ bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
bio->bi_write_hint = dio->iocb->ki_hint;
+ bio->bi_ioprio = dio->iocb->ki_ioprio;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
@@ -930,10 +1058,14 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
n = bio->bi_iter.bi_size;
if (dio->flags & IOMAP_DIO_WRITE) {
- bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+ bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+ if (use_fua)
+ bio->bi_opf |= REQ_FUA;
+ else
+ dio->flags &= ~IOMAP_DIO_WRITE_FUA;
task_io_account_write(n);
} else {
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio->bi_opf = REQ_OP_READ;
if (dio->flags & IOMAP_DIO_DIRTY)
bio_set_pages_dirty(bio);
}
@@ -961,6 +1093,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
return copied;
}
+/*
+ * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
+ * is being issued as AIO or not. This allows us to optimise pure data writes
+ * to use REQ_FUA rather than requiring generic_write_sync() to issue a
+ * REQ_FLUSH post write. This is slightly tricky because a single request here
+ * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
+ * may be pure data writes. In that case, we still need to do a full data sync
+ * completion.
+ */
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
@@ -1005,8 +1146,21 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iter->type == ITER_IOVEC)
dio->flags |= IOMAP_DIO_DIRTY;
} else {
- dio->flags |= IOMAP_DIO_WRITE;
flags |= IOMAP_WRITE;
+ dio->flags |= IOMAP_DIO_WRITE;
+
+ /* for data sync or sync, we need sync completion processing */
+ if (iocb->ki_flags & IOCB_DSYNC)
+ dio->flags |= IOMAP_DIO_NEED_SYNC;
+
+ /*
+ * For datasync only writes, we optimistically try using FUA for
+ * this IO. Any non-FUA write that occurs will clear this flag,
+ * hence we know before completion whether a cache flush is
+ * necessary.
+ */
+ if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
+ dio->flags |= IOMAP_DIO_WRITE_FUA;
}
if (iocb->ki_flags & IOCB_NOWAIT) {
@@ -1062,6 +1216,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret < 0)
iomap_dio_set_error(dio, ret);
+ /*
+ * If all the writes we issued were FUA, we don't need to flush the
+ * cache on IO completion. Clear the sync flag for this case.
+ */
+ if (dio->flags & IOMAP_DIO_WRITE_FUA)
+ dio->flags &= ~IOMAP_DIO_NEED_SYNC;
+
if (!atomic_dec_and_test(&dio->ref)) {
if (!is_sync_kiocb(iocb))
return -EIOCBQUEUED;
@@ -1089,3 +1250,203 @@ out_free_dio:
return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
+
+/* Swapfile activation */
+
+#ifdef CONFIG_SWAP
+struct iomap_swapfile_info {
+ struct iomap iomap; /* accumulated iomap */
+ struct swap_info_struct *sis;
+ uint64_t lowest_ppage; /* lowest physical addr seen (pages) */
+ uint64_t highest_ppage; /* highest physical addr seen (pages) */
+ unsigned long nr_pages; /* number of pages collected */
+ int nr_extents; /* extent count */
+};
+
+/*
+ * Collect physical extents for this swap file. Physical extents reported to
+ * the swap code must be trimmed to align to a page boundary. The logical
+ * offset within the file is irrelevant since the swapfile code maps logical
+ * page numbers of the swap device to the physical page-aligned extents.
+ */
+static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
+{
+ struct iomap *iomap = &isi->iomap;
+ unsigned long nr_pages;
+ uint64_t first_ppage;
+ uint64_t first_ppage_reported;
+ uint64_t next_ppage;
+ int error;
+
+ /*
+ * Round the start up and the end down so that the physical
+ * extent aligns to a page boundary.
+ */
+ first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
+ next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
+ PAGE_SHIFT;
+
+ /* Skip too-short physical extents. */
+ if (first_ppage >= next_ppage)
+ return 0;
+ nr_pages = next_ppage - first_ppage;
+
+ /*
+ * Calculate how much swap space we're adding; the first page contains
+ * the swap header and doesn't count. The mm still wants that first
+ * page fed to add_swap_extent, however.
+ */
+ first_ppage_reported = first_ppage;
+ if (iomap->offset == 0)
+ first_ppage_reported++;
+ if (isi->lowest_ppage > first_ppage_reported)
+ isi->lowest_ppage = first_ppage_reported;
+ if (isi->highest_ppage < (next_ppage - 1))
+ isi->highest_ppage = next_ppage - 1;
+
+ /* Add extent, set up for the next call. */
+ error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage);
+ if (error < 0)
+ return error;
+ isi->nr_extents += error;
+ isi->nr_pages += nr_pages;
+ return 0;
+}
+
+/*
+ * Accumulate iomaps for this swap file. We have to accumulate iomaps because
+ * swap only cares about contiguous page-aligned physical extents and makes no
+ * distinction between written and unwritten extents.
+ */
+static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
+ loff_t count, void *data, struct iomap *iomap)
+{
+ struct iomap_swapfile_info *isi = data;
+ int error;
+
+ switch (iomap->type) {
+ case IOMAP_MAPPED:
+ case IOMAP_UNWRITTEN:
+ /* Only real or unwritten extents. */
+ break;
+ case IOMAP_INLINE:
+ /* No inline data. */
+ pr_err("swapon: file is inline\n");
+ return -EINVAL;
+ default:
+ pr_err("swapon: file has unallocated extents\n");
+ return -EINVAL;
+ }
+
+ /* No uncommitted metadata or shared blocks. */
+ if (iomap->flags & IOMAP_F_DIRTY) {
+ pr_err("swapon: file is not committed\n");
+ return -EINVAL;
+ }
+ if (iomap->flags & IOMAP_F_SHARED) {
+ pr_err("swapon: file has shared extents\n");
+ return -EINVAL;
+ }
+
+ /* Only one bdev per swap file. */
+ if (iomap->bdev != isi->sis->bdev) {
+ pr_err("swapon: file is on multiple devices\n");
+ return -EINVAL;
+ }
+
+ if (isi->iomap.length == 0) {
+ /* No accumulated extent, so just store it. */
+ memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
+ } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
+ /* Append this to the accumulated extent. */
+ isi->iomap.length += iomap->length;
+ } else {
+ /* Otherwise, add the retained iomap and store this one. */
+ error = iomap_swapfile_add_extent(isi);
+ if (error)
+ return error;
+ memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
+ }
+ return count;
+}
+
+/*
+ * Iterate a swap file's iomaps to construct physical extents that can be
+ * passed to the swapfile subsystem.
+ */
+int iomap_swapfile_activate(struct swap_info_struct *sis,
+ struct file *swap_file, sector_t *pagespan,
+ const struct iomap_ops *ops)
+{
+ struct iomap_swapfile_info isi = {
+ .sis = sis,
+ .lowest_ppage = (sector_t)-1ULL,
+ };
+ struct address_space *mapping = swap_file->f_mapping;
+ struct inode *inode = mapping->host;
+ loff_t pos = 0;
+ loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE);
+ loff_t ret;
+
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret)
+ return ret;
+
+ while (len > 0) {
+ ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
+ ops, &isi, iomap_swapfile_activate_actor);
+ if (ret <= 0)
+ return ret;
+
+ pos += ret;
+ len -= ret;
+ }
+
+ if (isi.iomap.length) {
+ ret = iomap_swapfile_add_extent(&isi);
+ if (ret)
+ return ret;
+ }
+
+ *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
+ sis->max = isi.nr_pages;
+ sis->pages = isi.nr_pages - 1;
+ sis->highest_bit = isi.nr_pages - 1;
+ return isi.nr_extents;
+}
+EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
+#endif /* CONFIG_SWAP */
+
+static loff_t
+iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
+ void *data, struct iomap *iomap)
+{
+ sector_t *bno = data, addr;
+
+ if (iomap->type == IOMAP_MAPPED) {
+ addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits;
+ if (addr > INT_MAX)
+ WARN(1, "would truncate bmap result\n");
+ else
+ *bno = addr;
+ }
+ return 0;
+}
+
+/* legacy ->bmap interface: map file block @bno to a disk block. 0 is the error return (!) */
+sector_t
+iomap_bmap(struct address_space *mapping, sector_t bno,
+ const struct iomap_ops *ops)
+{
+ struct inode *inode = mapping->host;
+ loff_t pos = (loff_t)bno << inode->i_blkbits; /* block number -> byte offset; widen before shifting */
+ unsigned blocksize = i_blocksize(inode);
+
+ if (filemap_write_and_wait(mapping)) /* flush dirty pages before querying the mapping */
+ return 0;
+
+ bno = 0; /* stays 0 unless the actor finds an IOMAP_MAPPED extent */
+ iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
+ return bno;
+}
+EXPORT_SYMBOL_GPL(iomap_bmap);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index dfb057900e79..8ef6b6daaa7a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -114,7 +114,7 @@ void __jbd2_debug(int level, const char *file, const char *func,
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
+ printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
va_end(args);
}
EXPORT_SYMBOL(__jbd2_debug);
@@ -2302,8 +2302,7 @@ static void jbd2_journal_destroy_slabs(void)
int i;
for (i = 0; i < JBD2_MAX_SLABS; i++) {
- if (jbd2_slab[i])
- kmem_cache_destroy(jbd2_slab[i]);
+ kmem_cache_destroy(jbd2_slab[i]);
jbd2_slab[i] = NULL;
}
}
@@ -2404,10 +2403,8 @@ static int jbd2_journal_init_journal_head_cache(void)
static void jbd2_journal_destroy_journal_head_cache(void)
{
- if (jbd2_journal_head_cache) {
- kmem_cache_destroy(jbd2_journal_head_cache);
- jbd2_journal_head_cache = NULL;
- }
+ kmem_cache_destroy(jbd2_journal_head_cache);
+ jbd2_journal_head_cache = NULL;
}
/*
@@ -2665,11 +2662,10 @@ static int __init jbd2_journal_init_handle_cache(void)
static void jbd2_journal_destroy_handle_cache(void)
{
- if (jbd2_handle_cache)
- kmem_cache_destroy(jbd2_handle_cache);
- if (jbd2_inode_cache)
- kmem_cache_destroy(jbd2_inode_cache);
-
+ kmem_cache_destroy(jbd2_handle_cache);
+ jbd2_handle_cache = NULL;
+ kmem_cache_destroy(jbd2_inode_cache);
+ jbd2_inode_cache = NULL;
}
/*
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 696ef15ec942..240779e4689c 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -180,14 +180,10 @@ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal,
void jbd2_journal_destroy_revoke_caches(void)
{
- if (jbd2_revoke_record_cache) {
- kmem_cache_destroy(jbd2_revoke_record_cache);
- jbd2_revoke_record_cache = NULL;
- }
- if (jbd2_revoke_table_cache) {
- kmem_cache_destroy(jbd2_revoke_table_cache);
- jbd2_revoke_table_cache = NULL;
- }
+ kmem_cache_destroy(jbd2_revoke_record_cache);
+ jbd2_revoke_record_cache = NULL;
+ kmem_cache_destroy(jbd2_revoke_table_cache);
+ jbd2_revoke_table_cache = NULL;
}
int __init jbd2_journal_init_revoke_caches(void)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 8aa453784402..51dd68e67b0f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -49,10 +49,8 @@ int __init jbd2_journal_init_transaction_cache(void)
void jbd2_journal_destroy_transaction_cache(void)
{
- if (transaction_cache) {
- kmem_cache_destroy(transaction_cache);
- transaction_cache = NULL;
- }
+ kmem_cache_destroy(transaction_cache);
+ transaction_cache = NULL;
}
void jbd2_journal_free_transaction(transaction_t *transaction)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index fd5ce883072e..2015d8c45e4a 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -348,11 +348,11 @@ static void kernfs_vma_open(struct vm_area_struct *vma)
kernfs_put_active(of->kn);
}
-static int kernfs_vma_fault(struct vm_fault *vmf)
+static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
struct kernfs_open_file *of = kernfs_of(file);
- int ret;
+ vm_fault_t ret;
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
@@ -368,11 +368,11 @@ static int kernfs_vma_fault(struct vm_fault *vmf)
return ret;
}
-static int kernfs_vma_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf)
{
struct file *file = vmf->vma->vm_file;
struct kernfs_open_file *of = kernfs_of(file);
- int ret;
+ vm_fault_t ret;
if (!of->vm_ops)
return VM_FAULT_SIGBUS;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 97a972efab83..68728de12864 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -788,35 +788,34 @@ static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres,
spin_unlock(&lockres->l_lock);
}
-static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres,
- struct ocfs2_lock_holder *oh)
-{
- spin_lock(&lockres->l_lock);
- list_del(&oh->oh_list);
- spin_unlock(&lockres->l_lock);
-
- put_pid(oh->oh_owner_pid);
-}
-
-static inline int ocfs2_is_locked_by_me(struct ocfs2_lock_res *lockres)
+static struct ocfs2_lock_holder *
+ocfs2_pid_holder(struct ocfs2_lock_res *lockres,
+ struct pid *pid)
{
struct ocfs2_lock_holder *oh;
- struct pid *pid;
- /* look in the list of holders for one with the current task as owner */
spin_lock(&lockres->l_lock);
- pid = task_pid(current);
list_for_each_entry(oh, &lockres->l_holders, oh_list) {
if (oh->oh_owner_pid == pid) {
spin_unlock(&lockres->l_lock);
- return 1;
+ return oh;
}
}
spin_unlock(&lockres->l_lock);
+ return NULL;
+}
- return 0;
+static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres,
+ struct ocfs2_lock_holder *oh)
+{
+ spin_lock(&lockres->l_lock);
+ list_del(&oh->oh_list);
+ spin_unlock(&lockres->l_lock);
+
+ put_pid(oh->oh_owner_pid);
}
+
static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
int level)
{
@@ -2610,34 +2609,93 @@ void ocfs2_inode_unlock(struct inode *inode,
*
* return < 0 on error, return == 0 if there's no lock holder on the stack
* before this call, return == 1 if this call would be a recursive locking.
+ * return == -EINVAL if this lock attempt would cause a forbidden upgrade.
+ *
+ * When taking lock levels into account, we face some different situations.
+ *
+ * 1. no lock is held
+ * In this case, just lock the inode as requested and return 0
+ *
+ * 2. We are holding a lock
+ * For this situation, things diverge into several cases
+ *
+ * wanted holding what to do
+ * ex ex see 2.1 below
+ * ex pr see 2.2 below
+ * pr ex see 2.1 below
+ * pr pr see 2.1 below
+ *
+ * 2.1 The lock level that is being held is compatible
+ * with the wanted level, so no lock action will be taken.
+ *
+ * 2.2 Otherwise, an upgrade is needed, but it is forbidden.
+ *
+ * The reason why an upgrade within a process is forbidden is that
+ * a lock upgrade may cause a deadlock. The following illustrates
+ * how it happens.
+ *
+ * thread on node1 thread on node2
+ * ocfs2_inode_lock_tracker(ex=0)
+ *
+ * <====== ocfs2_inode_lock_tracker(ex=1)
+ *
+ * ocfs2_inode_lock_tracker(ex=1)
*/
int ocfs2_inode_lock_tracker(struct inode *inode,
struct buffer_head **ret_bh,
int ex,
struct ocfs2_lock_holder *oh)
{
- int status;
- int arg_flags = 0, has_locked;
+ int status = 0;
struct ocfs2_lock_res *lockres;
+ struct ocfs2_lock_holder *tmp_oh;
+ struct pid *pid = task_pid(current);
+
lockres = &OCFS2_I(inode)->ip_inode_lockres;
- has_locked = ocfs2_is_locked_by_me(lockres);
- /* Just get buffer head if the cluster lock has been taken */
- if (has_locked)
- arg_flags = OCFS2_META_LOCK_GETBH;
+ tmp_oh = ocfs2_pid_holder(lockres, pid);
- if (likely(!has_locked || ret_bh)) {
- status = ocfs2_inode_lock_full(inode, ret_bh, ex, arg_flags);
+ if (!tmp_oh) {
+ /*
+ * This corresponds to the case 1.
+ * We haven't got any lock before.
+ */
+ status = ocfs2_inode_lock_full(inode, ret_bh, ex, 0);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
return status;
}
- }
- if (!has_locked)
+
+ oh->oh_ex = ex;
ocfs2_add_holder(lockres, oh);
+ return 0;
+ }
- return has_locked;
+ if (unlikely(ex && !tmp_oh->oh_ex)) {
+ /*
+ * case 2.2 upgrade may cause dead lock, forbid it.
+ */
+ mlog(ML_ERROR, "Recursive locking is not permitted to "
+ "upgrade to EX level from PR level.\n");
+ dump_stack();
+ return -EINVAL;
+ }
+
+ /*
+ * case 2.1 OCFS2_META_LOCK_GETBH flag makes ocfs2_inode_lock_full()
+ * ignore the lock level and just update it.
+ */
+ if (ret_bh) {
+ status = ocfs2_inode_lock_full(inode, ret_bh, ex,
+ OCFS2_META_LOCK_GETBH);
+ if (status < 0) {
+ if (status != -ENOENT)
+ mlog_errno(status);
+ return status;
+ }
+ }
+ return tmp_oh ? 1 : 0;
}
void ocfs2_inode_unlock_tracker(struct inode *inode,
@@ -2649,12 +2707,13 @@ void ocfs2_inode_unlock_tracker(struct inode *inode,
lockres = &OCFS2_I(inode)->ip_inode_lockres;
/* had_lock means that the currect process already takes the cluster
- * lock previously. If had_lock is 1, we have nothing to do here, and
- * it will get unlocked where we got the lock.
+ * lock previously.
+ * If had_lock is 1, we have nothing to do here.
+ * If had_lock is 0, we will release the lock.
*/
if (!had_lock) {
+ ocfs2_inode_unlock(inode, oh->oh_ex);
ocfs2_remove_holder(lockres, oh);
- ocfs2_inode_unlock(inode, ex);
}
}
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 256e0a9067b8..4ec1c828f6e0 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -96,6 +96,7 @@ struct ocfs2_trim_fs_info {
struct ocfs2_lock_holder {
struct list_head oh_list;
struct pid *oh_owner_pid;
+ int oh_ex;
};
/* ocfs2_inode_lock_full() 'arg_flags' flags */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6ee94bc23f5b..a2a8603d27e0 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -563,8 +563,8 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
return ret;
}
-static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
- u32 clusters_to_add, int mark_unwritten)
+static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+ u32 clusters_to_add, int mark_unwritten)
{
int status = 0;
int restart_func = 0;
@@ -1035,8 +1035,8 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
clusters_to_add -= oi->ip_clusters;
if (clusters_to_add) {
- ret = __ocfs2_extend_allocation(inode, oi->ip_clusters,
- clusters_to_add, 0);
+ ret = ocfs2_extend_allocation(inode, oi->ip_clusters,
+ clusters_to_add, 0);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1493,7 +1493,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
goto next;
}
- ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
+ ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
if (ret) {
if (ret != -ENOSPC)
mlog_errno(ret);
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 1fdc9839cd93..7eb7f03531f6 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -65,8 +65,6 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
u64 new_i_size, u64 zero_to);
int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
loff_t zero_to);
-int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
- u32 clusters_to_add, int mark_unwritten);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index ab30c005cc4b..994726ada857 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -402,7 +402,7 @@ out_err:
static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist,
unsigned int chunksize)
{
- int index;
+ u32 index;
index = __ilog2_u32(chunksize);
if (index >= OCFS2_INFO_MAX_HIST)
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index fb9a20e3d608..05220b365fb9 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -44,11 +44,11 @@
#include "ocfs2_trace.h"
-static int ocfs2_fault(struct vm_fault *vmf)
+static vm_fault_t ocfs2_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
sigset_t oldset;
- int ret;
+ vm_fault_t ret;
ocfs2_block_signals(&oldset);
ret = filemap_fault(vmf);
@@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_fault *vmf)
return ret;
}
-static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
- struct page *page)
+static vm_fault_t __ocfs2_page_mkwrite(struct file *file,
+ struct buffer_head *di_bh, struct page *page)
{
- int ret = VM_FAULT_NOPAGE;
+ int err;
+ vm_fault_t ret = VM_FAULT_NOPAGE;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
loff_t pos = page_offset(page);
@@ -105,15 +106,12 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
if (page->index == last_index)
len = ((size - 1) & ~PAGE_MASK) + 1;
- ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
+ err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
&locked_page, &fsdata, di_bh, page);
- if (ret) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
- if (ret == -ENOMEM)
- ret = VM_FAULT_OOM;
- else
- ret = VM_FAULT_SIGBUS;
+ if (err) {
+ if (err != -ENOSPC)
+ mlog_errno(err);
+ ret = vmf_error(err);
goto out;
}
@@ -121,20 +119,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
ret = VM_FAULT_NOPAGE;
goto out;
}
- ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata);
- BUG_ON(ret != len);
+ err = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata);
+ BUG_ON(err != len);
ret = VM_FAULT_LOCKED;
out:
return ret;
}
-static int ocfs2_page_mkwrite(struct vm_fault *vmf)
+static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct buffer_head *di_bh = NULL;
sigset_t oldset;
- int ret;
+ int err;
+ vm_fault_t ret;
sb_start_pagefault(inode->i_sb);
ocfs2_block_signals(&oldset);
@@ -144,13 +143,10 @@ static int ocfs2_page_mkwrite(struct vm_fault *vmf)
* node. Taking the data lock will also ensure that we don't
* attempt page truncation as part of a downconvert.
*/
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret < 0) {
- mlog_errno(ret);
- if (ret == -ENOMEM)
- ret = VM_FAULT_OOM;
- else
- ret = VM_FAULT_SIGBUS;
+ err = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (err < 0) {
+ mlog_errno(err);
+ ret = vmf_error(err);
goto out;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8dd6f703c819..b7ca84bc3df7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2332,8 +2332,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
struct buffer_head *orphan_dir_bh,
bool dio)
{
- const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN;
- char name[namelen + 1];
+ char name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1];
struct ocfs2_dinode *orphan_fe;
int status = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 5bb4a89f9045..7071ad0dec90 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -807,11 +807,11 @@ struct ocfs2_dir_block_trailer {
* in this block. (unused) */
/*10*/ __u8 db_signature[8]; /* Signature for verification */
__le64 db_reserved2;
- __le64 db_free_next; /* Next block in list (unused) */
-/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */
- __le64 db_parent_dinode; /* dinode which owns me, in
+/*20*/ __le64 db_free_next; /* Next block in list (unused) */
+ __le64 db_blkno; /* Offset on disk, in blocks */
+/*30*/ __le64 db_parent_dinode; /* dinode which owns me, in
blocks */
-/*30*/ struct ocfs2_block_check db_check; /* Error checking */
+ struct ocfs2_block_check db_check; /* Error checking */
/*40*/
};
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index 66369ec90020..74b37cbbd5d4 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -281,14 +281,17 @@ restart:
ret = copy_to_user(buf, &proto_ver, sizeof(__s32));
if (ret != 0)
goto error;
- ret = copy_to_user(buf+sizeof(__s32), &magic, sizeof(__s32));
+ ret = copy_to_user(buf + sizeof(__s32), &magic, sizeof(__s32));
if (ret != 0)
goto error;
- ret = copy_to_user(buf+2 * sizeof(__s32), &cur_op->tag, sizeof(__u64));
+ ret = copy_to_user(buf + 2 * sizeof(__s32),
+ &cur_op->tag,
+ sizeof(__u64));
if (ret != 0)
goto error;
- ret = copy_to_user(buf+2*sizeof(__s32)+sizeof(__u64), &cur_op->upcall,
- sizeof(struct orangefs_upcall_s));
+ ret = copy_to_user(buf + 2 * sizeof(__s32) + sizeof(__u64),
+ &cur_op->upcall,
+ sizeof(struct orangefs_upcall_s));
if (ret != 0)
goto error;
@@ -381,7 +384,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
(unsigned int) MAX_DEV_REQ_DOWNSIZE);
return -EFAULT;
}
-
+
if (!copy_from_iter_full(&head, head_size, iter)) {
gossip_err("%s: failed to copy head.\n", __func__);
return -EFAULT;
@@ -426,7 +429,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
goto wakeup;
/*
- * We've successfully peeled off the head and the downcall.
+ * We've successfully peeled off the head and the downcall.
* Something has gone awry if total doesn't equal the
* sum of head_size, downcall_size and trailer_size.
*/
@@ -477,7 +480,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
wakeup:
/*
* Return to vfs waitqueue, and back to service_operation
- * through wait_for_matching_downcall.
+ * through wait_for_matching_downcall.
*/
spin_lock(&op->lock);
if (unlikely(op_is_cancel(op))) {
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 26358efbf794..db0b52187cbc 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -162,7 +162,7 @@ populate_shared_memory:
else
ret = 0;
break;
- /*
+ /*
* If the op was in progress when the interrupt
* occurred, then the client-core was able to
* trigger the write.
@@ -544,7 +544,7 @@ static int orangefs_fault(struct vm_fault *vmf)
return filemap_fault(vmf);
}
-const struct vm_operations_struct orangefs_file_vm_ops = {
+static const struct vm_operations_struct orangefs_file_vm_ops = {
.fault = orangefs_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = filemap_page_mkwrite,
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 79c61da8b1bc..d6db252e6200 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -20,8 +20,8 @@ static int read_one_page(struct page *page)
int max_block;
ssize_t bytes_read = 0;
struct inode *inode = page->mapping->host;
- const __u32 blocksize = PAGE_SIZE; /* inode->i_blksize */
- const __u32 blockbits = PAGE_SHIFT; /* inode->i_blkbits */
+ const __u32 blocksize = PAGE_SIZE;
+ const __u32 blockbits = PAGE_SHIFT;
struct iov_iter to;
struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE};
@@ -181,16 +181,15 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
new_op->upcall.req.truncate.refn = orangefs_inode->refn;
new_op->upcall.req.truncate.size = (__s64) iattr->ia_size;
- ret = service_operation(new_op, __func__,
- get_interruptible_flag(inode));
+ ret = service_operation(new_op,
+ __func__,
+ get_interruptible_flag(inode));
/*
* the truncate has no downcall members to retrieve, but
* the status value tells us if it went through ok or not
*/
- gossip_debug(GOSSIP_INODE_DEBUG,
- "orangefs: orangefs_truncate got return value of %d\n",
- ret);
+ gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret);
op_release(new_op);
@@ -212,8 +211,9 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
struct inode *inode = dentry->d_inode;
gossip_debug(GOSSIP_INODE_DEBUG,
- "orangefs_setattr: called on %pd\n",
- dentry);
+ "%s: called on %pd\n",
+ __func__,
+ dentry);
ret = setattr_prepare(dentry, iattr);
if (ret)
@@ -230,15 +230,16 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
ret = orangefs_inode_setattr(inode, iattr);
gossip_debug(GOSSIP_INODE_DEBUG,
- "orangefs_setattr: inode_setattr returned %d\n",
- ret);
+ "%s: orangefs_inode_setattr returned %d\n",
+ __func__,
+ ret);
if (!ret && (iattr->ia_valid & ATTR_MODE))
/* change mod on a file that has ACLs */
ret = posix_acl_chmod(inode, inode->i_mode);
out:
- gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", ret);
+ gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret);
return ret;
}
@@ -262,13 +263,19 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
/* override block size reported to stat */
orangefs_inode = ORANGEFS_I(inode);
- stat->blksize = orangefs_inode->blksize;
if (request_mask & STATX_SIZE)
stat->result_mask = STATX_BASIC_STATS;
else
stat->result_mask = STATX_BASIC_STATS &
~STATX_SIZE;
+
+ stat->attributes_mask = STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_APPEND;
+ if (inode->i_flags & S_IMMUTABLE)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (inode->i_flags & S_APPEND)
+ stat->attributes |= STATX_ATTR_APPEND;
}
return ret;
}
@@ -306,7 +313,7 @@ int orangefs_update_time(struct inode *inode, struct timespec *time, int flags)
return orangefs_inode_setattr(inode, &iattr);
}
-/* ORANGEDS2 implementation of VFS inode operations for files */
+/* ORANGEFS2 implementation of VFS inode operations for files */
static const struct inode_operations orangefs_file_inode_operations = {
.get_acl = orangefs_get_acl,
.set_acl = orangefs_set_acl,
@@ -325,7 +332,6 @@ static int orangefs_init_iops(struct inode *inode)
case S_IFREG:
inode->i_op = &orangefs_file_inode_operations;
inode->i_fop = &orangefs_file_operations;
- inode->i_blkbits = PAGE_SHIFT;
break;
case S_IFLNK:
inode->i_op = &orangefs_symlink_inode_operations;
@@ -345,8 +351,8 @@ static int orangefs_init_iops(struct inode *inode)
}
/*
- * Given a ORANGEFS object identifier (fsid, handle), convert it into a ino_t type
- * that will be used as a hash-index from where the handle will
+ * Given an ORANGEFS object identifier (fsid, handle), convert it into
+ * a ino_t type that will be used as a hash-index from where the handle will
* be searched for in the VFS hash table of inodes.
*/
static inline ino_t orangefs_handle_hash(struct orangefs_object_kref *ref)
@@ -376,8 +382,10 @@ static int orangefs_test_inode(struct inode *inode, void *data)
struct orangefs_inode_s *orangefs_inode = NULL;
orangefs_inode = ORANGEFS_I(inode);
- return (!ORANGEFS_khandle_cmp(&(orangefs_inode->refn.khandle), &(ref->khandle))
- && orangefs_inode->refn.fs_id == ref->fs_id);
+ /* test handles and fs_ids... */
+ return (!ORANGEFS_khandle_cmp(&(orangefs_inode->refn.khandle),
+ &(ref->khandle)) &&
+ orangefs_inode->refn.fs_id == ref->fs_id);
}
/*
@@ -385,16 +393,21 @@ static int orangefs_test_inode(struct inode *inode, void *data)
* file handle.
*
* @sb: the file system super block instance.
- * @ref: The ORANGEFS object for which we are trying to locate an inode structure.
+ * @ref: The ORANGEFS object for which we are trying to locate an inode.
*/
-struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref *ref)
+struct inode *orangefs_iget(struct super_block *sb,
+ struct orangefs_object_kref *ref)
{
struct inode *inode = NULL;
unsigned long hash;
int error;
hash = orangefs_handle_hash(ref);
- inode = iget5_locked(sb, hash, orangefs_test_inode, orangefs_set_inode, ref);
+ inode = iget5_locked(sb,
+ hash,
+ orangefs_test_inode,
+ orangefs_set_inode,
+ ref);
if (!inode || !(inode->i_state & I_NEW))
return inode;
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 365cd73d9109..625b0580f9be 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -278,6 +278,13 @@ static int orangefs_symlink(struct inode *dir,
ret = PTR_ERR(inode);
goto out;
}
+ /*
+ * This is necessary because orangefs_inode_getattr will not
+ * re-read symlink size as it is impossible for it to change.
+ * Invalidating the cache does not help. orangefs_new_inode
+ * does not set the correct size (it does not know symname).
+ */
+ inode->i_size = strlen(symname);
gossip_debug(GOSSIP_NAME_DEBUG,
"Assigned symlink inode new number of %pU\n",
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 4f927023d095..c4e98c9c1621 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -138,7 +138,7 @@ static int get(struct slot_map *m)
/* used to describe mapped buffers */
struct orangefs_bufmap_desc {
- void *uaddr; /* user space address pointer */
+ void __user *uaddr; /* user space address pointer */
struct page **page_array; /* array of mapped pages */
int array_count; /* size of above arrays */
struct list_head list_link;
@@ -184,7 +184,7 @@ orangefs_bufmap_free(struct orangefs_bufmap *bufmap)
}
/*
- * XXX: Can the size and shift change while the caller gives up the
+ * XXX: Can the size and shift change while the caller gives up the
* XXX: lock between calling this and doing something useful?
*/
@@ -215,20 +215,6 @@ int orangefs_bufmap_shift_query(void)
static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq);
static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq);
-/*
- * orangefs_get_bufmap_init
- *
- * If bufmap_init is 1, then the shared memory system, including the
- * buffer_index_array, is available. Otherwise, it is not.
- *
- * returns the value of bufmap_init
- */
-int orangefs_get_bufmap_init(void)
-{
- return __orangefs_bufmap ? 1 : 0;
-}
-
-
static struct orangefs_bufmap *
orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc)
{
@@ -496,7 +482,7 @@ void orangefs_readdir_index_put(int buffer_index)
}
/*
- * we've been handed an iovec, we need to copy it to
+ * we've been handed an iovec, we need to copy it to
* the shared memory descriptor at "buffer_index".
*/
int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter,
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 6e35f2f3c897..0732cb08173e 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -114,7 +114,7 @@ static const struct seq_operations help_debug_ops = {
.show = help_show,
};
-const struct file_operations debug_help_fops = {
+static const struct file_operations debug_help_fops = {
.owner = THIS_MODULE,
.open = orangefs_debug_help_open,
.read = seq_read,
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index c29bb0ebc6bb..004511617b6d 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -182,7 +182,6 @@ static inline void set_op_state_purged(struct orangefs_kernel_op_s *op)
struct orangefs_inode_s {
struct orangefs_object_kref refn;
char link_target[ORANGEFS_NAME_MAX];
- __s64 blksize;
/*
* Reading/Writing Extended attributes need to acquire the appropriate
* reader/writer semaphore on the orangefs_inode_s structure.
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 00fadaf0da8f..804c8a261e4b 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -183,9 +183,9 @@ static inline int copy_attributes_from_inode(struct inode *inode,
attrs->mask |= ORANGEFS_ATTR_SYS_CTIME;
/*
- * ORANGEFS cannot set size with a setattr operation. Probably not likely
- * to be requested through the VFS, but just in case, don't worry about
- * ATTR_SIZE
+ * ORANGEFS cannot set size with a setattr operation. Probably not
+ * likely to be requested through the VFS, but just in case, don't
+ * worry about ATTR_SIZE
*/
if (iattr->ia_valid & ATTR_MODE) {
@@ -200,14 +200,16 @@ static inline int copy_attributes_from_inode(struct inode *inode,
tmp_mode -= S_ISVTX;
} else {
gossip_debug(GOSSIP_UTILS_DEBUG,
- "User attempted to set sticky bit on non-root directory; returning EINVAL.\n");
+ "%s: setting sticky bit not supported.\n",
+ __func__);
return -EINVAL;
}
}
if (tmp_mode & (S_ISUID)) {
gossip_debug(GOSSIP_UTILS_DEBUG,
- "Attempting to set setuid bit (not supported); returning EINVAL.\n");
+ "%s: setting setuid bit not supported.\n",
+ __func__);
return -EINVAL;
}
@@ -275,7 +277,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op;
- loff_t inode_size, rounded_up_size;
+ loff_t inode_size;
int ret, type;
gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
@@ -330,22 +332,19 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
if (request_mask & STATX_SIZE || new) {
inode_size = (loff_t)new_op->
downcall.resp.getattr.attributes.size;
- rounded_up_size =
- (inode_size + (4096 - (inode_size % 4096)));
inode->i_size = inode_size;
- orangefs_inode->blksize =
- new_op->downcall.resp.getattr.attributes.blksize;
+ inode->i_blkbits = ffs(new_op->downcall.resp.getattr.
+ attributes.blksize);
spin_lock(&inode->i_lock);
inode->i_bytes = inode_size;
inode->i_blocks =
- (unsigned long)(rounded_up_size / 512);
+ (inode_size + 512 - inode_size % 512)/512;
spin_unlock(&inode->i_lock);
}
break;
case S_IFDIR:
if (request_mask & STATX_SIZE || new) {
inode->i_size = PAGE_SIZE;
- orangefs_inode->blksize = i_blocksize(inode);
spin_lock(&inode->i_lock);
inode_set_bytes(inode, inode->i_size);
spin_unlock(&inode->i_lock);
@@ -356,7 +355,6 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
if (new) {
inode->i_size = (loff_t)strlen(new_op->
downcall.resp.getattr.link_target);
- orangefs_inode->blksize = i_blocksize(inode);
ret = strscpy(orangefs_inode->link_target,
new_op->downcall.resp.getattr.link_target,
ORANGEFS_NAME_MAX);
@@ -525,7 +523,9 @@ int orangefs_normalize_to_errno(__s32 error_code)
error_code = -ETIMEDOUT;
} else {
/* assume a default error code */
- gossip_err("orangefs: warning: got error code without errno equivalent: %d.\n", error_code);
+ gossip_err("%s: bad error code :%d:.\n",
+ __func__,
+ error_code);
error_code = -EINVAL;
}
@@ -542,7 +542,7 @@ int orangefs_normalize_to_errno(__s32 error_code)
* there is a bug somewhere.
*/
} else {
- gossip_err("orangefs: orangefs_normalize_to_errno: got error code which is not from ORANGEFS.\n");
+ gossip_err("%s: unknown error code.\n", __func__);
error_code = -EINVAL;
}
return error_code;
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
index 61ee8d64c842..d403cf29a99b 100644
--- a/fs/orangefs/protocol.h
+++ b/fs/orangefs/protocol.h
@@ -342,7 +342,7 @@ enum {
* that may be 32 bit!
*/
struct ORANGEFS_dev_map_desc {
- void *ptr;
+ void __user *ptr;
__s32 total_size;
__s32 size;
__s32 count;
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 10796d3fe27d..dfaee90d30bd 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -156,9 +156,10 @@ static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf)
sb = dentry->d_sb;
gossip_debug(GOSSIP_SUPER_DEBUG,
- "orangefs_statfs: called on sb %p (fs_id is %d)\n",
- sb,
- (int)(ORANGEFS_SB(sb)->fs_id));
+ "%s: called on sb %p (fs_id is %d)\n",
+ __func__,
+ sb,
+ (int)(ORANGEFS_SB(sb)->fs_id));
new_op = op_alloc(ORANGEFS_VFS_OP_STATFS);
if (!new_op)
@@ -198,7 +199,7 @@ static int orangefs_statfs(struct dentry *dentry, struct kstatfs *buf)
out_op_release:
op_release(new_op);
- gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_statfs: returning %d\n", ret);
+ gossip_debug(GOSSIP_SUPER_DEBUG, "%s: returning %d\n", __func__, ret);
return ret;
}
@@ -423,8 +424,8 @@ static int orangefs_fill_sb(struct super_block *sb,
sb->s_op = &orangefs_s_ops;
sb->s_d_op = &orangefs_dentry_operations;
- sb->s_blocksize = orangefs_bufmap_size_query();
- sb->s_blocksize_bits = orangefs_bufmap_shift_query();
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_maxbytes = MAX_LFS_FILESIZE;
root_object.khandle = ORANGEFS_SB(sb)->root_khandle;
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
index 0577d6dba8c8..0729d2645d6a 100644
--- a/fs/orangefs/waitqueue.c
+++ b/fs/orangefs/waitqueue.c
@@ -17,8 +17,12 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
-static int wait_for_matching_downcall(struct orangefs_kernel_op_s *, long, bool);
-static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *);
+static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op,
+ long timeout,
+ bool interruptible)
+ __acquires(op->lock);
+static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op)
+ __releases(op->lock);
/*
* What we do in this function is to walk the list of operations that are
@@ -246,6 +250,7 @@ bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op)
*/
static void
orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op)
+ __releases(op->lock)
{
/*
* handle interrupted cases depending on what state we were in when
@@ -313,8 +318,9 @@ static void
* Returns with op->lock taken.
*/
static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op,
- long timeout,
- bool interruptible)
+ long timeout,
+ bool interruptible)
+ __acquires(op->lock)
{
long n;
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index 17032631c5cf..9384164253ac 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -11,7 +11,7 @@ config OVERLAY_FS
For more information see Documentation/filesystems/overlayfs.txt
config OVERLAY_FS_REDIRECT_DIR
- bool "Overlayfs: turn on redirect dir feature by default"
+ bool "Overlayfs: turn on redirect directory feature by default"
depends on OVERLAY_FS
help
If this config option is enabled then overlay filesystems will use
@@ -46,7 +46,7 @@ config OVERLAY_FS_INDEX
depends on OVERLAY_FS
help
If this config option is enabled then overlay filesystems will use
- the inodes index dir to map lower inodes to upper inodes by default.
+ the index directory to map lower inodes to upper inodes by default.
In this case it is still possible to turn off index globally with the
"index=off" module option or on a filesystem instance basis with the
"index=off" mount option.
@@ -66,7 +66,7 @@ config OVERLAY_FS_NFS_EXPORT
depends on OVERLAY_FS_INDEX
help
If this config option is enabled then overlay filesystems will use
- the inodes index dir to decode overlay NFS file handles by default.
+ the index directory to decode overlay NFS file handles by default.
In this case, it is still possible to turn off NFS export support
globally with the "nfs_export=off" module option or on a filesystem
instance basis with the "nfs_export=off" mount option.
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 8bede0742619..ddaddb4ce4c3 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -365,17 +365,14 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
if (err)
return err;
- temp = ovl_lookup_temp(indexdir);
+ temp = ovl_create_temp(indexdir, OVL_CATTR(S_IFDIR | 0));
+ err = PTR_ERR(temp);
if (IS_ERR(temp))
- goto temp_err;
-
- err = ovl_do_mkdir(dir, temp, S_IFDIR, true);
- if (err)
- goto out;
+ goto free_name;
err = ovl_set_upper_fh(upper, temp);
if (err)
- goto out_cleanup;
+ goto out;
index = lookup_one_len(name.name, indexdir, name.len);
if (IS_ERR(index)) {
@@ -384,23 +381,13 @@ static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
err = ovl_do_rename(dir, temp, dir, index, 0);
dput(index);
}
-
- if (err)
- goto out_cleanup;
-
out:
+ if (err)
+ ovl_cleanup(dir, temp);
dput(temp);
+free_name:
kfree(name.name);
return err;
-
-temp_err:
- err = PTR_ERR(temp);
- temp = NULL;
- goto out;
-
-out_cleanup:
- ovl_cleanup(dir, temp);
- goto out;
}
struct ovl_copy_up_ctx {
@@ -439,8 +426,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
c->dentry->d_name.len);
err = PTR_ERR(upper);
if (!IS_ERR(upper)) {
- err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper,
- true);
+ err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper);
dput(upper);
if (!err) {
@@ -470,7 +456,7 @@ static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp,
return PTR_ERR(upper);
if (c->tmpfile)
- err = ovl_do_link(temp, udir, upper, true);
+ err = ovl_do_link(temp, udir, upper);
else
err = ovl_do_rename(d_inode(c->workdir), temp, udir, upper, 0);
@@ -481,13 +467,13 @@ static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp,
return err;
}
-static int ovl_get_tmpfile(struct ovl_copy_up_ctx *c, struct dentry **tempp)
+static struct dentry *ovl_get_tmpfile(struct ovl_copy_up_ctx *c)
{
int err;
struct dentry *temp;
const struct cred *old_creds = NULL;
struct cred *new_creds = NULL;
- struct cattr cattr = {
+ struct ovl_cattr cattr = {
/* Can't properly set mode on creation because of the umask */
.mode = c->stat.mode & S_IFMT,
.rdev = c->stat.rdev,
@@ -495,41 +481,24 @@ static int ovl_get_tmpfile(struct ovl_copy_up_ctx *c, struct dentry **tempp)
};
err = security_inode_copy_up(c->dentry, &new_creds);
+ temp = ERR_PTR(err);
if (err < 0)
goto out;
if (new_creds)
old_creds = override_creds(new_creds);
- if (c->tmpfile) {
+ if (c->tmpfile)
temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
- if (IS_ERR(temp))
- goto temp_err;
- } else {
- temp = ovl_lookup_temp(c->workdir);
- if (IS_ERR(temp))
- goto temp_err;
-
- err = ovl_create_real(d_inode(c->workdir), temp, &cattr,
- NULL, true);
- if (err) {
- dput(temp);
- goto out;
- }
- }
- err = 0;
- *tempp = temp;
+ else
+ temp = ovl_create_temp(c->workdir, &cattr);
out:
if (new_creds) {
revert_creds(old_creds);
put_cred(new_creds);
}
- return err;
-
-temp_err:
- err = PTR_ERR(temp);
- goto out;
+ return temp;
}
static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
@@ -579,21 +548,21 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
struct inode *udir = c->destdir->d_inode;
struct inode *inode;
struct dentry *newdentry = NULL;
- struct dentry *temp = NULL;
+ struct dentry *temp;
int err;
- err = ovl_get_tmpfile(c, &temp);
- if (err)
- goto out;
+ temp = ovl_get_tmpfile(c);
+ if (IS_ERR(temp))
+ return PTR_ERR(temp);
err = ovl_copy_up_inode(c, temp);
if (err)
- goto out_cleanup;
+ goto out;
if (S_ISDIR(c->stat.mode) && c->indexed) {
err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
if (err)
- goto out_cleanup;
+ goto out;
}
if (c->tmpfile) {
@@ -604,7 +573,7 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
err = ovl_install_temp(c, temp, &newdentry);
}
if (err)
- goto out_cleanup;
+ goto out;
inode = d_inode(c->dentry);
ovl_inode_update(inode, newdentry);
@@ -612,13 +581,11 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
ovl_set_flag(OVL_WHITEOUTS, inode);
out:
+ if (err && !c->tmpfile)
+ ovl_cleanup(d_inode(c->workdir), temp);
dput(temp);
return err;
-out_cleanup:
- if (!c->tmpfile)
- ovl_cleanup(d_inode(c->workdir), temp);
- goto out;
}
/*
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 839709c7803a..f480b1a2cd2e 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -43,7 +43,7 @@ int ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
return err;
}
-struct dentry *ovl_lookup_temp(struct dentry *workdir)
+static struct dentry *ovl_lookup_temp(struct dentry *workdir)
{
struct dentry *temp;
char name[20];
@@ -114,36 +114,72 @@ kill_whiteout:
goto out;
}
-int ovl_create_real(struct inode *dir, struct dentry *newdentry,
- struct cattr *attr, struct dentry *hardlink, bool debug)
+static int ovl_mkdir_real(struct inode *dir, struct dentry **newdentry,
+ umode_t mode)
{
int err;
+ struct dentry *d, *dentry = *newdentry;
+ err = ovl_do_mkdir(dir, dentry, mode);
+ if (err)
+ return err;
+
+ if (likely(!d_unhashed(dentry)))
+ return 0;
+
+ /*
+ * vfs_mkdir() may succeed and leave the dentry passed
+ * to it unhashed and negative. If that happens, try to
+ * lookup a new hashed and positive dentry.
+ */
+ d = lookup_one_len(dentry->d_name.name, dentry->d_parent,
+ dentry->d_name.len);
+ if (IS_ERR(d)) {
+ pr_warn("overlayfs: failed lookup after mkdir (%pd2, err=%i).\n",
+ dentry, err);
+ return PTR_ERR(d);
+ }
+ dput(dentry);
+ *newdentry = d;
+
+ return 0;
+}
+
+struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
+ struct ovl_cattr *attr)
+{
+ int err;
+
+ if (IS_ERR(newdentry))
+ return newdentry;
+
+ err = -ESTALE;
if (newdentry->d_inode)
- return -ESTALE;
+ goto out;
- if (hardlink) {
- err = ovl_do_link(hardlink, dir, newdentry, debug);
+ if (attr->hardlink) {
+ err = ovl_do_link(attr->hardlink, dir, newdentry);
} else {
switch (attr->mode & S_IFMT) {
case S_IFREG:
- err = ovl_do_create(dir, newdentry, attr->mode, debug);
+ err = ovl_do_create(dir, newdentry, attr->mode);
break;
case S_IFDIR:
- err = ovl_do_mkdir(dir, newdentry, attr->mode, debug);
+ /* mkdir is special... */
+ err = ovl_mkdir_real(dir, &newdentry, attr->mode);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
- err = ovl_do_mknod(dir, newdentry,
- attr->mode, attr->rdev, debug);
+ err = ovl_do_mknod(dir, newdentry, attr->mode,
+ attr->rdev);
break;
case S_IFLNK:
- err = ovl_do_symlink(dir, newdentry, attr->link, debug);
+ err = ovl_do_symlink(dir, newdentry, attr->link);
break;
default:
@@ -155,9 +191,20 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry,
* Not quite sure if non-instantiated dentry is legal or not.
* VFS doesn't seem to care so check and warn here.
*/
- err = -ENOENT;
+ err = -EIO;
}
- return err;
+out:
+ if (err) {
+ dput(newdentry);
+ return ERR_PTR(err);
+ }
+ return newdentry;
+}
+
+struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr)
+{
+ return ovl_create_real(d_inode(workdir), ovl_lookup_temp(workdir),
+ attr);
}
static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper,
@@ -182,24 +229,54 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
return ovl_set_opaque_xerr(dentry, upperdentry, -EIO);
}
-/* Common operations required to be done after creation of file on upper */
-static void ovl_instantiate(struct dentry *dentry, struct inode *inode,
- struct dentry *newdentry, bool hardlink)
+/*
+ * Common operations required to be done after creation of file on upper.
+ * If @hardlink is false, then @inode is a pre-allocated inode, we may or
+ * may not use to instantiate the new dentry.
+ */
+static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
+ struct dentry *newdentry, bool hardlink)
{
+ struct ovl_inode_params oip = {
+ .upperdentry = newdentry,
+ .newinode = inode,
+ };
+
ovl_dentry_version_inc(dentry->d_parent, false);
ovl_dentry_set_upper_alias(dentry);
if (!hardlink) {
- ovl_inode_update(inode, newdentry);
- ovl_copyattr(newdentry->d_inode, inode);
+ /*
+ * ovl_obtain_alias() can be called after ovl_create_real()
+ * and before we get here, so we may get an inode from cache
+ * with the same real upperdentry that is not the inode we
+ * pre-allocated. In this case we will use the cached inode
+ * to instantiate the new dentry.
+ *
+ * XXX: if we ever use ovl_obtain_alias() to decode directory
+ * file handles, need to use ovl_get_inode_locked() and
+ * d_instantiate_new() here to prevent from creating two
+ * hashed directory inode aliases.
+ */
+ inode = ovl_get_inode(dentry->d_sb, &oip);
+ if (WARN_ON(IS_ERR(inode)))
+ return PTR_ERR(inode);
} else {
WARN_ON(ovl_inode_real(inode) != d_inode(newdentry));
dput(newdentry);
inc_nlink(inode);
}
+
d_instantiate(dentry, inode);
+ if (inode != oip.newinode) {
+ pr_warn_ratelimited("overlayfs: newly created inode found in cache (%pd2)\n",
+ dentry);
+ }
+
/* Force lookup of new upper hardlink to find its lower */
if (hardlink)
d_drop(dentry);
+
+ return 0;
}
static bool ovl_type_merge(struct dentry *dentry)
@@ -213,38 +290,42 @@ static bool ovl_type_origin(struct dentry *dentry)
}
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
- struct cattr *attr, struct dentry *hardlink)
+ struct ovl_cattr *attr)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
- if (!hardlink && !IS_POSIXACL(udir))
+ if (!attr->hardlink && !IS_POSIXACL(udir))
attr->mode &= ~current_umask();
inode_lock_nested(udir, I_MUTEX_PARENT);
- newdentry = lookup_one_len(dentry->d_name.name, upperdir,
- dentry->d_name.len);
+ newdentry = ovl_create_real(udir,
+ lookup_one_len(dentry->d_name.name,
+ upperdir,
+ dentry->d_name.len),
+ attr);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
- err = ovl_create_real(udir, newdentry, attr, hardlink, false);
- if (err)
- goto out_dput;
if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry)) {
/* Setting opaque here is just an optimization, allow to fail */
ovl_set_opaque(dentry, newdentry);
}
- ovl_instantiate(dentry, inode, newdentry, !!hardlink);
- newdentry = NULL;
-out_dput:
- dput(newdentry);
+ err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink);
+ if (err)
+ goto out_cleanup;
out_unlock:
inode_unlock(udir);
return err;
+
+out_cleanup:
+ ovl_cleanup(udir, newdentry);
+ dput(newdentry);
+ goto out_unlock;
}
static struct dentry *ovl_clear_empty(struct dentry *dentry,
@@ -280,16 +361,11 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (upper->d_parent->d_inode != udir)
goto out_unlock;
- opaquedir = ovl_lookup_temp(workdir);
+ opaquedir = ovl_create_temp(workdir, OVL_CATTR(stat.mode));
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
- err = ovl_create_real(wdir, opaquedir,
- &(struct cattr){.mode = stat.mode}, NULL, true);
- if (err)
- goto out_dput;
-
err = ovl_copy_xattr(upper, opaquedir);
if (err)
goto out_cleanup;
@@ -319,7 +395,6 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
out_cleanup:
ovl_cleanup(wdir, opaquedir);
-out_dput:
dput(opaquedir);
out_unlock:
unlock_rename(workdir, upperdir);
@@ -354,8 +429,7 @@ out_free:
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
- struct cattr *cattr,
- struct dentry *hardlink)
+ struct ovl_cattr *cattr)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
@@ -365,6 +439,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct dentry *newdentry;
int err;
struct posix_acl *acl, *default_acl;
+ bool hardlink = !!cattr->hardlink;
if (WARN_ON(!workdir))
return -EROFS;
@@ -380,20 +455,16 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out;
- newdentry = ovl_lookup_temp(workdir);
- err = PTR_ERR(newdentry);
- if (IS_ERR(newdentry))
- goto out_unlock;
-
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
- goto out_dput;
+ goto out_unlock;
- err = ovl_create_real(wdir, newdentry, cattr, hardlink, true);
- if (err)
- goto out_dput2;
+ newdentry = ovl_create_temp(workdir, cattr);
+ err = PTR_ERR(newdentry);
+ if (IS_ERR(newdentry))
+ goto out_dput;
/*
* mode could have been mutilated due to umask (e.g. sgid directory)
@@ -439,12 +510,11 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_cleanup;
}
- ovl_instantiate(dentry, inode, newdentry, !!hardlink);
- newdentry = NULL;
-out_dput2:
- dput(upper);
+ err = ovl_instantiate(dentry, inode, newdentry, hardlink);
+ if (err)
+ goto out_cleanup;
out_dput:
- dput(newdentry);
+ dput(upper);
out_unlock:
unlock_rename(workdir, upperdir);
out:
@@ -456,12 +526,12 @@ out:
out_cleanup:
ovl_cleanup(wdir, newdentry);
- goto out_dput2;
+ dput(newdentry);
+ goto out_dput;
}
static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
- struct cattr *attr, struct dentry *hardlink,
- bool origin)
+ struct ovl_cattr *attr, bool origin)
{
int err;
const struct cred *old_cred;
@@ -489,7 +559,7 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
if (override_cred) {
override_cred->fsuid = inode->i_uid;
override_cred->fsgid = inode->i_gid;
- if (!hardlink) {
+ if (!attr->hardlink) {
err = security_dentry_create_files_as(dentry,
attr->mode, &dentry->d_name, old_cred,
override_cred);
@@ -502,21 +572,12 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
put_cred(override_cred);
if (!ovl_dentry_is_whiteout(dentry))
- err = ovl_create_upper(dentry, inode, attr,
- hardlink);
+ err = ovl_create_upper(dentry, inode, attr);
else
- err = ovl_create_over_whiteout(dentry, inode, attr,
- hardlink);
+ err = ovl_create_over_whiteout(dentry, inode, attr);
}
out_revert_creds:
revert_creds(old_cred);
- if (!err) {
- struct inode *realinode = d_inode(ovl_dentry_upper(dentry));
-
- WARN_ON(inode->i_mode != realinode->i_mode);
- WARN_ON(!uid_eq(inode->i_uid, realinode->i_uid));
- WARN_ON(!gid_eq(inode->i_gid, realinode->i_gid));
- }
return err;
}
@@ -525,7 +586,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
{
int err;
struct inode *inode;
- struct cattr attr = {
+ struct ovl_cattr attr = {
.rdev = rdev,
.link = link,
};
@@ -534,6 +595,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
if (err)
goto out;
+ /* Preallocate inode to be used by ovl_get_inode() */
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, mode, rdev);
if (!inode)
@@ -542,8 +604,9 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
inode_init_owner(inode, dentry->d_parent->d_inode, mode);
attr.mode = inode->i_mode;
- err = ovl_create_or_link(dentry, inode, &attr, NULL, false);
- if (err)
+ err = ovl_create_or_link(dentry, inode, &attr, false);
+ /* Did we end up using the preallocated inode? */
+ if (inode != d_inode(dentry))
iput(inode);
out_drop_write:
@@ -601,8 +664,9 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
inode = d_inode(old);
ihold(inode);
- err = ovl_create_or_link(new, inode, NULL, ovl_dentry_upper(old),
- ovl_type_origin(old));
+ err = ovl_create_or_link(new, inode,
+ &(struct ovl_cattr) {.hardlink = ovl_dentry_upper(old)},
+ ovl_type_origin(old));
if (err)
iput(inode);
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 425a94672300..9941ece61a14 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -300,12 +300,18 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb,
struct dentry *dentry;
struct inode *inode;
struct ovl_entry *oe;
+ struct ovl_inode_params oip = {
+ .lowerpath = lowerpath,
+ .index = index,
+ .numlower = !!lower
+ };
/* We get overlay directory dentries with ovl_lookup_real() */
if (d_is_dir(upper ?: lower))
return ERR_PTR(-EIO);
- inode = ovl_get_inode(sb, dget(upper), lowerpath, index, !!lower);
+ oip.upperdentry = dget(upper);
+ inode = ovl_get_inode(sb, &oip);
if (IS_ERR(inode)) {
dput(upper);
return ERR_CAST(inode);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 6e3815fb006b..1db5b3b458a1 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -749,15 +749,26 @@ static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
return true;
}
-struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
- struct ovl_path *lowerpath, struct dentry *index,
- unsigned int numlower)
+static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode,
+ struct inode *key)
{
+ return newinode ? inode_insert5(newinode, (unsigned long) key,
+ ovl_inode_test, ovl_inode_set, key) :
+ iget5_locked(sb, (unsigned long) key,
+ ovl_inode_test, ovl_inode_set, key);
+}
+
+struct inode *ovl_get_inode(struct super_block *sb,
+ struct ovl_inode_params *oip)
+{
+ struct dentry *upperdentry = oip->upperdentry;
+ struct ovl_path *lowerpath = oip->lowerpath;
struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
struct inode *inode;
struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
- bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index);
- int fsid = bylower ? lowerpath->layer->fsid : 0;
+ bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
+ oip->index);
+ int fsid = bylower ? oip->lowerpath->layer->fsid : 0;
bool is_dir;
unsigned long ino = 0;
@@ -774,8 +785,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
upperdentry);
unsigned int nlink = is_dir ? 1 : realinode->i_nlink;
- inode = iget5_locked(sb, (unsigned long) key,
- ovl_inode_test, ovl_inode_set, key);
+ inode = ovl_iget5(sb, oip->newinode, key);
if (!inode)
goto out_nomem;
if (!(inode->i_state & I_NEW)) {
@@ -811,12 +821,12 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
if (upperdentry && ovl_is_impuredir(upperdentry))
ovl_set_flag(OVL_IMPURE, inode);
- if (index)
+ if (oip->index)
ovl_set_flag(OVL_INDEX, inode);
/* Check for non-merge dir that may have whiteouts */
if (is_dir) {
- if (((upperdentry && lowerdentry) || numlower > 1) ||
+ if (((upperdentry && lowerdentry) || oip->numlower > 1) ||
ovl_check_origin_xattr(upperdentry ?: lowerdentry)) {
ovl_set_flag(OVL_WHITEOUTS, inode);
}
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 2dba29eadde6..08801b45df00 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -1004,8 +1004,14 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
upperdentry = dget(index);
if (upperdentry || ctr) {
- inode = ovl_get_inode(dentry->d_sb, upperdentry, stack, index,
- ctr);
+ struct ovl_inode_params oip = {
+ .upperdentry = upperdentry,
+ .lowerpath = stack,
+ .index = index,
+ .numlower = ctr,
+ };
+
+ inode = ovl_get_inode(dentry->d_sb, &oip);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_free_oe;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index e0b7de799f6b..3c5e9f18b0d9 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -86,6 +86,7 @@ struct ovl_fh {
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
int err = vfs_rmdir(dir, dentry);
+
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
}
@@ -93,56 +94,52 @@ static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
int err = vfs_unlink(dir, dentry, NULL);
+
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
- struct dentry *new_dentry, bool debug)
+ struct dentry *new_dentry)
{
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
- if (debug) {
- pr_debug("link(%pd2, %pd2) = %i\n",
- old_dentry, new_dentry, err);
- }
+
+ pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err);
return err;
}
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
- umode_t mode, bool debug)
+ umode_t mode)
{
int err = vfs_create(dir, dentry, mode, true);
- if (debug)
- pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
+
+ pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
- umode_t mode, bool debug)
+ umode_t mode)
{
int err = vfs_mkdir(dir, dentry, mode);
- if (debug)
- pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
+ pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
- umode_t mode, dev_t dev, bool debug)
+ umode_t mode, dev_t dev)
{
int err = vfs_mknod(dir, dentry, mode, dev);
- if (debug) {
- pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
- dentry, mode, dev, err);
- }
+
+ pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err);
return err;
}
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
- const char *oldname, bool debug)
+ const char *oldname)
{
int err = vfs_symlink(dir, dentry, oldname);
- if (debug)
- pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
+
+ pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
}
@@ -168,11 +165,8 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
{
int err;
- pr_debug("rename(%pd2, %pd2, 0x%x)\n",
- olddentry, newdentry, flags);
-
+ pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
-
if (err) {
pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
olddentry, newdentry, err);
@@ -334,12 +328,18 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
bool ovl_is_private_xattr(const char *name);
+struct ovl_inode_params {
+ struct inode *newinode;
+ struct dentry *upperdentry;
+ struct ovl_path *lowerpath;
+ struct dentry *index;
+ unsigned int numlower;
+};
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
bool is_upper);
-struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
- struct ovl_path *lowerpath, struct dentry *index,
- unsigned int numlower);
+struct inode *ovl_get_inode(struct super_block *sb,
+ struct ovl_inode_params *oip);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
@@ -352,18 +352,21 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to)
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
-struct dentry *ovl_lookup_temp(struct dentry *workdir);
int ovl_cleanup_and_whiteout(struct dentry *workdir, struct inode *dir,
struct dentry *dentry);
-struct cattr {
+struct ovl_cattr {
dev_t rdev;
umode_t mode;
const char *link;
+ struct dentry *hardlink;
};
-int ovl_create_real(struct inode *dir, struct dentry *newdentry,
- struct cattr *attr,
- struct dentry *hardlink, bool debug);
+
+#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
+
+struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
+ struct ovl_cattr *attr);
int ovl_cleanup(struct inode *dir, struct dentry *dentry);
+struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index e8551c97de51..704b37311467 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -611,11 +611,10 @@ retry:
goto retry;
}
- err = ovl_create_real(dir, work,
- &(struct cattr){.mode = S_IFDIR | 0},
- NULL, true);
- if (err)
- goto out_dput;
+ work = ovl_create_real(dir, work, OVL_CATTR(attr.ia_mode));
+ err = PTR_ERR(work);
+ if (IS_ERR(work))
+ goto out_err;
/*
* Try to remove POSIX ACL xattrs from workdir. We are good if:
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 1ade1206bb89..0eaeb41453f5 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -43,6 +43,21 @@ config PROC_VMCORE
help
Exports the dump image of crashed kernel in ELF format.
+config PROC_VMCORE_DEVICE_DUMP
+ bool "Device Hardware/Firmware Log Collection"
+ depends on PROC_VMCORE
+ default n
+ help
+ After kernel panic, device drivers can collect the device
+ specific snapshot of their hardware or firmware before the
+ underlying devices are initialized in crash recovery kernel.
+ Note that the device driver must be present in the crash
+ recovery kernel's initramfs to collect its underlying device
+ snapshot.
+
+ If you say Y here, the collected device dumps will be added
+ as ELF notes to /proc/vmcore.
+
config PROC_SYSCTL
bool "Sysctl support (/proc/sys)" if EXPERT
depends on PROC_FS
diff --git a/fs/proc/array.c b/fs/proc/array.c
index e6d7f41b6684..0ceb3b6b37e7 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -96,22 +96,29 @@
#include <asm/processor.h>
#include "internal.h"
-static inline void task_name(struct seq_file *m, struct task_struct *p)
+void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape)
{
char *buf;
size_t size;
- char tcomm[sizeof(p->comm)];
+ char tcomm[64];
int ret;
- get_task_comm(tcomm, p);
-
- seq_puts(m, "Name:\t");
+ if (p->flags & PF_WQ_WORKER)
+ wq_worker_comm(tcomm, sizeof(tcomm), p);
+ else
+ __get_task_comm(tcomm, sizeof(tcomm), p);
size = seq_get_buf(m, &buf);
- ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
- seq_commit(m, ret < size ? ret : -1);
+ if (escape) {
+ ret = string_escape_str(tcomm, buf, size,
+ ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ if (ret >= size)
+ ret = -1;
+ } else {
+ ret = strscpy(buf, tcomm, size);
+ }
- seq_putc(m, '\n');
+ seq_commit(m, ret);
}
/*
@@ -261,7 +268,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
unsigned long flags;
sigset_t pending, shpending, blocked, ignored, caught;
int num_threads = 0;
- unsigned long qsize = 0;
+ unsigned int qsize = 0;
unsigned long qlim = 0;
sigemptyset(&pending);
@@ -390,7 +397,10 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
{
struct mm_struct *mm = get_task_mm(task);
- task_name(m, task);
+ seq_puts(m, "Name:\t");
+ proc_task_name(m, task, true);
+ seq_putc(m, '\n');
+
task_state(m, ns, pid, task);
if (mm) {
@@ -425,7 +435,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
u64 cutime, cstime, utime, stime;
u64 cgtime, gtime;
unsigned long rsslim = 0;
- char tcomm[sizeof(task->comm)];
unsigned long flags;
state = *get_task_state(task);
@@ -452,8 +461,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
}
}
- get_task_comm(tcomm, task);
-
sigemptyset(&sigign);
sigemptyset(&sigcatch);
cutime = cstime = utime = stime = 0;
@@ -520,7 +527,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
seq_put_decimal_ull(m, "", pid_nr_ns(pid, ns));
seq_puts(m, " (");
- seq_puts(m, tcomm);
+ proc_task_name(m, task, false);
seq_puts(m, ") ");
seq_putc(m, state);
seq_put_decimal_ll(m, " ", ppid);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 33ed1746927a..4aa9ce5df02f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -205,171 +205,129 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
return result;
}
-static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
- size_t _count, loff_t *pos)
+static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
+ size_t count, loff_t *ppos)
{
- struct task_struct *tsk;
- struct mm_struct *mm;
- char *page;
- unsigned long count = _count;
unsigned long arg_start, arg_end, env_start, env_end;
- unsigned long len1, len2, len;
- unsigned long p;
- char c;
- ssize_t rv;
-
- BUG_ON(*pos < 0);
+ unsigned long pos, len;
+ char *page;
- tsk = get_proc_task(file_inode(file));
- if (!tsk)
- return -ESRCH;
- mm = get_task_mm(tsk);
- put_task_struct(tsk);
- if (!mm)
- return 0;
/* Check if process spawned far enough to have cmdline. */
- if (!mm->env_end) {
- rv = 0;
- goto out_mmput;
- }
-
- page = (char *)__get_free_page(GFP_KERNEL);
- if (!page) {
- rv = -ENOMEM;
- goto out_mmput;
- }
+ if (!mm->env_end)
+ return 0;
- down_read(&mm->mmap_sem);
+ spin_lock(&mm->arg_lock);
arg_start = mm->arg_start;
arg_end = mm->arg_end;
env_start = mm->env_start;
env_end = mm->env_end;
- up_read(&mm->mmap_sem);
-
- BUG_ON(arg_start > arg_end);
- BUG_ON(env_start > env_end);
+ spin_unlock(&mm->arg_lock);
- len1 = arg_end - arg_start;
- len2 = env_end - env_start;
+ if (arg_start >= arg_end)
+ return 0;
- /* Empty ARGV. */
- if (len1 == 0) {
- rv = 0;
- goto out_free_page;
- }
/*
- * Inherently racy -- command line shares address space
- * with code and data.
+ * We have traditionally allowed the user to re-write
+ * the argument strings and overflow the end result
+ * into the environment section. But only do that if
+ * the environment area is contiguous to the arguments.
*/
- rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON);
- if (rv <= 0)
- goto out_free_page;
-
- rv = 0;
-
- if (c == '\0') {
- /* Command line (set of strings) occupies whole ARGV. */
- if (len1 <= *pos)
- goto out_free_page;
-
- p = arg_start + *pos;
- len = len1 - *pos;
- while (count > 0 && len > 0) {
- unsigned int _count;
- int nr_read;
-
- _count = min3(count, len, PAGE_SIZE);
- nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON);
- if (nr_read < 0)
- rv = nr_read;
- if (nr_read <= 0)
- goto out_free_page;
-
- if (copy_to_user(buf, page, nr_read)) {
- rv = -EFAULT;
- goto out_free_page;
- }
+ if (env_start != arg_end || env_start >= env_end)
+ env_start = env_end = arg_end;
- p += nr_read;
- len -= nr_read;
- buf += nr_read;
- count -= nr_read;
- rv += nr_read;
- }
- } else {
- /*
- * Command line (1 string) occupies ARGV and
- * extends into ENVP.
- */
- struct {
- unsigned long p;
- unsigned long len;
- } cmdline[2] = {
- { .p = arg_start, .len = len1 },
- { .p = env_start, .len = len2 },
- };
- loff_t pos1 = *pos;
- unsigned int i;
+ /* We're not going to care if "*ppos" has high bits set */
+ pos = arg_start + *ppos;
+
+ /* .. but we do check the result is in the proper range */
+ if (pos < arg_start || pos >= env_end)
+ return 0;
+
+ /* .. and we never go past env_end */
+ if (env_end - pos < count)
+ count = env_end - pos;
+
+ page = (char *)__get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ len = 0;
+ while (count) {
+ int got;
+ size_t size = min_t(size_t, PAGE_SIZE, count);
+
+ got = access_remote_vm(mm, pos, page, size, FOLL_ANON);
+ if (got <= 0)
+ break;
+
+ /* Don't walk past a NUL character once you hit arg_end */
+ if (pos + got >= arg_end) {
+ int n = 0;
- i = 0;
- while (i < 2 && pos1 >= cmdline[i].len) {
- pos1 -= cmdline[i].len;
- i++;
+ /*
+ * If we started before 'arg_end' but ended up
+ * at or after it, we start the NUL character
+ * check at arg_end-1 (where we expect the normal
+ * EOF to be).
+ *
+ * NOTE! This is smaller than 'got', because
+ * pos + got >= arg_end
+ */
+ if (pos < arg_end)
+ n = arg_end - pos - 1;
+
+ /* Cut off at first NUL after 'n' */
+ got = n + strnlen(page+n, got-n);
+ if (!got)
+ break;
}
- while (i < 2) {
- p = cmdline[i].p + pos1;
- len = cmdline[i].len - pos1;
- while (count > 0 && len > 0) {
- unsigned int _count, l;
- int nr_read;
- bool final;
-
- _count = min3(count, len, PAGE_SIZE);
- nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON);
- if (nr_read < 0)
- rv = nr_read;
- if (nr_read <= 0)
- goto out_free_page;
-
- /*
- * Command line can be shorter than whole ARGV
- * even if last "marker" byte says it is not.
- */
- final = false;
- l = strnlen(page, nr_read);
- if (l < nr_read) {
- nr_read = l;
- final = true;
- }
-
- if (copy_to_user(buf, page, nr_read)) {
- rv = -EFAULT;
- goto out_free_page;
- }
-
- p += nr_read;
- len -= nr_read;
- buf += nr_read;
- count -= nr_read;
- rv += nr_read;
-
- if (final)
- goto out_free_page;
- }
- /* Only first chunk can be read partially. */
- pos1 = 0;
- i++;
+ got -= copy_to_user(buf, page, got);
+ if (unlikely(!got)) {
+ if (!len)
+ len = -EFAULT;
+ break;
}
+ pos += got;
+ buf += got;
+ len += got;
+ count -= got;
}
-out_free_page:
free_page((unsigned long)page);
-out_mmput:
+ return len;
+}
+
+static ssize_t get_task_cmdline(struct task_struct *tsk, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct mm_struct *mm;
+ ssize_t ret;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ ret = get_mm_cmdline(mm, buf, count, pos);
mmput(mm);
- if (rv > 0)
- *pos += rv;
- return rv;
+ return ret;
+}
+
+static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct task_struct *tsk;
+ ssize_t ret;
+
+ BUG_ON(*pos < 0);
+
+ tsk = get_proc_task(file_inode(file));
+ if (!tsk)
+ return -ESRCH;
+ ret = get_task_cmdline(tsk, buf, count, pos);
+ put_task_struct(tsk);
+ if (ret > 0)
+ *pos += ret;
+ return ret;
}
static const struct file_operations proc_pid_cmdline_ops = {
@@ -430,7 +388,6 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
struct stack_trace trace;
unsigned long *entries;
int err;
- int i;
entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
if (!entries)
@@ -443,6 +400,8 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
err = lock_trace(task);
if (!err) {
+ unsigned int i;
+
save_stack_trace_tsk(task, &trace);
for (i = 0; i < trace.nr_entries; i++) {
@@ -927,10 +886,10 @@ static ssize_t environ_read(struct file *file, char __user *buf,
if (!mmget_not_zero(mm))
goto free;
- down_read(&mm->mmap_sem);
+ spin_lock(&mm->arg_lock);
env_start = mm->env_start;
env_end = mm->env_end;
- up_read(&mm->mmap_sem);
+ spin_unlock(&mm->arg_lock);
while (count > 0) {
size_t this_len, max_len;
@@ -1563,9 +1522,8 @@ static int comm_show(struct seq_file *m, void *v)
if (!p)
return -ESRCH;
- task_lock(p);
- seq_printf(m, "%s\n", p->comm);
- task_unlock(p);
+ proc_task_name(m, p, false);
+ seq_putc(m, '\n');
put_task_struct(p);
@@ -1785,9 +1743,9 @@ int pid_getattr(const struct path *path, struct kstat *stat,
generic_fillattr(inode, stat);
- rcu_read_lock();
stat->uid = GLOBAL_ROOT_UID;
stat->gid = GLOBAL_ROOT_GID;
+ rcu_read_lock();
task = pid_task(proc_pid(inode), PIDTYPE_PID);
if (task) {
if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) {
@@ -1876,7 +1834,7 @@ const struct dentry_operations pid_dentry_operations =
* by stat.
*/
bool proc_fill_cache(struct file *file, struct dir_context *ctx,
- const char *name, int len,
+ const char *name, unsigned int len,
instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
struct dentry *child, *dir = file->f_path.dentry;
@@ -1895,19 +1853,19 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
struct dentry *res;
res = instantiate(child, task, ptr);
d_lookup_done(child);
- if (IS_ERR(res))
- goto end_instantiate;
if (unlikely(res)) {
dput(child);
child = res;
+ if (IS_ERR(child))
+ goto end_instantiate;
}
}
}
inode = d_inode(child);
ino = inode->i_ino;
type = inode->i_mode >> 12;
-end_instantiate:
dput(child);
+end_instantiate:
return dir_emit(ctx, name, len, ino, type);
}
@@ -3252,7 +3210,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
iter.task;
iter.tgid += 1, iter = next_tgid(ns, iter)) {
char name[10 + 1];
- int len;
+ unsigned int len;
cond_resched();
if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE))
@@ -3579,7 +3537,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
task;
task = next_tid(task), ctx->pos++) {
char name[10 + 1];
- int len;
+ unsigned int len;
tid = task_pid_nr_ns(task, ns);
len = snprintf(name, sizeof(name), "%u", tid);
if (!proc_fill_cache(file, ctx, name, len,
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 05b9893e9a22..81882a13212d 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -248,7 +248,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
struct file *f;
struct fd_data data;
char name[10 + 1];
- int len;
+ unsigned int len;
f = fcheck_files(files, fd);
if (!f)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 43c70c9e6b62..50cb22a08c2f 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -136,6 +136,8 @@ unsigned name_to_int(const struct qstr *qstr);
*/
extern const struct file_operations proc_tid_children_operations;
+extern void proc_task_name(struct seq_file *m, struct task_struct *p,
+ bool escape);
extern int proc_tid_stat(struct seq_file *, struct pid_namespace *,
struct pid *, struct task_struct *);
extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *,
@@ -161,7 +163,7 @@ extern loff_t mem_lseek(struct file *, loff_t, int);
/* Lookups */
typedef struct dentry *instantiate_t(struct dentry *,
struct task_struct *, const void *);
-extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int,
+bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int,
instantiate_t, struct task_struct *, const void *);
/*
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 1491918a33c3..792c78a49174 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -154,6 +154,8 @@ u64 stable_page_flags(struct page *page)
if (PageBalloon(page))
u |= 1 << KPF_BALLOON;
+ if (PageTable(page))
+ u |= 1 << KPF_PGTABLE;
if (page_is_idle(page))
u |= 1 << KPF_IDLE;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a20c6e495bb2..597969db9e90 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -18,6 +18,7 @@
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/uaccess.h>
+#include <linux/pkeys.h>
#include <asm/elf.h>
#include <asm/tlb.h>
@@ -673,13 +674,16 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MERGEABLE)] = "mg",
[ilog2(VM_UFFD_MISSING)]= "um",
[ilog2(VM_UFFD_WP)] = "uw",
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#ifdef CONFIG_ARCH_HAS_PKEYS
/* These come out via ProtectionKey: */
[ilog2(VM_PKEY_BIT0)] = "",
[ilog2(VM_PKEY_BIT1)] = "",
[ilog2(VM_PKEY_BIT2)] = "",
[ilog2(VM_PKEY_BIT3)] = "",
+#if VM_PKEY_BIT4
+ [ilog2(VM_PKEY_BIT4)] = "",
#endif
+#endif /* CONFIG_ARCH_HAS_PKEYS */
};
size_t i;
@@ -727,10 +731,6 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
}
#endif /* HUGETLB_PAGE */
-void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
-{
-}
-
#define SEQ_PUT_DEC(str, val) \
seq_put_decimal_ull_width(m, str, (val) >> 10, 8)
static int show_smap(struct seq_file *m, void *v, int is_pid)
@@ -835,7 +835,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
seq_puts(m, " kB\n");
}
if (!rollup_mode) {
- arch_show_smap(m, vma);
+ if (arch_pkeys_enabled())
+ seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
show_smap_vma_flags(m, vma);
}
m_cache_vma(m, vma);
@@ -1258,8 +1259,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
if (pte_swp_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
entry = pte_to_swp_entry(pte);
- frame = swp_type(entry) |
- (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
+ if (pm->show_pfn)
+ frame = swp_type(entry) |
+ (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
flags |= PM_SWAP;
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
@@ -1310,11 +1312,14 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
else if (is_swap_pmd(pmd)) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
- unsigned long offset = swp_offset(entry);
+ unsigned long offset;
- offset += (addr & ~PMD_MASK) >> PAGE_SHIFT;
- frame = swp_type(entry) |
- (offset << MAX_SWAPFILES_SHIFT);
+ if (pm->show_pfn) {
+ offset = swp_offset(entry) +
+ ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ frame = swp_type(entry) |
+ (offset << MAX_SWAPFILES_SHIFT);
+ }
flags |= PM_SWAP;
if (pmd_swp_soft_dirty(pmd))
flags |= PM_SOFT_DIRTY;
@@ -1332,10 +1337,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
err = add_to_pagemap(addr, &pme, pm);
if (err)
break;
- if (pm->show_pfn && (flags & PM_PRESENT))
- frame++;
- else if (flags & PM_SWAP)
- frame += (1 << MAX_SWAPFILES_SHIFT);
+ if (pm->show_pfn) {
+ if (flags & PM_PRESENT)
+ frame++;
+ else if (flags & PM_SWAP)
+ frame += (1 << MAX_SWAPFILES_SHIFT);
+ }
}
spin_unlock(ptl);
return err;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a45f0af22a60..cfb6674331fd 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/crash_dump.h>
#include <linux/list.h>
+#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/uaccess.h>
@@ -38,12 +39,23 @@ static size_t elfcorebuf_sz_orig;
static char *elfnotes_buf;
static size_t elfnotes_sz;
+/* Size of all notes minus the device dump notes */
+static size_t elfnotes_orig_sz;
/* Total size of vmcore file. */
static u64 vmcore_size;
static struct proc_dir_entry *proc_vmcore;
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+/* Device Dump list and mutex to synchronize access to list */
+static LIST_HEAD(vmcoredd_list);
+static DEFINE_MUTEX(vmcoredd_mutex);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+/* Device Dump Size */
+static size_t vmcoredd_orig_sz;
+
/*
* Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
* The called function has to take care of module refcounting.
@@ -178,6 +190,77 @@ static int copy_to(void *target, void *src, size_t size, int userbuf)
return 0;
}
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+/**
+ * vmcoredd_copy_dumps - Copy a byte range out of the device dumps
+ * @dst: Destination buffer
+ * @start: Offset of the first byte to copy, counted from the beginning
+ * of the first dump in vmcoredd_list
+ * @size: Number of bytes to copy
+ * @userbuf: Passed through unchanged to copy_to()
+ *
+ * Treats the dumps on vmcoredd_list as one contiguous region laid out in
+ * list order and walks it under vmcoredd_mutex. For every dump that
+ * overlaps the requested range, the overlapping part is copied with
+ * copy_to() and @dst, @start and @size are advanced accordingly. The walk
+ * stops as soon as @size bytes have been copied or the list is exhausted.
+ *
+ * Return: 0 on success (including when the list ends before @size bytes
+ * were copied), -EFAULT if copy_to() fails.
+ */
+static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf)
+{
+ struct vmcoredd_node *dump;
+ u64 offset = 0;
+ int ret = 0;
+ size_t tsz;
+ char *buf;
+
+ mutex_lock(&vmcoredd_mutex);
+ list_for_each_entry(dump, &vmcoredd_list, list) {
+ if (start < offset + dump->size) {
+ tsz = min(offset + (u64)dump->size - start, (u64)size);
+ buf = dump->buf + start - offset;
+ if (copy_to(dst, buf, tsz, userbuf)) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ size -= tsz;
+ start += tsz;
+ dst += tsz;
+
+ /* Leave now if buffer filled already */
+ if (!size)
+ goto out_unlock;
+ }
+ offset += dump->size;
+ }
+
+out_unlock:
+ mutex_unlock(&vmcoredd_mutex);
+ return ret;
+}
+
+/**
+ * vmcoredd_mmap_dumps - Map a byte range of the device dumps into a vma
+ * @vma: Target vma, handed to remap_vmalloc_range_partial()
+ * @dst: Address inside @vma at which the mapping starts
+ * @start: Offset of the first byte to map, counted from the beginning
+ * of the first dump in vmcoredd_list
+ * @size: Number of bytes to map
+ *
+ * Same walk as vmcoredd_copy_dumps(), but instead of copying, each
+ * overlapping part of a dump buffer is mapped with
+ * remap_vmalloc_range_partial(). The list is traversed under
+ * vmcoredd_mutex and the walk stops once @size bytes have been handled
+ * or the list is exhausted.
+ *
+ * Return: 0 on success, -EFAULT if remap_vmalloc_range_partial() fails
+ * (its own error code is not propagated).
+ */
+static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst,
+ u64 start, size_t size)
+{
+ struct vmcoredd_node *dump;
+ u64 offset = 0;
+ int ret = 0;
+ size_t tsz;
+ char *buf;
+
+ mutex_lock(&vmcoredd_mutex);
+ list_for_each_entry(dump, &vmcoredd_list, list) {
+ if (start < offset + dump->size) {
+ tsz = min(offset + (u64)dump->size - start, (u64)size);
+ buf = dump->buf + start - offset;
+ if (remap_vmalloc_range_partial(vma, dst, buf, tsz)) {
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ size -= tsz;
+ start += tsz;
+ dst += tsz;
+
+ /* Leave now if buffer filled already */
+ if (!size)
+ goto out_unlock;
+ }
+ offset += dump->size;
+ }
+
+out_unlock:
+ mutex_unlock(&vmcoredd_mutex);
+ return ret;
+}
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
/* Read from the ELF header and then the crash dump. On error, negative value is
* returned otherwise number of bytes read are returned.
*/
@@ -215,10 +298,41 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
if (*fpos < elfcorebuf_sz + elfnotes_sz) {
void *kaddr;
+ /* We add device dumps before other elf notes because the
+ * other elf notes may not fill the elf notes buffer
+ * completely and we will end up with zero-filled data
+ * between the elf notes and the device dumps. Tools will
+ * then try to decode this zero-filled data as valid notes
+ * and we don't want that. Hence, adding device dumps before
+ * the other elf notes ensures that zero-filled data can be
+ * avoided.
+ */
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+ /* Read device dumps */
+ if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) {
+ tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
+ (size_t)*fpos, buflen);
+ start = *fpos - elfcorebuf_sz;
+ if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf))
+ return -EFAULT;
+
+ buflen -= tsz;
+ *fpos += tsz;
+ buffer += tsz;
+ acc += tsz;
+
+ /* leave now if filled buffer already */
+ if (!buflen)
+ return acc;
+ }
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+ /* Read remaining elf notes */
tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
- kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
+ kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz;
if (copy_to(buffer, kaddr, tsz, userbuf))
return -EFAULT;
+
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
@@ -302,10 +416,8 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
};
/**
- * alloc_elfnotes_buf - allocate buffer for ELF note segment in
- * vmalloc memory
- *
- * @notes_sz: size of buffer
+ * vmcore_alloc_buf - allocate buffer in vmalloc memory
+ * @size: size of buffer
*
* If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
* the buffer to user-space by means of remap_vmalloc_range().
@@ -313,12 +425,12 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
* If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
* disabled and there's no need to allow users to mmap the buffer.
*/
-static inline char *alloc_elfnotes_buf(size_t notes_sz)
+static inline char *vmcore_alloc_buf(size_t size)
{
#ifdef CONFIG_MMU
- return vmalloc_user(notes_sz);
+ return vmalloc_user(size);
#else
- return vzalloc(notes_sz);
+ return vzalloc(size);
#endif
}
@@ -446,11 +558,46 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
if (start < elfcorebuf_sz + elfnotes_sz) {
void *kaddr;
+ /* We add device dumps before other elf notes because the
+ * other elf notes may not fill the elf notes buffer
+ * completely and we will end up with zero-filled data
+ * between the elf notes and the device dumps. Tools will
+ * then try to decode this zero-filled data as valid notes
+ * and we don't want that. Hence, adding device dumps before
+ * the other elf notes ensures that zero-filled data can be
+ * avoided. This also ensures that the device dumps and
+ * other elf notes can be properly mmaped at page aligned
+ * address.
+ */
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+ /* Read device dumps */
+ if (start < elfcorebuf_sz + vmcoredd_orig_sz) {
+ u64 start_off;
+
+ tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
+ (size_t)start, size);
+ start_off = start - elfcorebuf_sz;
+ if (vmcoredd_mmap_dumps(vma, vma->vm_start + len,
+ start_off, tsz))
+ goto fail;
+
+ size -= tsz;
+ start += tsz;
+ len += tsz;
+
+ /* leave now if filled buffer already */
+ if (!size)
+ return 0;
+ }
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+ /* Read remaining elf notes */
tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
- kaddr = elfnotes_buf + start - elfcorebuf_sz;
+ kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz;
if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
kaddr, tsz))
goto fail;
+
size -= tsz;
start += tsz;
len += tsz;
@@ -502,8 +649,8 @@ static struct vmcore* __init get_new_element(void)
return kzalloc(sizeof(struct vmcore), GFP_KERNEL);
}
-static u64 __init get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
- struct list_head *vc_list)
+static u64 get_vmcore_size(size_t elfsz, size_t elfnotesegsz,
+ struct list_head *vc_list)
{
u64 size;
struct vmcore *m;
@@ -665,7 +812,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
return rc;
*notes_sz = roundup(phdr_sz, PAGE_SIZE);
- *notes_buf = alloc_elfnotes_buf(*notes_sz);
+ *notes_buf = vmcore_alloc_buf(*notes_sz);
if (!*notes_buf)
return -ENOMEM;
@@ -698,6 +845,11 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
/* Modify e_phnum to reflect merged headers. */
ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
+ /* Store the size of all notes. We need this to update the note
+ * header when the device dumps will be added.
+ */
+ elfnotes_orig_sz = phdr.p_memsz;
+
return 0;
}
@@ -851,7 +1003,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
return rc;
*notes_sz = roundup(phdr_sz, PAGE_SIZE);
- *notes_buf = alloc_elfnotes_buf(*notes_sz);
+ *notes_buf = vmcore_alloc_buf(*notes_sz);
if (!*notes_buf)
return -ENOMEM;
@@ -884,6 +1036,11 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
/* Modify e_phnum to reflect merged headers. */
ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
+ /* Store the size of all notes. We need this to update the note
+ * header when the device dumps will be added.
+ */
+ elfnotes_orig_sz = phdr.p_memsz;
+
return 0;
}
@@ -976,8 +1133,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
}
/* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
- struct list_head *vc_list)
+static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
+ struct list_head *vc_list)
{
loff_t vmcore_off;
struct vmcore *m;
@@ -1145,6 +1302,202 @@ static int __init parse_crash_elf_headers(void)
return 0;
}
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+/**
+ * vmcoredd_write_header - Write vmcore device dump header at the
+ * beginning of the dump's buffer.
+ * @buf: Output buffer where the note is written; it is accessed as a
+ * struct vmcoredd_header
+ * @data: Dump info; only its dump_name is read here
+ * @size: Size of the dump data that follows the header. The note's
+ * n_descsz is set to this value plus sizeof(dump_name), because
+ * dump_name is counted as part of the note descriptor.
+ *
+ * Fills beginning of the dump's buffer with vmcore device dump header:
+ * n_namesz is the full size of the name field, n_type is NT_VMCOREDD,
+ * the name field is filled from VMCOREDD_NOTE_NAME with strncpy() bounded
+ * by the field size, and sizeof(dump_name) bytes are copied from
+ * @data->dump_name.
+ */
+static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data,
+ u32 size)
+{
+ struct vmcoredd_header *vdd_hdr = (struct vmcoredd_header *)buf;
+
+ vdd_hdr->n_namesz = sizeof(vdd_hdr->name);
+ vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name);
+ vdd_hdr->n_type = NT_VMCOREDD;
+
+ strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME,
+ sizeof(vdd_hdr->name));
+ memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name));
+}
+
+/**
+ * vmcoredd_update_program_headers - Update all Elf program headers
+ * @elfptr: Pointer to elf header
+ * @elfnotesz: Size of elf notes aligned to page size
+ * @vmcoreddsz: Size of device dumps to be added to elf note header
+ *
+ * Determine type of Elf header (Elf64 or Elf32) and update the elf note size.
+ * Also update the offsets of all the program headers after the elf note header.
+ *
+ * The class is taken from e_ident[EI_CLASS]; anything other than
+ * ELFCLASS64 is handled as Elf32. For a PT_NOTE header both p_memsz and
+ * p_filesz become elfnotes_orig_sz + @vmcoreddsz. Every other program
+ * header gets a new p_offset: the running file offset starts at
+ * elfcorebuf_sz + @elfnotesz, each header keeps its previous offset
+ * within its page, and the running offset then advances by the
+ * page-rounded span the segment covers.
+ */
+static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz,
+ size_t vmcoreddsz)
+{
+ unsigned char *e_ident = (unsigned char *)elfptr;
+ u64 start, end, size;
+ loff_t vmcore_off;
+ u32 i;
+
+ vmcore_off = elfcorebuf_sz + elfnotesz;
+
+ if (e_ident[EI_CLASS] == ELFCLASS64) {
+ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfptr;
+ Elf64_Phdr *phdr = (Elf64_Phdr *)(elfptr + sizeof(Elf64_Ehdr));
+
+ /* Update all program headers */
+ for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+ if (phdr->p_type == PT_NOTE) {
+ /* Update note size */
+ phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz;
+ phdr->p_filesz = phdr->p_memsz;
+ continue;
+ }
+
+ start = rounddown(phdr->p_offset, PAGE_SIZE);
+ end = roundup(phdr->p_offset + phdr->p_memsz,
+ PAGE_SIZE);
+ size = end - start;
+ phdr->p_offset = vmcore_off + (phdr->p_offset - start);
+ vmcore_off += size;
+ }
+ } else {
+ Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfptr;
+ Elf32_Phdr *phdr = (Elf32_Phdr *)(elfptr + sizeof(Elf32_Ehdr));
+
+ /* Update all program headers */
+ for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+ if (phdr->p_type == PT_NOTE) {
+ /* Update note size */
+ phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz;
+ phdr->p_filesz = phdr->p_memsz;
+ continue;
+ }
+
+ start = rounddown(phdr->p_offset, PAGE_SIZE);
+ end = roundup(phdr->p_offset + phdr->p_memsz,
+ PAGE_SIZE);
+ size = end - start;
+ phdr->p_offset = vmcore_off + (phdr->p_offset - start);
+ vmcore_off += size;
+ }
+ }
+}
+
+/**
+ * vmcoredd_update_size - Update the total size of the device dumps and update
+ * Elf header
+ * @dump_size: Size of the current device dump to be added to total size
+ *
+ * Update the total size of all the device dumps and update the Elf program
+ * headers. Calculate the new offsets for the vmcore list and update the
+ * total vmcore size.
+ *
+ * The note segment size becomes the page-rounded size of the original
+ * notes plus the accumulated device dump size. The file-scope variables
+ * vmcoredd_orig_sz, elfnotes_sz and vmcore_size are rewritten, as is the
+ * size stored in the proc_vmcore entry. This function takes no lock
+ * itself.
+ *
+ * NOTE(review): proc_vmcore is dereferenced unconditionally - confirm
+ * that it is always set up before a device dump can be added.
+ */
+static void vmcoredd_update_size(size_t dump_size)
+{
+ vmcoredd_orig_sz += dump_size;
+ elfnotes_sz = roundup(elfnotes_orig_sz, PAGE_SIZE) + vmcoredd_orig_sz;
+ vmcoredd_update_program_headers(elfcorebuf, elfnotes_sz,
+ vmcoredd_orig_sz);
+
+ /* Update vmcore list offsets */
+ set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
+
+ vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+ &vmcore_list);
+ proc_vmcore->size = vmcore_size;
+}
+
+/**
+ * vmcore_add_device_dump - Add a buffer containing device dump to vmcore
+ * @data: dump info.
+ *
+ * Allocate a buffer and invoke the calling driver's dump collect routine.
+ * Write Elf note at the beginning of the buffer to indicate vmcore device
+ * dump and add the dump to global list.
+ *
+ * The list insertion and the resulting size/header update are done under
+ * vmcoredd_mutex, so that drivers registering dumps concurrently cannot
+ * interleave their updates of the accumulated dump size and Elf headers.
+ *
+ * Return: 0 on success, -EINVAL if @data is NULL or lacks a name, a
+ * callback or a size, -ENOMEM if an allocation fails, or the non-zero
+ * value returned by the driver's callback.
+ */
+int vmcore_add_device_dump(struct vmcoredd_data *data)
+{
+	struct vmcoredd_node *dump;
+	void *buf = NULL;
+	size_t data_size;
+	int ret;
+
+	if (!data || !strlen(data->dump_name) ||
+	    !data->vmcoredd_callback || !data->size)
+		return -EINVAL;
+
+	dump = vzalloc(sizeof(*dump));
+	if (!dump) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	/* Keep size of the buffer page aligned so that it can be mmaped */
+	data_size = roundup(sizeof(struct vmcoredd_header) + data->size,
+			    PAGE_SIZE);
+
+	/* Allocate buffer for drivers to write their dumps */
+	buf = vmcore_alloc_buf(data_size);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	vmcoredd_write_header(buf, data, data_size -
+			      sizeof(struct vmcoredd_header));
+
+	/* Invoke the driver's dump collection routine */
+	ret = data->vmcoredd_callback(data, buf +
+				      sizeof(struct vmcoredd_header));
+	if (ret)
+		goto out_err;
+
+	dump->buf = buf;
+	dump->size = data_size;
+
+	/*
+	 * Add the dump to the device dump list and account for its size
+	 * while still holding the mutex: vmcoredd_update_size() rewrites
+	 * shared size bookkeeping and the Elf program headers, which must
+	 * not be done by two callers at once.
+	 */
+	mutex_lock(&vmcoredd_mutex);
+	list_add_tail(&dump->list, &vmcoredd_list);
+	vmcoredd_update_size(data_size);
+	mutex_unlock(&vmcoredd_mutex);
+
+	return 0;
+
+out_err:
+	/* vfree(NULL) is a no-op, so no NULL checks are needed */
+	vfree(buf);
+	vfree(dump);
+
+	return ret;
+}
+EXPORT_SYMBOL(vmcore_add_device_dump);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+/*
+ * Free all dumps in vmcore device dump list.
+ *
+ * Under vmcoredd_mutex, repeatedly unlinks the first node of
+ * vmcoredd_list and frees both its dump buffer and the node itself
+ * until the list is empty. The body is compiled only when
+ * CONFIG_PROC_VMCORE_DEVICE_DUMP is set; otherwise this is an empty
+ * function.
+ */
+static void vmcore_free_device_dumps(void)
+{
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+ mutex_lock(&vmcoredd_mutex);
+ while (!list_empty(&vmcoredd_list)) {
+ struct vmcoredd_node *dump;
+
+ dump = list_first_entry(&vmcoredd_list, struct vmcoredd_node,
+ list);
+ list_del(&dump->list);
+ vfree(dump->buf);
+ vfree(dump);
+ }
+ mutex_unlock(&vmcoredd_mutex);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+}
+
/* Init function for vmcore module. */
static int __init vmcore_init(void)
{
@@ -1192,4 +1545,7 @@ void vmcore_cleanup(void)
kfree(m);
}
free_elfcorebuf();
+
+ /* clear vmcore device dump list */
+ vmcore_free_device_dumps();
}
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 616a688f5d8f..55c508fe8131 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -24,14 +24,6 @@ static bool ubifs_crypt_empty_dir(struct inode *inode)
return ubifs_check_dir_empty(inode) == 0;
}
-static unsigned int ubifs_crypt_max_namelen(struct inode *inode)
-{
- if (S_ISLNK(inode->i_mode))
- return UBIFS_MAX_INO_DATA;
- else
- return UBIFS_MAX_NLEN;
-}
-
int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
unsigned int in_len, unsigned int *out_len, int block)
{
@@ -89,5 +81,5 @@ const struct fscrypt_operations ubifs_crypt_operations = {
.get_context = ubifs_crypt_get_context,
.set_context = ubifs_crypt_set_context,
.empty_dir = ubifs_crypt_empty_dir,
- .max_namelen = ubifs_crypt_max_namelen,
+ .max_namelen = UBIFS_MAX_NLEN,
};
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index c6e17a744c3b..aa415054ad0a 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -1,6 +1,7 @@
config UDF_FS
tristate "UDF file system support"
select CRC_ITU_T
+ select NLS
help
This is a file system used on some CD-ROMs and DVDs. Since the
file system is supported by multiple operating systems and is more
@@ -13,8 +14,3 @@ config UDF_FS
module will be called udf.
If unsure, say N.
-
-config UDF_NLS
- bool
- default y
- depends on (UDF_FS=m && NLS) || (UDF_FS=y && NLS=y)
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 7949c338efa5..0d27d41f5c6e 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -572,7 +572,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
case Opt_utf8:
uopt->flags |= (1 << UDF_FLAG_UTF8);
break;
-#ifdef CONFIG_UDF_NLS
case Opt_iocharset:
if (!remount) {
if (uopt->nls_map)
@@ -581,7 +580,6 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
uopt->flags |= (1 << UDF_FLAG_NLS_MAP);
}
break;
-#endif
case Opt_uforget:
uopt->flags |= (1 << UDF_FLAG_UID_FORGET);
break;
@@ -892,14 +890,14 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
#endif
}
- ret = udf_dstrCS0toUTF8(outstr, 31, pvoldesc->volIdent, 32);
+ ret = udf_dstrCS0toChar(sb, outstr, 31, pvoldesc->volIdent, 32);
if (ret < 0)
goto out_bh;
strncpy(UDF_SB(sb)->s_volume_ident, outstr, ret);
udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident);
- ret = udf_dstrCS0toUTF8(outstr, 127, pvoldesc->volSetIdent, 128);
+ ret = udf_dstrCS0toChar(sb, outstr, 127, pvoldesc->volSetIdent, 128);
if (ret < 0)
goto out_bh;
@@ -2117,7 +2115,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
udf_err(sb, "utf8 cannot be combined with iocharset\n");
goto parse_options_failure;
}
-#ifdef CONFIG_UDF_NLS
if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) {
uopt.nls_map = load_nls_default();
if (!uopt.nls_map)
@@ -2125,7 +2122,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
else
udf_debug("Using default NLS map\n");
}
-#endif
if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP)))
uopt.flags |= (1 << UDF_FLAG_UTF8);
@@ -2279,10 +2275,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
error_out:
iput(sbi->s_vat_inode);
parse_options_failure:
-#ifdef CONFIG_UDF_NLS
if (uopt.nls_map)
unload_nls(uopt.nls_map);
-#endif
if (lvid_open)
udf_close_lvid(sb);
brelse(sbi->s_lvid_bh);
@@ -2332,10 +2326,8 @@ static void udf_put_super(struct super_block *sb)
sbi = UDF_SB(sb);
iput(sbi->s_vat_inode);
-#ifdef CONFIG_UDF_NLS
if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
unload_nls(sbi->s_nls_map);
-#endif
if (!sb_rdonly(sb))
udf_close_lvid(sb);
brelse(sbi->s_lvid_bh);
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 68e8a64d22e0..fc8d1b3384d2 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -220,7 +220,8 @@ extern int udf_get_filename(struct super_block *, const uint8_t *, int,
uint8_t *, int);
extern int udf_put_filename(struct super_block *, const uint8_t *, int,
uint8_t *, int);
-extern int udf_dstrCS0toUTF8(uint8_t *, int, const uint8_t *, int);
+extern int udf_dstrCS0toChar(struct super_block *, uint8_t *, int,
+ const uint8_t *, int);
/* ialloc.c */
extern void udf_free_inode(struct inode *);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 16a8ad21b77e..45234791fec2 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,101 +28,64 @@
#include "udf_sb.h"
+#define PLANE_SIZE 0x10000
+#define UNICODE_MAX 0x10ffff
#define SURROGATE_MASK 0xfffff800
#define SURROGATE_PAIR 0x0000d800
+#define SURROGATE_LOW 0x00000400
+#define SURROGATE_CHAR_BITS 10
+#define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1)
-static int udf_uni2char_utf8(wchar_t uni,
- unsigned char *out,
- int boundlen)
-{
- int u_len = 0;
-
- if (boundlen <= 0)
- return -ENAMETOOLONG;
+#define ILLEGAL_CHAR_MARK '_'
+#define EXT_MARK '.'
+#define CRC_MARK '#'
+#define EXT_SIZE 5
+/* Number of chars we need to store generated CRC to make filename unique */
+#define CRC_LEN 5
- if ((uni & SURROGATE_MASK) == SURROGATE_PAIR)
- return -EINVAL;
+static unicode_t get_utf16_char(const uint8_t *str_i, int str_i_max_len,
+ int str_i_idx, int u_ch, unicode_t *ret)
+{
+ unicode_t c;
+ int start_idx = str_i_idx;
+
+ /* Expand OSTA compressed Unicode to Unicode */
+ c = str_i[str_i_idx++];
+ if (u_ch > 1)
+ c = (c << 8) | str_i[str_i_idx++];
+ if ((c & SURROGATE_MASK) == SURROGATE_PAIR) {
+ unicode_t next;
+
+ /* Trailing surrogate char */
+ if (str_i_idx >= str_i_max_len) {
+ c = UNICODE_MAX + 1;
+ goto out;
+ }
- if (uni < 0x80) {
- out[u_len++] = (unsigned char)uni;
- } else if (uni < 0x800) {
- if (boundlen < 2)
- return -ENAMETOOLONG;
- out[u_len++] = (unsigned char)(0xc0 | (uni >> 6));
- out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f));
- } else {
- if (boundlen < 3)
- return -ENAMETOOLONG;
- out[u_len++] = (unsigned char)(0xe0 | (uni >> 12));
- out[u_len++] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f));
- out[u_len++] = (unsigned char)(0x80 | (uni & 0x3f));
- }
- return u_len;
-}
+ /* Low surrogate must follow the high one... */
+ if (c & SURROGATE_LOW) {
+ c = UNICODE_MAX + 1;
+ goto out;
+ }
-static int udf_char2uni_utf8(const unsigned char *in,
- int boundlen,
- wchar_t *uni)
-{
- unsigned int utf_char;
- unsigned char c;
- int utf_cnt, u_len;
-
- utf_char = 0;
- utf_cnt = 0;
- for (u_len = 0; u_len < boundlen;) {
- c = in[u_len++];
-
- /* Complete a multi-byte UTF-8 character */
- if (utf_cnt) {
- utf_char = (utf_char << 6) | (c & 0x3f);
- if (--utf_cnt)
- continue;
- } else {
- /* Check for a multi-byte UTF-8 character */
- if (c & 0x80) {
- /* Start a multi-byte UTF-8 character */
- if ((c & 0xe0) == 0xc0) {
- utf_char = c & 0x1f;
- utf_cnt = 1;
- } else if ((c & 0xf0) == 0xe0) {
- utf_char = c & 0x0f;
- utf_cnt = 2;
- } else if ((c & 0xf8) == 0xf0) {
- utf_char = c & 0x07;
- utf_cnt = 3;
- } else if ((c & 0xfc) == 0xf8) {
- utf_char = c & 0x03;
- utf_cnt = 4;
- } else if ((c & 0xfe) == 0xfc) {
- utf_char = c & 0x01;
- utf_cnt = 5;
- } else {
- utf_cnt = -1;
- break;
- }
- continue;
- } else {
- /* Single byte UTF-8 character (most common) */
- utf_char = c;
- }
+ WARN_ON_ONCE(u_ch != 2);
+ next = str_i[str_i_idx++] << 8;
+ next |= str_i[str_i_idx++];
+ if ((next & SURROGATE_MASK) != SURROGATE_PAIR ||
+ !(next & SURROGATE_LOW)) {
+ c = UNICODE_MAX + 1;
+ goto out;
}
- *uni = utf_char;
- break;
- }
- if (utf_cnt) {
- *uni = '?';
- return -EINVAL;
+
+ c = PLANE_SIZE +
+ ((c & SURROGATE_CHAR_MASK) << SURROGATE_CHAR_BITS) +
+ (next & SURROGATE_CHAR_MASK);
}
- return u_len;
+out:
+ *ret = c;
+ return str_i_idx - start_idx;
}
-#define ILLEGAL_CHAR_MARK '_'
-#define EXT_MARK '.'
-#define CRC_MARK '#'
-#define EXT_SIZE 5
-/* Number of chars we need to store generated CRC to make filename unique */
-#define CRC_LEN 5
static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
int *str_o_idx,
@@ -132,27 +95,29 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
int (*conv_f)(wchar_t, unsigned char *, int),
int translate)
{
- uint32_t c;
+ unicode_t c;
int illChar = 0;
int len, gotch = 0;
- for (; (!gotch) && (*str_i_idx < str_i_max_len); *str_i_idx += u_ch) {
+ while (!gotch && *str_i_idx < str_i_max_len) {
if (*str_o_idx >= str_o_max_len) {
*needsCRC = 1;
return gotch;
}
- /* Expand OSTA compressed Unicode to Unicode */
- c = str_i[*str_i_idx];
- if (u_ch > 1)
- c = (c << 8) | str_i[*str_i_idx + 1];
-
- if (translate && (c == '/' || c == 0))
+ len = get_utf16_char(str_i, str_i_max_len, *str_i_idx, u_ch,
+ &c);
+ /* These chars cannot be converted. Replace them. */
+ if (c == 0 || c > UNICODE_MAX || (conv_f && c > MAX_WCHAR_T) ||
+ (translate && c == '/')) {
illChar = 1;
- else if (illChar)
+ if (!translate)
+ gotch = 1;
+ } else if (illChar)
break;
else
gotch = 1;
+ *str_i_idx += len;
}
if (illChar) {
*needsCRC = 1;
@@ -160,7 +125,15 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
gotch = 1;
}
if (gotch) {
- len = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx);
+ if (conv_f) {
+ len = conv_f(c, &str_o[*str_o_idx],
+ str_o_max_len - *str_o_idx);
+ } else {
+ len = utf32_to_utf8(c, &str_o[*str_o_idx],
+ str_o_max_len - *str_o_idx);
+ if (len < 0)
+ len = -ENAMETOOLONG;
+ }
/* Valid character? */
if (len >= 0)
*str_o_idx += len;
@@ -168,16 +141,16 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
*needsCRC = 1;
gotch = 0;
} else {
- str_o[(*str_o_idx)++] = '?';
+ str_o[(*str_o_idx)++] = ILLEGAL_CHAR_MARK;
*needsCRC = 1;
}
}
return gotch;
}
-static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
+static int udf_name_from_CS0(struct super_block *sb,
+ uint8_t *str_o, int str_max_len,
const uint8_t *ocu, int ocu_len,
- int (*conv_f)(wchar_t, unsigned char *, int),
int translate)
{
uint32_t c;
@@ -194,6 +167,7 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
unsigned short valueCRC;
uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
uint8_t crc[CRC_LEN];
+ int (*conv_f)(wchar_t, unsigned char *, int);
if (str_max_len <= 0)
return 0;
@@ -203,6 +177,11 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
return 0;
}
+ if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
+ conv_f = UDF_SB(sb)->s_nls_map->uni2char;
+ else
+ conv_f = NULL;
+
cmp_id = ocu[0];
if (cmp_id != 8 && cmp_id != 16) {
memset(str_o, 0, str_max_len);
@@ -293,18 +272,24 @@ static int udf_name_from_CS0(uint8_t *str_o, int str_max_len,
return str_o_len;
}
-static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
- const uint8_t *str_i, int str_len,
- int (*conv_f)(const unsigned char *, int, wchar_t *))
+static int udf_name_to_CS0(struct super_block *sb,
+ uint8_t *ocu, int ocu_max_len,
+ const uint8_t *str_i, int str_len)
{
int i, len;
unsigned int max_val;
- wchar_t uni_char;
int u_len, u_ch;
+ unicode_t uni_char;
+ int (*conv_f)(const unsigned char *, int, wchar_t *);
if (ocu_max_len <= 0)
return 0;
+ if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
+ conv_f = UDF_SB(sb)->s_nls_map->char2uni;
+ else
+ conv_f = NULL;
+
memset(ocu, 0, ocu_max_len);
ocu[0] = 8;
max_val = 0xff;
@@ -312,36 +297,61 @@ static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len,
try_again:
u_len = 1;
- for (i = 0; i < str_len; i++) {
+ for (i = 0; i < str_len; i += len) {
/* Name didn't fit? */
if (u_len + u_ch > ocu_max_len)
return 0;
- len = conv_f(&str_i[i], str_len - i, &uni_char);
- if (!len)
- continue;
+ if (conv_f) {
+ wchar_t wchar;
+
+ len = conv_f(&str_i[i], str_len - i, &wchar);
+ if (len > 0)
+ uni_char = wchar;
+ } else {
+ len = utf8_to_utf32(&str_i[i], str_len - i,
+ &uni_char);
+ }
/* Invalid character, deal with it */
- if (len < 0) {
+ if (len <= 0 || uni_char > UNICODE_MAX) {
len = 1;
uni_char = '?';
}
if (uni_char > max_val) {
- max_val = 0xffff;
- ocu[0] = 0x10;
- u_ch = 2;
- goto try_again;
+ unicode_t c;
+
+ if (max_val == 0xff) {
+ max_val = 0xffff;
+ ocu[0] = 0x10;
+ u_ch = 2;
+ goto try_again;
+ }
+ /*
+ * Use a UTF-16 surrogate pair for characters
+ * that we cannot encode directly.
+ */
+ if (u_len + 2 * u_ch > ocu_max_len)
+ return 0;
+
+ uni_char -= PLANE_SIZE;
+ c = SURROGATE_PAIR |
+ ((uni_char >> SURROGATE_CHAR_BITS) &
+ SURROGATE_CHAR_MASK);
+ ocu[u_len++] = (uint8_t)(c >> 8);
+ ocu[u_len++] = (uint8_t)(c & 0xff);
+ uni_char = SURROGATE_PAIR | SURROGATE_LOW |
+ (uni_char & SURROGATE_CHAR_MASK);
}
if (max_val == 0xffff)
ocu[u_len++] = (uint8_t)(uni_char >> 8);
ocu[u_len++] = (uint8_t)(uni_char & 0xff);
- i += len - 1;
}
return u_len;
}
-int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len,
+int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len,
const uint8_t *ocu_i, int i_len)
{
int s_len = 0;
@@ -355,14 +365,12 @@ int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len,
}
}
- return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len,
- udf_uni2char_utf8, 0);
+ return udf_name_from_CS0(sb, utf_o, o_len, ocu_i, s_len, 0);
}
int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
uint8_t *dname, int dlen)
{
- int (*conv_f)(wchar_t, unsigned char *, int);
int ret;
if (!slen)
@@ -371,14 +379,7 @@ int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
if (dlen <= 0)
return 0;
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
- conv_f = udf_uni2char_utf8;
- } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
- conv_f = UDF_SB(sb)->s_nls_map->uni2char;
- } else
- BUG();
-
- ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1);
+ ret = udf_name_from_CS0(sb, dname, dlen, sname, slen, 1);
/* Zero length filename isn't valid... */
if (ret == 0)
ret = -EINVAL;
@@ -388,15 +389,6 @@ int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
uint8_t *dname, int dlen)
{
- int (*conv_f)(const unsigned char *, int, wchar_t *);
-
- if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
- conv_f = udf_char2uni_utf8;
- } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
- conv_f = UDF_SB(sb)->s_nls_map->char2uni;
- } else
- BUG();
-
- return udf_name_to_CS0(dname, dlen, sname, slen, conv_f);
+ return udf_name_to_CS0(sb, dname, dlen, sname, slen);
}
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index cec550c8468f..123bf7d516fc 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -62,6 +62,8 @@ struct userfaultfd_ctx {
enum userfaultfd_state state;
/* released */
bool released;
+ /* memory mappings are changing because of non-cooperative event */
+ bool mmap_changing;
/* mm with one ore more vmas attached to this userfaultfd_ctx */
struct mm_struct *mm;
};
@@ -641,6 +643,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
* already released.
*/
out:
+ WRITE_ONCE(ctx->mmap_changing, false);
userfaultfd_ctx_put(ctx);
}
@@ -686,10 +689,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
ctx->state = UFFD_STATE_RUNNING;
ctx->features = octx->features;
ctx->released = false;
+ ctx->mmap_changing = false;
ctx->mm = vma->vm_mm;
mmgrab(ctx->mm);
userfaultfd_ctx_get(octx);
+ WRITE_ONCE(octx->mmap_changing, true);
fctx->orig = octx;
fctx->new = ctx;
list_add_tail(&fctx->list, fcs);
@@ -732,6 +737,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx);
+ WRITE_ONCE(ctx->mmap_changing, true);
}
}
@@ -772,6 +778,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
return true;
userfaultfd_ctx_get(ctx);
+ WRITE_ONCE(ctx->mmap_changing, true);
up_read(&mm->mmap_sem);
msg_init(&ewq.msg);
@@ -815,6 +822,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma,
return -ENOMEM;
userfaultfd_ctx_get(ctx);
+ WRITE_ONCE(ctx->mmap_changing, true);
unmap_ctx->ctx = ctx;
unmap_ctx->start = start;
unmap_ctx->end = end;
@@ -1653,6 +1661,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
user_uffdio_copy = (struct uffdio_copy __user *) arg;
+ ret = -EAGAIN;
+ if (READ_ONCE(ctx->mmap_changing))
+ goto out;
+
ret = -EFAULT;
if (copy_from_user(&uffdio_copy, user_uffdio_copy,
/* don't copy "copy" last field */
@@ -1674,7 +1686,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
goto out;
if (mmget_not_zero(ctx->mm)) {
ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len);
+ uffdio_copy.len, &ctx->mmap_changing);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1705,6 +1717,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
+ ret = -EAGAIN;
+ if (READ_ONCE(ctx->mmap_changing))
+ goto out;
+
ret = -EFAULT;
if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
/* don't copy "zeropage" last field */
@@ -1721,7 +1737,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
if (mmget_not_zero(ctx->mm)) {
ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len);
+ uffdio_zeropage.range.len,
+ &ctx->mmap_changing);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1900,6 +1917,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
ctx->features = 0;
ctx->state = UFFD_STATE_WAIT_API;
ctx->released = false;
+ ctx->mmap_changing = false;
ctx->mm = current->mm;
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 46bcf0e649f5..457ac9f97377 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -85,6 +85,24 @@ config XFS_ONLINE_SCRUB
If unsure, say N.
+config XFS_ONLINE_REPAIR
+ bool "XFS online metadata repair support"
+ default n
+ depends on XFS_FS && XFS_ONLINE_SCRUB
+ help
+ If you say Y here you will be able to repair metadata on a
+ mounted XFS filesystem. This feature is intended to reduce
+ filesystem downtime by fixing minor problems before they cause the
+ filesystem to go down. However, it requires that the filesystem be
+ formatted with secondary metadata, such as reverse mappings and inode
+ parent pointers.
+
+ This feature is considered EXPERIMENTAL. Use with caution!
+
+ See the xfs_scrub man page in section 8 for additional information.
+
+ If unsure, say N.
+
config XFS_WARN
bool "XFS Verbose Warnings"
depends on XFS_FS && !XFS_DEBUG
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7ceb41a9786a..e8d67a443bd7 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -28,6 +28,7 @@ xfs-y += xfs_trace.o
# build the libxfs code first
xfs-y += $(addprefix libxfs/, \
+ xfs_ag.o \
xfs_alloc.o \
xfs_alloc_btree.o \
xfs_attr.o \
@@ -163,4 +164,12 @@ xfs-y += $(addprefix scrub/, \
xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o
xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
+
+# online repair
+ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
+xfs-y += $(addprefix scrub/, \
+ agheader_repair.o \
+ repair.o \
+ )
+endif
endif
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
new file mode 100644
index 000000000000..9345802c99f7
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -0,0 +1,464 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2018 Red Hat, Inc.
+ * All rights reserved.
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+
+static struct xfs_buf * /* zeroed buffer, or NULL if none could be had */
+xfs_get_aghdr_buf(
+ struct xfs_mount *mp, /* buffer is taken from mp->m_ddev_targp */
+ xfs_daddr_t blkno, /* disk address to stamp on the buffer */
+ size_t numblks, /* length handed to xfs_buf_get_uncached() */
+ int flags, /* flags handed to xfs_buf_get_uncached() */
+ const struct xfs_buf_ops *ops) /* ops to attach to the buffer */
+{
+ struct xfs_buf *bp;
+
+ bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+ if (!bp)
+ return NULL;
+
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); /* zero from offset 0 */
+ bp->b_bn = blkno; /* the uncached get takes no address; set it here */
+ bp->b_maps[0].bm_bn = blkno;
+ bp->b_ops = ops;
+
+ return bp;
+}
+
+/*
+ * Generic btree root block init function: an empty root of type id->type
+ */
+static void
+xfs_btroot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp, /* buffer that receives the root block */
+ struct aghdr_init_data *id) /* supplies the btree type and agno */
+{
+ xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno, 0);
+}
+
+/*
+ * Alloc btree root block init functions: a single record for the free space
+ */
+static void
+xfs_bnoroot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp, /* buffer that receives the BNO root block */
+ struct aghdr_init_data *id) /* supplies agno and agsize */
+{
+ struct xfs_alloc_rec *arec;
+
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno, 0);
+ arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); /* sole record */
+ arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+ arec->ar_blockcount = cpu_to_be32(id->agsize -
+ be32_to_cpu(arec->ar_startblock)); /* runs to the end of the AG */
+}
+
+static void
+xfs_cntroot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp, /* buffer that receives the CNT root block */
+ struct aghdr_init_data *id) /* supplies agno and agsize */
+{
+ struct xfs_alloc_rec *arec;
+
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno, 0);
+ arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); /* sole record */
+ arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
+ arec->ar_blockcount = cpu_to_be32(id->agsize -
+ be32_to_cpu(arec->ar_startblock)); /* runs to the end of the AG */
+}
+
+/*
+ * Reverse map root block init: records for the AG's initial metadata blocks
+ */
+static void
+xfs_rmaproot_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_rmap_rec *rrec;
+
+ xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno, 0);
+
+ /*
+ * Mark the AG header regions as static metadata. The BNO
+ * btree block is the first block after the headers, so
+ * its location defines the size of the region the static
+ * metadata consumes.
+ *
+ * Note: unlike mkfs, we never have to account for log
+ * space when growing the data regions.
+ */
+ rrec = XFS_RMAP_REC_ADDR(block, 1);
+ rrec->rm_startblock = 0;
+ rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
+ rrec->rm_offset = 0;
+
+ /* account freespace btree root blocks */
+ rrec = XFS_RMAP_REC_ADDR(block, 2);
+ rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(2);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+ rrec->rm_offset = 0;
+
+ /* account inode btree root blocks */
+ rrec = XFS_RMAP_REC_ADDR(block, 3);
+ rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
+ XFS_IBT_BLOCK(mp)); /* everything from the INO root up to the rmap root */
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
+ rrec->rm_offset = 0;
+
+ /* account for rmap btree root */
+ rrec = XFS_RMAP_REC_ADDR(block, 4);
+ rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
+ rrec->rm_blockcount = cpu_to_be32(1);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
+ rrec->rm_offset = 0;
+
+ /* account for refc btree root */
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ rrec = XFS_RMAP_REC_ADDR(block, 5);
+ rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp));
+ rrec->rm_blockcount = cpu_to_be32(1);
+ rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
+ rrec->rm_offset = 0;
+ be16_add_cpu(&block->bb_numrecs, 1); /* the block was set up for 4 records */
+ }
+}
+
+/*
+ * Initialise new secondary superblocks with the pre-grow geometry, but mark
+ * them as "in progress" so we know they haven't yet been activated. This will
+ * get cleared when the update with the new geometry information is done after
+ * changes to the primary are committed. This isn't strictly necessary, but we
+ * get it for free with the delayed buffer write lists and it means we can tell
+ * if a grow operation didn't complete properly after the fact.
+ */
+static void
+xfs_sbblock_init(
+ struct xfs_mount *mp, /* mp->m_sb is the superblock that gets copied */
+ struct xfs_buf *bp, /* buffer that receives the on-disk superblock */
+ struct aghdr_init_data *id) /* not referenced by this initialiser */
+{
+ struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
+
+ xfs_sb_to_disk(dsb, &mp->m_sb);
+ dsb->sb_inprogress = 1; /* the "in progress" mark described above */
+}
+
+static void
+xfs_agfblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp, /* buffer that receives the AGF header */
+ struct aghdr_init_data *id) /* supplies agno and agsize */
+{
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
+ xfs_extlen_t tmpsize;
+
+ agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
+ agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
+ agf->agf_seqno = cpu_to_be32(id->agno);
+ agf->agf_length = cpu_to_be32(id->agsize);
+ agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
+ agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
+ agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
+ agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ agf->agf_roots[XFS_BTNUM_RMAPi] =
+ cpu_to_be32(XFS_RMAP_BLOCK(mp));
+ agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+ agf->agf_rmap_blocks = cpu_to_be32(1);
+ }
+
+ agf->agf_flfirst = cpu_to_be32(1);
+ agf->agf_fllast = 0;
+ agf->agf_flcount = 0; /* the free list starts out with no entries */
+ tmpsize = id->agsize - mp->m_ag_prealloc_blocks;
+ agf->agf_freeblks = cpu_to_be32(tmpsize);
+ agf->agf_longest = cpu_to_be32(tmpsize); /* same value as agf_freeblks */
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
+ if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+ agf->agf_refcount_root = cpu_to_be32(
+ xfs_refc_block(mp));
+ agf->agf_refcount_level = cpu_to_be32(1);
+ agf->agf_refcount_blocks = cpu_to_be32(1);
+ }
+}
+
+static void
+xfs_agflblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp, /* buffer that receives the AGFL block */
+ struct aghdr_init_data *id) /* supplies agno */
+{
+ struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+ __be32 *agfl_bno;
+ int bucket;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) { /* header is only written with CRCs */
+ agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
+ agfl->agfl_seqno = cpu_to_be32(id->agno);
+ uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
+ }
+
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
+ for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
+ agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); /* every slot starts out null */
+}
+
+static void
+xfs_agiblock_init(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp, /* buffer that receives the AGI header */
+ struct aghdr_init_data *id) /* supplies agno and agsize */
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
+ int bucket;
+
+ agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
+ agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
+ agi->agi_seqno = cpu_to_be32(id->agno);
+ agi->agi_length = cpu_to_be32(id->agsize);
+ agi->agi_count = 0; /* inode counters start at zero */
+ agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
+ agi->agi_level = cpu_to_be32(1);
+ agi->agi_freecount = 0;
+ agi->agi_newino = cpu_to_be32(NULLAGINO);
+ agi->agi_dirino = cpu_to_be32(NULLAGINO);
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
+ agi->agi_free_level = cpu_to_be32(1);
+ }
+ for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
+ agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); /* every bucket starts out null */
+}
+
+typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp,
+ struct aghdr_init_data *id); /* fills one header buffer */
+static int /* 0, or -ENOMEM when no buffer could be obtained */
+xfs_ag_init_hdr(
+ struct xfs_mount *mp,
+ struct aghdr_init_data *id, /* daddr/numblks locate the header */
+ aghdr_init_work_f work, /* writes the header contents */
+ const struct xfs_buf_ops *ops) /* buffer ops for this header type */
+
+{
+ struct xfs_buf *bp;
+
+ bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops);
+ if (!bp)
+ return -ENOMEM;
+
+ (*work)(mp, bp, id); /* the buffer was zeroed by xfs_get_aghdr_buf() */
+
+ xfs_buf_delwri_queue(bp, &id->buffer_list); /* queued here, not written */
+ xfs_buf_relse(bp);
+ return 0;
+}
+
+struct xfs_aghdr_grow_data {
+ xfs_daddr_t daddr; /* header location; XFS_BUF_DADDR_NULL ends the table */
+ size_t numblks; /* size of header */
+ const struct xfs_buf_ops *ops; /* buffer ops for the header */
+ aghdr_init_work_f work; /* fills in the header contents */
+ xfs_btnum_t type; /* btree type, copied to aghdr_init_data.type */
+ bool need_init; /* false: xfs_ag_init_headers() skips the entry */
+};
+
+/*
+ * Prepare new AG headers to be written to disk. We use uncached buffers here,
+ * as it is assumed these new AG headers are currently beyond the currently
+ * valid filesystem address space. Using cached buffers would trip over EOFS
+ * corruption detection algorithms in the buffer cache lookup routines.
+ *
+ * This is a non-transactional function, but the prepared buffers are added to a
+ * delayed write buffer list supplied by the caller so they can submit them to
+ * disk and wait on them as required.
+ */
+int
+xfs_ag_init_headers(
+ struct xfs_mount *mp,
+ struct aghdr_init_data *id)
+
+{
+ struct xfs_aghdr_grow_data aghdr_data[] = {
+ { /* SB */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_SB_DADDR),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_sb_buf_ops,
+ .work = &xfs_sbblock_init,
+ .need_init = true
+ },
+ { /* AGF */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp)),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_agf_buf_ops,
+ .work = &xfs_agfblock_init,
+ .need_init = true
+ },
+ { /* AGFL */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp)),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_agfl_buf_ops,
+ .work = &xfs_agflblock_init,
+ .need_init = true
+ },
+ { /* AGI */
+ .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp)),
+ .numblks = XFS_FSS_TO_BB(mp, 1),
+ .ops = &xfs_agi_buf_ops,
+ .work = &xfs_agiblock_init,
+ .need_init = true
+ },
+ { /* BNO root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_allocbt_buf_ops,
+ .work = &xfs_bnoroot_init,
+ .need_init = true
+ },
+ { /* CNT root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_allocbt_buf_ops,
+ .work = &xfs_cntroot_init,
+ .need_init = true
+ },
+ { /* INO root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_inobt_buf_ops,
+ .work = &xfs_btroot_init,
+ .type = XFS_BTNUM_INO,
+ .need_init = true
+ },
+ { /* FINO root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_inobt_buf_ops,
+ .work = &xfs_btroot_init,
+ .type = XFS_BTNUM_FINO,
+ .need_init = xfs_sb_version_hasfinobt(&mp->m_sb)
+ },
+ { /* RMAP root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_rmapbt_buf_ops,
+ .work = &xfs_rmaproot_init,
+ .need_init = xfs_sb_version_hasrmapbt(&mp->m_sb)
+ },
+ { /* REFC root block */
+ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)),
+ .numblks = BTOBB(mp->m_sb.sb_blocksize),
+ .ops = &xfs_refcountbt_buf_ops,
+ .work = &xfs_btroot_init,
+ .type = XFS_BTNUM_REFC,
+ .need_init = xfs_sb_version_hasreflink(&mp->m_sb)
+ },
+ { /* NULL terminating block */
+ .daddr = XFS_BUF_DADDR_NULL,
+ }
+ };
+ struct xfs_aghdr_grow_data *dp;
+ int error = 0;
+
+ /* Account for AG free space in new AG */
+ id->nfree += id->agsize - mp->m_ag_prealloc_blocks;
+ for (dp = &aghdr_data[0]; dp->daddr != XFS_BUF_DADDR_NULL; dp++) {
+ if (!dp->need_init) /* feature not present on this filesystem */
+ continue;
+
+ id->daddr = dp->daddr;
+ id->numblks = dp->numblks;
+ id->type = dp->type; /* read by xfs_btroot_init() */
+ error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops);
+ if (error)
+ break; /* nothing here unqueues the buffers queued so far */
+ }
+ return error;
+}
+
+/*
+ * Extend the AG indicated by the @id by the length passed in
+ */
+int
+xfs_ag_extend_space(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp, /* transaction the AGI/AGF changes are logged in */
+ struct aghdr_init_data *id, /* only id->agno is used here */
+ xfs_extlen_t len) /* number of blocks to add to the AG */
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_buf *bp;
+ struct xfs_agi *agi;
+ struct xfs_agf *agf;
+ int error;
+
+ /*
+ * Change the agi length.
+ */
+ error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp);
+ if (error)
+ return error;
+
+ agi = XFS_BUF_TO_AGI(bp);
+ be32_add_cpu(&agi->agi_length, len);
+ ASSERT(id->agno == mp->m_sb.sb_agcount - 1 ||
+ be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
+ xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
+
+ /*
+ * Change agf length.
+ */
+ error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp); /* bp is now the AGF */
+ if (error)
+ return error;
+
+ agf = XFS_BUF_TO_AGF(bp);
+ be32_add_cpu(&agf->agf_length, len);
+ ASSERT(agf->agf_length == agi->agi_length); /* agi still points into the AGI buffer */
+ xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
+
+ /*
+ * Free the new space.
+ *
+ * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that
+ * this doesn't actually exist in the rmap btree.
+ */
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL);
+ error = xfs_rmap_free(tp, bp, id->agno,
+ be32_to_cpu(agf->agf_length) - len, /* old end of the AG */
+ len, &oinfo);
+ if (error)
+ return error;
+
+ return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno,
+ be32_to_cpu(agf->agf_length) - len),
+ len, &oinfo, XFS_AG_RESV_NONE);
+}
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
new file mode 100644
index 000000000000..412702e23f61
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2018 Red Hat, Inc.
+ * All rights reserved.
+ */
+
+#ifndef __LIBXFS_AG_H
+#define __LIBXFS_AG_H 1
+
+struct xfs_mount;
+struct xfs_trans;
+
+struct aghdr_init_data {
+ /* per ag data */
+ xfs_agblock_t agno; /* ag to init; NOTE(review): AG number typed as xfs_agblock_t - confirm */
+ xfs_extlen_t agsize; /* new AG size */
+ struct list_head buffer_list; /* buffer writeback list */
+ xfs_rfsblock_t nfree; /* cumulative new free space */
+
+ /* per header data */
+ xfs_daddr_t daddr; /* header location */
+ size_t numblks; /* size of header */
+ xfs_btnum_t type; /* type of btree root block */
+};
+
+int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
+int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp,
+ struct aghdr_init_data *id, xfs_extlen_t len);
+
+#endif /* __LIBXFS_AG_H */
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 4bcc095fe44a..dc9dd3805d97 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -39,6 +39,9 @@
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_ag_resv.h"
+#include "xfs_bmap.h"
+
+extern kmem_zone_t *xfs_bmap_free_item_zone;
struct workqueue_struct *xfs_alloc_wq;
@@ -2060,6 +2063,30 @@ xfs_alloc_space_available(
return true;
}
+int /* error */
+xfs_free_agfl_block(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* AG the block belongs to */
+ xfs_agblock_t agbno, /* AG-relative number of the block to free */
+ struct xfs_buf *agbp, /* buffer handed on to xfs_free_ag_extent() */
+ struct xfs_owner_info *oinfo) /* extent owner */
+{
+ int error;
+ struct xfs_buf *bp;
+
+ error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
+ XFS_AG_RESV_AGFL); /* a single block, AGFL reservation type */
+ if (error)
+ return error;
+
+ bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno, 0);
+ if (!bp)
+ return -EFSCORRUPTED; /* no buffer for the block just freed */
+ xfs_trans_binval(tp, bp);
+
+ return 0;
+}
+
/*
* Check the agfl fields of the agf for inconsistency or corruption. The purpose
* is to detect an agfl header padding mismatch between current and early v5
@@ -2148,6 +2175,40 @@ xfs_agfl_reset(
}
/*
+ * Defer an AGFL block free. This is effectively equivalent to
+ * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
+ *
+ * Deferring AGFL frees helps prevent log reservation overruns due to too many
+ * allocation operations in a transaction. AGFL frees are prone to this problem
+ * because for one they are always freed one at a time. Further, an immediate
+ * AGFL block free can cause a btree join and require another block free before
+ * the real allocation can proceed. Deferring the free disconnects freeing up
+ * the AGFL slot from freeing the block.
+ */
+STATIC void
+xfs_defer_agfl_block(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops, /* deferred-ops list the free is added to */
+ xfs_agnumber_t agno, /* AG the block belongs to */
+ xfs_fsblock_t agbno, /* NOTE(review): used as an AG block number - confirm type */
+ struct xfs_owner_info *oinfo) /* extent owner; must not be NULL */
+{
+ struct xfs_extent_free_item *new; /* new element */
+
+ ASSERT(xfs_bmap_free_item_zone != NULL);
+ ASSERT(oinfo != NULL);
+
+ new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
+ new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
+ new->xefi_blockcount = 1; /* AGFL blocks are freed one at a time */
+ new->xefi_oinfo = *oinfo; /* NOTE(review): xefi_skip_discard is not set here - confirm it is never read for AGFL frees */
+
+ trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
+
+ xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
+}
+
+/*
* Decide whether to use this allocation group for this allocation.
* If so, fix up the btree freelist's size.
*/
@@ -2247,21 +2308,20 @@ xfs_alloc_fix_freelist(
else
xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG);
while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
- struct xfs_buf *bp;
-
error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
if (error)
goto out_agbp_relse;
- error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1,
- &targs.oinfo, XFS_AG_RESV_AGFL);
- if (error)
- goto out_agbp_relse;
- bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0);
- if (!bp) {
- error = -EFSCORRUPTED;
- goto out_agbp_relse;
+
+ /* defer agfl frees if dfops is provided */
+ if (tp->t_agfl_dfops) {
+ xfs_defer_agfl_block(mp, tp->t_agfl_dfops, args->agno,
+ bno, &targs.oinfo);
+ } else {
+ error = xfs_free_agfl_block(tp, args->agno, bno, agbp,
+ &targs.oinfo);
+ if (error)
+ goto out_agbp_relse;
}
- xfs_trans_binval(tp, bp);
}
targs.tp = tp;
@@ -2949,18 +3009,20 @@ out:
* after fixing up the freelist.
*/
int /* error */
-xfs_free_extent(
+__xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len, /* length of extent */
struct xfs_owner_info *oinfo, /* extent owner */
- enum xfs_ag_resv_type type) /* block reservation type */
+ enum xfs_ag_resv_type type, /* block reservation type */
+ bool skip_discard)
{
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *agbp;
xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
int error;
+ unsigned int busy_flags = 0;
ASSERT(len != 0);
ASSERT(type != XFS_AG_RESV_AGFL);
@@ -2984,7 +3046,9 @@ xfs_free_extent(
if (error)
goto err;
- xfs_extent_busy_insert(tp, agno, agbno, len, 0);
+ if (skip_discard)
+ busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
+ xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags);
return 0;
err:
@@ -3116,3 +3180,40 @@ xfs_alloc_has_record(
return xfs_btree_has_record(cur, &low, &high, exists);
}
+
+/*
+ * Walk all the blocks in the AGFL. The @walk_fn can return any negative
+ * error code or XFS_BTREE_QUERY_RANGE_ABORT.
+ */
+int /* 0, or the first nonzero @walk_fn return value */
+xfs_agfl_walk(
+ struct xfs_mount *mp,
+ struct xfs_agf *agf, /* supplies flfirst, fllast and flcount */
+ struct xfs_buf *agflbp, /* buffer holding the AGFL block numbers */
+ xfs_agfl_walk_fn walk_fn, /* called once per AGFL entry */
+ void *priv) /* passed through to @walk_fn untouched */
+{
+ __be32 *agfl_bno;
+ unsigned int i;
+ int error;
+
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
+ i = be32_to_cpu(agf->agf_flfirst);
+
+ /* Nothing to walk in an empty AGFL. */
+ if (agf->agf_flcount == cpu_to_be32(0))
+ return 0;
+
+ /* Otherwise, walk from first to last, wrapping as needed. */
+ for (;;) {
+ error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
+ if (error)
+ return error; /* any nonzero return stops the walk */
+ if (i == be32_to_cpu(agf->agf_fllast))
+ break; /* the last entry has been visited */
+ if (++i == xfs_agfl_size(mp))
+ i = 0; /* wrap around to the first slot */
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index cbf789ea5a4e..0747adcd57d6 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -191,12 +191,24 @@ xfs_alloc_vextent(
* Free an extent.
*/
int /* error */
-xfs_free_extent(
+__xfs_free_extent(
struct xfs_trans *tp, /* transaction pointer */
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len, /* length of extent */
struct xfs_owner_info *oinfo, /* extent owner */
- enum xfs_ag_resv_type type); /* block reservation type */
+ enum xfs_ag_resv_type type, /* block reservation type */
+ bool skip_discard);
+
+static inline int /* error */
+xfs_free_extent(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t bno, /* starting block number of extent */
+ xfs_extlen_t len, /* length of extent */
+ struct xfs_owner_info *oinfo, /* extent owner */
+ enum xfs_ag_resv_type type) /* block reservation type */
+{
+ return __xfs_free_extent(tp, bno, len, oinfo, type, false); /* skip_discard is always false here */
+}
int /* error */
xfs_alloc_lookup_le(
@@ -223,6 +235,8 @@ int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, struct xfs_buf **bpp);
+int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t,
+ struct xfs_buf *, struct xfs_owner_info *);
int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
struct xfs_buf **agbp);
@@ -248,4 +262,9 @@ bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno);
int xfs_alloc_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, bool *exist);
+typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno,
+ void *priv);
+int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf,
+ struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv);
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index b451649ba176..18aec7a0e599 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -547,3 +547,12 @@ xfs_allocbt_maxrecs(
return blocklen / sizeof(xfs_alloc_rec_t);
return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
}
+
+/* Calculate the freespace btree size for some records. */
+xfs_extlen_t
+xfs_allocbt_calc_size(
+ struct xfs_mount *mp, /* supplies the m_alloc_mnr limits */
+ unsigned long long len) /* presumably the record count - confirm */
+{
+ return xfs_btree_calc_size(mp->m_alloc_mnr, len);
+}
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 45e189e7e81c..2fd54728871c 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -61,5 +61,7 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_buf *,
xfs_agnumber_t, xfs_btnum_t);
extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
+extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 35a124400d60..c3d02a66d39d 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -236,7 +236,7 @@ xfs_attr_set(
args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
args.total = xfs_attr_calc_size(&args, &local);
- error = xfs_qm_dqattach(dp, 0);
+ error = xfs_qm_dqattach(dp);
if (error)
return error;
@@ -427,7 +427,7 @@ xfs_attr_remove(
*/
args.op_flags = XFS_DA_OP_OKNOENT;
- error = xfs_qm_dqattach(dp, 0);
+ error = xfs_qm_dqattach(dp);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 21be186067a2..83a6d3c7f872 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -620,7 +620,7 @@ xfs_attr_rmtval_remove(
/*
* If the "remote" value is in the cache, remove it.
*/
- bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
+ bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
if (bp) {
xfs_buf_stale(bp);
xfs_buf_relse(bp);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 040eeda8426f..7b0e2b551e23 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -246,7 +246,7 @@ xfs_bmap_get_bp(
struct xfs_btree_cur *cur,
xfs_fsblock_t bno)
{
- struct xfs_log_item_desc *lidp;
+ struct xfs_log_item *lip;
int i;
if (!cur)
@@ -260,9 +260,9 @@ xfs_bmap_get_bp(
}
/* Chase down all the log items to see if the bp is there */
- list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) {
- struct xfs_buf_log_item *bip;
- bip = (struct xfs_buf_log_item *)lidp->lid_item;
+ list_for_each_entry(lip, &cur->bc_tp->t_items, li_trans) {
+ struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip;
+
if (bip->bli_item.li_type == XFS_LI_BUF &&
XFS_BUF_ADDR(bip->bli_buf) == bno)
return bip->bli_buf;
@@ -312,8 +312,9 @@ xfs_check_block(
xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
__func__, j, i,
(unsigned long long)be64_to_cpu(*thispa));
- panic("%s: ptrs are equal in node\n",
+ xfs_err(mp, "%s: ptrs are equal in node\n",
__func__);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
}
}
@@ -483,7 +484,8 @@ error0:
error_norelse:
xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
__func__, i);
- panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
+ xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return;
}
@@ -542,12 +544,13 @@ xfs_bmap_validate_ret(
* The list is maintained sorted (by block number).
*/
void
-xfs_bmap_add_free(
+__xfs_bmap_add_free(
struct xfs_mount *mp,
struct xfs_defer_ops *dfops,
xfs_fsblock_t bno,
xfs_filblks_t len,
- struct xfs_owner_info *oinfo)
+ struct xfs_owner_info *oinfo,
+ bool skip_discard)
{
struct xfs_extent_free_item *new; /* new element */
#ifdef DEBUG
@@ -574,6 +577,7 @@ xfs_bmap_add_free(
new->xefi_oinfo = *oinfo;
else
xfs_rmap_skip_owner_update(&new->xefi_oinfo);
+ new->xefi_skip_discard = skip_discard;
trace_xfs_bmap_free_defer(mp, XFS_FSB_TO_AGNO(mp, bno), 0,
XFS_FSB_TO_AGBNO(mp, bno), len);
xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
@@ -2001,10 +2005,13 @@ xfs_bmap_add_extent_delay_real(
ASSERT(0);
}
- /* add reverse mapping */
- error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new);
- if (error)
- goto done;
+ /* add reverse mapping unless caller opted out */
+ if (!(bma->flags & XFS_BMAPI_NORMAP)) {
+ error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip,
+ whichfork, new);
+ if (error)
+ goto done;
+ }
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
@@ -2668,7 +2675,8 @@ xfs_bmap_add_extent_hole_real(
struct xfs_bmbt_irec *new,
xfs_fsblock_t *first,
struct xfs_defer_ops *dfops,
- int *logflagsp)
+ int *logflagsp,
+ int flags)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
@@ -2845,10 +2853,12 @@ xfs_bmap_add_extent_hole_real(
break;
}
- /* add reverse mapping */
- error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new);
- if (error)
- goto done;
+ /* add reverse mapping unless caller opted out */
+ if (!(flags & XFS_BMAPI_NORMAP)) {
+ error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new);
+ if (error)
+ goto done;
+ }
/* convert to a btree if necessary */
if (xfs_bmap_needs_btree(ip, whichfork)) {
@@ -4123,7 +4133,8 @@ xfs_bmapi_allocate(
else
error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
whichfork, &bma->icur, &bma->cur, &bma->got,
- bma->firstblock, bma->dfops, &bma->logflags);
+ bma->firstblock, bma->dfops, &bma->logflags,
+ bma->flags);
bma->logflags |= tmp_logflags;
if (error)
@@ -4509,30 +4520,37 @@ error0:
return error;
}
-static int
+int
xfs_bmapi_remap(
struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t bno,
xfs_filblks_t len,
xfs_fsblock_t startblock,
- struct xfs_defer_ops *dfops)
+ struct xfs_defer_ops *dfops,
+ int flags)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ struct xfs_ifork *ifp;
struct xfs_btree_cur *cur = NULL;
xfs_fsblock_t firstblock = NULLFSBLOCK;
struct xfs_bmbt_irec got;
struct xfs_iext_cursor icur;
+ int whichfork = xfs_bmapi_whichfork(flags);
int logflags = 0, error;
+ ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(len > 0);
ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
+ XFS_BMAPI_NORMAP)));
+ ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) !=
+ (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC));
if (unlikely(XFS_TEST_ERROR(
- (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
- XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+ (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
mp, XFS_ERRTAG_BMAPIFORMAT))) {
XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
@@ -4542,7 +4560,7 @@ xfs_bmapi_remap(
return -EIO;
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
}
@@ -4557,7 +4575,7 @@ xfs_bmapi_remap(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
if (ifp->if_flags & XFS_IFBROOT) {
- cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = firstblock;
cur->bc_private.b.dfops = dfops;
cur->bc_private.b.flags = 0;
@@ -4566,18 +4584,21 @@ xfs_bmapi_remap(
got.br_startoff = bno;
got.br_startblock = startblock;
got.br_blockcount = len;
- got.br_state = XFS_EXT_NORM;
+ if (flags & XFS_BMAPI_PREALLOC)
+ got.br_state = XFS_EXT_UNWRITTEN;
+ else
+ got.br_state = XFS_EXT_NORM;
- error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &icur,
- &cur, &got, &firstblock, dfops, &logflags);
+ error = xfs_bmap_add_extent_hole_real(tp, ip, whichfork, &icur,
+ &cur, &got, &firstblock, dfops, &logflags, flags);
if (error)
goto error0;
- if (xfs_bmap_wants_extents(ip, XFS_DATA_FORK)) {
+ if (xfs_bmap_wants_extents(ip, whichfork)) {
int tmp_logflags = 0;
error = xfs_bmap_btree_to_extents(tp, ip, cur,
- &tmp_logflags, XFS_DATA_FORK);
+ &tmp_logflags, whichfork);
logflags |= tmp_logflags;
}
@@ -5104,9 +5125,12 @@ xfs_bmap_del_extent_real(
error = xfs_refcount_decrease_extent(mp, dfops, del);
if (error)
goto done;
- } else
- xfs_bmap_add_free(mp, dfops, del->br_startblock,
- del->br_blockcount, NULL);
+ } else {
+ __xfs_bmap_add_free(mp, dfops, del->br_startblock,
+ del->br_blockcount, NULL,
+ (bflags & XFS_BMAPI_NODISCARD) ||
+ del->br_state == XFS_EXT_UNWRITTEN);
+ }
}
/*
@@ -6148,7 +6172,7 @@ xfs_bmap_finish_one(
switch (type) {
case XFS_BMAP_MAP:
error = xfs_bmapi_remap(tp, ip, startoff, *blockcount,
- startblock, dfops);
+ startblock, dfops, 0);
*blockcount = 0;
break;
case XFS_BMAP_UNMAP:
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 2b766b37096d..2c233f9f1a26 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -68,6 +68,7 @@ struct xfs_extent_free_item
xfs_extlen_t xefi_blockcount;/* number of blocks in extent */
struct list_head xefi_list;
struct xfs_owner_info xefi_oinfo; /* extent owner */
+ bool xefi_skip_discard;
};
#define XFS_BMAP_MAX_NMAP 4
@@ -116,6 +117,12 @@ struct xfs_extent_free_item
/* Only convert unwritten extents, don't allocate new blocks */
#define XFS_BMAPI_CONVERT_ONLY 0x800
+/* Skip online discard of freed extents */
+#define XFS_BMAPI_NODISCARD 0x1000
+
+/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
+#define XFS_BMAPI_NORMAP 0x2000
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -128,7 +135,9 @@ struct xfs_extent_free_item
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
{ XFS_BMAPI_DELALLOC, "DELALLOC" }, \
- { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }
+ { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \
+ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \
+ { XFS_BMAPI_NORMAP, "NORMAP" }
static inline int xfs_bmapi_aflag(int w)
@@ -192,9 +201,9 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
-void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+void __xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
xfs_fsblock_t bno, xfs_filblks_t len,
- struct xfs_owner_info *oinfo);
+ struct xfs_owner_info *oinfo, bool skip_discard);
void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
@@ -240,6 +249,17 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
int eof);
+/*
+ * Queue an extent for deferred freeing.  Thin wrapper around
+ * __xfs_bmap_add_free() that always passes skip_discard == false.
+ */
+static inline void
+xfs_bmap_add_free(
+ struct xfs_mount *mp,
+ struct xfs_defer_ops *dfops,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ struct xfs_owner_info *oinfo)
+{
+ __xfs_bmap_add_free(mp, dfops, bno, len, oinfo, false);
+}
+
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
XFS_BMAP_UNMAP,
@@ -277,4 +297,8 @@ static inline int xfs_bmap_fork_to_state(int whichfork)
xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *irec);
+int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
+ struct xfs_defer_ops *dfops, int flags);
+
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index d89d06bea6e3..ac9d4aeedb09 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -660,3 +660,12 @@ xfs_bmbt_change_owner(
xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
return error;
}
+
+/*
+ * Calculate the bmap btree size for some records: returns the result of
+ * xfs_btree_calc_size() for @len records using the mp->m_bmap_dmnr limits.
+ */
+unsigned long long
+xfs_bmbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_bmap_dmnr, len);
+}
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index e4505746ccaa..fb3cd2d9e0f8 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -118,4 +118,7 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
+extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
+
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index ac7d66427e42..c825c8182b30 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4836,14 +4836,14 @@ xfs_btree_query_all(
* Calculate the number of blocks needed to store a given number of records
* in a short-format (per-AG metadata) btree.
*/
-xfs_extlen_t
+unsigned long long
xfs_btree_calc_size(
uint *limits,
unsigned long long len)
{
int level;
int maxrecs;
- xfs_extlen_t rval;
+ unsigned long long rval;
maxrecs = limits[0];
for (level = 0, rval = 0; len > 1; level++) {
@@ -4919,3 +4919,24 @@ xfs_btree_has_record(
*exists = false;
return error;
}
+
+/*
+ * Report whether the cursor still has records ahead of it: either the
+ * level-0 block holds entries beyond bc_ptrs[0], or that block has a right
+ * sibling.
+ */
+bool
+xfs_btree_has_more_records(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_buf *leaf_bp;
+ struct xfs_btree_block *leaf = xfs_btree_get_block(cur, 0, &leaf_bp);
+
+ /* The current block is not exhausted yet. */
+ if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(leaf))
+ return true;
+
+ /* Otherwise the answer is whether a right sibling block exists. */
+ if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS))
+ return leaf->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
+ return leaf->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 9227159a751e..d7911efee6dc 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -482,7 +482,7 @@ xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
unsigned int max_recs);
uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len);
-xfs_extlen_t xfs_btree_calc_size(uint *limits, unsigned long long len);
+unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len);
/* return codes */
#define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */
@@ -528,5 +528,6 @@ union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur,
union xfs_btree_key *key);
int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low,
union xfs_btree_irec *high, bool *exists);
+bool xfs_btree_has_more_records(struct xfs_btree_cur *cur);
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 087fea02c389..3daf175e2535 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -220,7 +220,7 @@ xfs_defer_trans_abort(
{
struct xfs_defer_pending *dfp;
- trace_xfs_defer_trans_abort(tp->t_mountp, dop);
+ trace_xfs_defer_trans_abort(tp->t_mountp, dop, _RET_IP_);
/* Abort intent items that don't have a done item. */
list_for_each_entry(dfp, &dop->dop_pending, dfp_list) {
@@ -253,7 +253,7 @@ xfs_defer_trans_roll(
for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++)
xfs_trans_dirty_buf(*tp, dop->dop_bufs[i]);
- trace_xfs_defer_trans_roll((*tp)->t_mountp, dop);
+ trace_xfs_defer_trans_roll((*tp)->t_mountp, dop, _RET_IP_);
/* Roll the transaction. */
error = xfs_trans_roll(tp);
@@ -352,10 +352,21 @@ xfs_defer_finish(
void *state;
int error = 0;
void (*cleanup_fn)(struct xfs_trans *, void *, int);
+ struct xfs_defer_ops *orig_dop;
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
- trace_xfs_defer_finish((*tp)->t_mountp, dop);
+ trace_xfs_defer_finish((*tp)->t_mountp, dop, _RET_IP_);
+
+ /*
+ * Attach dfops to the transaction during deferred ops processing. This
+ * explicitly causes calls into the allocator to defer AGFL block frees.
+ * Note that this code can go away once all dfops users attach to the
+ * associated tp.
+ */
+ ASSERT(!(*tp)->t_agfl_dfops || ((*tp)->t_agfl_dfops == dop));
+ orig_dop = (*tp)->t_agfl_dfops;
+ (*tp)->t_agfl_dfops = dop;
/* Until we run out of pending work to finish... */
while (xfs_defer_has_unfinished_work(dop)) {
@@ -428,10 +439,11 @@ xfs_defer_finish(
}
out:
+ (*tp)->t_agfl_dfops = orig_dop;
if (error)
trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error);
else
- trace_xfs_defer_finish_done((*tp)->t_mountp, dop);
+ trace_xfs_defer_finish_done((*tp)->t_mountp, dop, _RET_IP_);
return error;
}
@@ -447,7 +459,7 @@ xfs_defer_cancel(
struct list_head *pwi;
struct list_head *n;
- trace_xfs_defer_cancel(NULL, dop);
+ trace_xfs_defer_cancel(NULL, dop, _RET_IP_);
/*
* Free the pending items. Caller should already have arranged
@@ -532,5 +544,5 @@ xfs_defer_init(
*fbp = NULLFSBLOCK;
INIT_LIST_HEAD(&dop->dop_intake);
INIT_LIST_HEAD(&dop->dop_pending);
- trace_xfs_defer_init(NULL, dop);
+ trace_xfs_defer_init(NULL, dop, _RET_IP_);
}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 045beacdd37d..e70725ba1f5f 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -55,6 +55,7 @@ enum xfs_defer_ops_type {
XFS_DEFER_OPS_TYPE_REFCOUNT,
XFS_DEFER_OPS_TYPE_RMAP,
XFS_DEFER_OPS_TYPE_FREE,
+ XFS_DEFER_OPS_TYPE_AGFL_FREE,
XFS_DEFER_OPS_TYPE_MAX,
};
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 8b7a6c3cb599..cce520becee4 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -41,14 +41,18 @@ xfs_calc_dquots_per_chunk(
/*
* Do some primitive error checking on ondisk dquot data structures.
+ *
+ * The xfs_dqblk structure /contains/ the xfs_disk_dquot structure;
+ * we verify them separately because at some points we have only the
+ * smaller xfs_disk_dquot structure available.
*/
+
xfs_failaddr_t
xfs_dquot_verify(
struct xfs_mount *mp,
xfs_disk_dquot_t *ddq,
xfs_dqid_t id,
- uint type, /* used only when IO_dorepair is true */
- uint flags)
+ uint type) /* used only during quotacheck */
{
/*
* We can encounter an uninitialized dquot buffer for 2 reasons:
@@ -70,6 +74,8 @@ xfs_dquot_verify(
if (ddq->d_version != XFS_DQUOT_VERSION)
return __this_address;
+ if (type && ddq->d_flags != type)
+ return __this_address;
if (ddq->d_flags != XFS_DQ_USER &&
ddq->d_flags != XFS_DQ_PROJ &&
ddq->d_flags != XFS_DQ_GROUP)
@@ -99,33 +105,44 @@ xfs_dquot_verify(
return NULL;
}
+/*
+ * Verify one ondisk dquot block entry.  When xfs_sb_version_hascrc() is true
+ * for this mount, dd_uuid must equal the superblock's sb_meta_uuid or
+ * __this_address is returned; otherwise the embedded dd_diskdq is handed to
+ * xfs_dquot_verify() and its result is returned.
+ */
+xfs_failaddr_t
+xfs_dqblk_verify(
+ struct xfs_mount *mp,
+ struct xfs_dqblk *dqb,
+ xfs_dqid_t id,
+ uint type) /* used only during quotacheck */
+{
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid))
+ return __this_address;
+
+ return xfs_dquot_verify(mp, &dqb->dd_diskdq, id, type);
+}
+
/*
* Do some primitive error checking on ondisk dquot data structures.
*/
int
-xfs_dquot_repair(
+xfs_dqblk_repair(
struct xfs_mount *mp,
- struct xfs_disk_dquot *ddq,
+ struct xfs_dqblk *dqb,
xfs_dqid_t id,
uint type)
{
- struct xfs_dqblk *d = (struct xfs_dqblk *)ddq;
-
-
/*
* Typically, a repair is only requested by quotacheck.
*/
ASSERT(id != -1);
- memset(d, 0, sizeof(xfs_dqblk_t));
+ memset(dqb, 0, sizeof(xfs_dqblk_t));
- d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
- d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
- d->dd_diskdq.d_flags = type;
- d->dd_diskdq.d_id = cpu_to_be32(id);
+ dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+ dqb->dd_diskdq.d_flags = type;
+ dqb->dd_diskdq.d_id = cpu_to_be32(id);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid);
- xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+ uuid_copy(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid);
+ xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
}
@@ -135,7 +152,8 @@ xfs_dquot_repair(
STATIC bool
xfs_dquot_buf_verify_crc(
struct xfs_mount *mp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ bool readahead)
{
struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
int ndquots;
@@ -156,10 +174,12 @@ xfs_dquot_buf_verify_crc(
for (i = 0; i < ndquots; i++, d++) {
if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
- XFS_DQUOT_CRC_OFF))
- return false;
- if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_meta_uuid))
+ XFS_DQUOT_CRC_OFF)) {
+ if (!readahead)
+ xfs_buf_verifier_error(bp, -EFSBADCRC, __func__,
+ d, sizeof(*d), __this_address);
return false;
+ }
}
return true;
}
@@ -167,9 +187,10 @@ xfs_dquot_buf_verify_crc(
STATIC xfs_failaddr_t
xfs_dquot_buf_verify(
struct xfs_mount *mp,
- struct xfs_buf *bp)
+ struct xfs_buf *bp,
+ bool readahead)
{
- struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
+ struct xfs_dqblk *dqb = bp->b_addr;
xfs_failaddr_t fa;
xfs_dqid_t id = 0;
int ndquots;
@@ -195,14 +216,19 @@ xfs_dquot_buf_verify(
for (i = 0; i < ndquots; i++) {
struct xfs_disk_dquot *ddq;
- ddq = &d[i].dd_diskdq;
+ ddq = &dqb[i].dd_diskdq;
if (i == 0)
id = be32_to_cpu(ddq->d_id);
- fa = xfs_dquot_verify(mp, ddq, id + i, 0, 0);
- if (fa)
+ fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0);
+ if (fa) {
+ if (!readahead)
+ xfs_buf_verifier_error(bp, -EFSCORRUPTED,
+ __func__, &dqb[i],
+ sizeof(struct xfs_dqblk), fa);
return fa;
+ }
}
return NULL;
@@ -214,7 +240,7 @@ xfs_dquot_buf_verify_struct(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- return xfs_dquot_buf_verify(mp, bp);
+ return xfs_dquot_buf_verify(mp, bp, false);
}
static void
@@ -222,15 +248,10 @@ xfs_dquot_buf_read_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- xfs_failaddr_t fa;
- if (!xfs_dquot_buf_verify_crc(mp, bp))
- xfs_verifier_error(bp, -EFSBADCRC, __this_address);
- else {
- fa = xfs_dquot_buf_verify(mp, bp);
- if (fa)
- xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
- }
+ if (!xfs_dquot_buf_verify_crc(mp, bp, false))
+ return;
+ xfs_dquot_buf_verify(mp, bp, false);
}
/*
@@ -245,8 +266,8 @@ xfs_dquot_buf_readahead_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- if (!xfs_dquot_buf_verify_crc(mp, bp) ||
- xfs_dquot_buf_verify(mp, bp) != NULL) {
+ if (!xfs_dquot_buf_verify_crc(mp, bp, true) ||
+ xfs_dquot_buf_verify(mp, bp, true) != NULL) {
xfs_buf_ioerror(bp, -EIO);
bp->b_flags &= ~XBF_DONE;
}
@@ -262,11 +283,8 @@ xfs_dquot_buf_write_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- xfs_failaddr_t fa;
- fa = xfs_dquot_buf_verify(mp, bp);
- if (fa)
- xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
+ xfs_dquot_buf_verify(mp, bp, false);
}
const struct xfs_buf_ops xfs_dquot_buf_ops = {
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index bc1789d95152..d47b91625945 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -65,7 +65,8 @@
#define XFS_ERRTAG_LOG_BAD_CRC 29
#define XFS_ERRTAG_LOG_ITEM_PIN 30
#define XFS_ERRTAG_BUF_LRU_REF 31
-#define XFS_ERRTAG_MAX 32
+#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32
+#define XFS_ERRTAG_MAX 33
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -102,5 +103,6 @@
#define XFS_RANDOM_LOG_BAD_CRC 1
#define XFS_RANDOM_LOG_ITEM_PIN 1
#define XFS_RANDOM_BUF_LRU_REF 2
+#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 42956d8d95ed..c1cb29a5f4f6 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -98,6 +98,9 @@ struct xfs_ifork;
XFS_SB_VERSION2_PROJID32BIT | \
XFS_SB_VERSION2_FTYPE)
+/* Maximum size of the xfs filesystem label, no terminating NUL */
+#define XFSLABEL_MAX 12
+
/*
* Superblock - in core version. Must match the ondisk version below.
* Must be padded to 64 bit alignment.
@@ -122,7 +125,7 @@ typedef struct xfs_sb {
uint16_t sb_sectsize; /* volume sector size, bytes */
uint16_t sb_inodesize; /* inode size, bytes */
uint16_t sb_inopblock; /* inodes per block */
- char sb_fname[12]; /* file system name */
+ char sb_fname[XFSLABEL_MAX]; /* file system name */
uint8_t sb_blocklog; /* log2 of sb_blocksize */
uint8_t sb_sectlog; /* log2 of sb_sectsize */
uint8_t sb_inodelog; /* log2 of sb_inodesize */
@@ -213,7 +216,7 @@ typedef struct xfs_dsb {
__be16 sb_sectsize; /* volume sector size, bytes */
__be16 sb_inodesize; /* inode size, bytes */
__be16 sb_inopblock; /* inodes per block */
- char sb_fname[12]; /* file system name */
+ char sb_fname[XFSLABEL_MAX]; /* file system name */
__u8 sb_blocklog; /* log2 of sb_blocksize */
__u8 sb_sectlog; /* log2 of sb_sectsize */
__u8 sb_inodelog; /* log2 of sb_inodesize */
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index faf1a4edd618..dddc75e4f1f6 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -542,13 +542,20 @@ struct xfs_scrub_metadata {
/* o: Metadata object looked funny but isn't corrupt. */
#define XFS_SCRUB_OFLAG_WARNING (1 << 6)
+/*
+ * o: IFLAG_REPAIR was set but metadata object did not need fixing or
+ * optimization and has therefore not been altered.
+ */
+#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7)
+
#define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR)
#define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \
XFS_SCRUB_OFLAG_PREEN | \
XFS_SCRUB_OFLAG_XFAIL | \
XFS_SCRUB_OFLAG_XCORRUPT | \
XFS_SCRUB_OFLAG_INCOMPLETE | \
- XFS_SCRUB_OFLAG_WARNING)
+ XFS_SCRUB_OFLAG_WARNING | \
+ XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED)
#define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT)
/*
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index de627fa19168..4ca4ff7a757d 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -148,7 +148,7 @@ xfs_inobt_get_rec(
/*
* Insert a single inobt record. Cursor must already point to desired location.
*/
-STATIC int
+int
xfs_inobt_insert_rec(
struct xfs_btree_cur *cur,
uint16_t holemask,
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index c5402bb4ce0c..77fffced8bac 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -176,6 +176,9 @@ int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low,
xfs_agino_t high, bool *exists);
int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count,
xfs_agino_t *freecount);
+int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
+ uint8_t count, int32_t freecount, xfs_inofree_t free,
+ int *stat);
int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
void xfs_ialloc_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 367e9a0726e6..b04c55512159 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -296,7 +296,7 @@ xfs_inobt_verify(
case cpu_to_be32(XFS_FIBT_MAGIC):
break;
default:
- return NULL;
+ return __this_address;
}
/* level verification */
@@ -608,3 +608,12 @@ xfs_finobt_calc_reserves(
*used += tree_len;
return 0;
}
+
+/*
+ * Calculate the inobt btree size for some records, using the
+ * mp->m_inobt_mnr limits.
+ *
+ * NOTE(review): xfs_btree_calc_size() returns unsigned long long, so the
+ * result is narrowed to xfs_extlen_t here -- confirm it cannot overflow.
+ */
+xfs_extlen_t
+xfs_iallocbt_calc_size(
+ struct xfs_mount *mp,
+ unsigned long long len)
+{
+ return xfs_btree_calc_size(mp->m_inobt_mnr, len);
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index aa81e2e63f3f..4acdd5458d59 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -74,5 +74,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
xfs_extlen_t *ask, xfs_extlen_t *used);
+extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
+ unsigned long long len);
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index bb1b13a9b5f4..d4af2804b178 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -107,14 +107,12 @@ typedef uint16_t xfs_qwarncnt_t;
* to a single function. None of these XFS_QMOPT_* flags are meant to have
* persistent values (ie. their values can and will change between versions)
*/
-#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */
#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */
#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */
#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */
#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */
#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */
#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */
-#define XFS_QMOPT_DQNEXT 0x0008000 /* return next dquot >= this ID */
/*
* flags to xfs_trans_mod_dquot to indicate which field needs to be
@@ -152,10 +150,11 @@ typedef uint16_t xfs_qwarncnt_t;
#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp,
- struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type,
- uint flags);
+ struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type);
+extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp,
+ struct xfs_dqblk *dqb, xfs_dqid_t id, uint type);
extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
-extern int xfs_dquot_repair(struct xfs_mount *mp, struct xfs_disk_dquot *ddq,
+extern int xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb,
xfs_dqid_t id, uint type);
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 560e28473024..418d53295893 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -88,8 +88,25 @@ xfs_refcount_lookup_ge(
return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
}
+/*
+ * Look up the record equal to [bno] in the btree given by cur.  The lookup
+ * key is built from bno with a zero block count, and stat is passed through
+ * to xfs_btree_lookup().
+ */
+int
+xfs_refcount_lookup_eq(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ int *stat)
+{
+ /* Trace the direction actually used below (EQ, not LE). */
+ trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+ XFS_LOOKUP_EQ);
+ cur->bc_rec.rc.rc_startblock = bno;
+ cur->bc_rec.rc.rc_blockcount = 0;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
/* Convert on-disk record to in-core format. */
-static inline void
+void
xfs_refcount_btrec_to_irec(
union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec)
@@ -149,7 +166,7 @@ xfs_refcount_update(
* by [bno, len, refcount].
* This either works (return 0) or gets an EFSCORRUPTED error.
*/
-STATIC int
+int
xfs_refcount_insert(
struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec,
@@ -162,7 +179,10 @@ xfs_refcount_insert(
cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
error = xfs_btree_insert(cur, i);
+ if (error)
+ goto out_error;
XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+
out_error:
if (error)
trace_xfs_refcount_insert_error(cur->bc_mp,
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 2a731ac68fe4..a92ad9078bc1 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -24,6 +24,8 @@ extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
xfs_agblock_t bno, int *stat);
extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur,
xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur,
+ xfs_agblock_t bno, int *stat);
extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
@@ -85,5 +87,10 @@ static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
+union xfs_btree_rec;
+extern void xfs_refcount_btrec_to_irec(union xfs_btree_rec *rec,
+ struct xfs_refcount_irec *irec);
+extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
+ struct xfs_refcount_irec *irec, int *stat);
#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index fba8d2718017..c0644f1be8a8 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -1374,6 +1374,8 @@ xfs_rmap_convert_shared(
*/
error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
&PREV, &i);
+ if (error)
+ goto done;
XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
ASSERT(PREV.rm_offset <= offset);
@@ -2030,6 +2032,34 @@ out_error:
return error;
}
+/*
+ * Add one rmap record to the rmapbt exactly as described by @rmap.  The
+ * record's owner, offset and attr-fork/bmbt-block flags are repacked into an
+ * owner info.  Records with no flags set and an owner for which
+ * XFS_RMAP_NON_INODE_OWNER() is false go through xfs_rmap_map_shared();
+ * everything else goes through xfs_rmap_map().
+ */
+int
+xfs_rmap_map_raw(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rmap)
+{
+ struct xfs_owner_info owner;
+
+ owner.oi_owner = rmap->rm_owner;
+ owner.oi_offset = rmap->rm_offset;
+ owner.oi_flags = (rmap->rm_flags & XFS_RMAP_ATTR_FORK) ?
+ XFS_OWNER_INFO_ATTR_FORK : 0;
+ if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK)
+ owner.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+
+ /* Only flag-less records without a non-inode owner take the shared path. */
+ if (!rmap->rm_flags && !XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+ return xfs_rmap_map_shared(cur, rmap->rm_startblock,
+ rmap->rm_blockcount,
+ rmap->rm_flags & XFS_RMAP_UNWRITTEN,
+ &owner);
+
+ return xfs_rmap_map(cur, rmap->rm_startblock,
+ rmap->rm_blockcount,
+ rmap->rm_flags & XFS_RMAP_UNWRITTEN,
+ &owner);
+}
+
struct xfs_rmap_query_range_info {
xfs_rmap_query_range_fn fn;
void *priv;
@@ -2453,3 +2483,56 @@ xfs_rmap_record_exists(
irec.rm_startblock + irec.rm_blockcount >= bno + len);
return 0;
}
+
+/*
+ * Search state for xfs_rmap_has_other_keys(): the owner, offset and flags
+ * unpacked from the caller's owner info, plus the result flag.
+ */
+struct xfs_rmap_key_state {
+ uint64_t owner;
+ uint64_t offset;
+ unsigned int flags;
+ bool has_rmap; /* set once a record with a different key is seen */
+};
+
+/*
+ * Range-query callback: keep walking (return 0) while records carry the key
+ * we were given; on the first record that differs, note it in the search
+ * state and abort the walk.
+ */
+STATIC int
+xfs_rmap_has_other_keys_helper(
+ struct xfs_btree_cur *cur,
+ struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xfs_rmap_key_state *want = priv;
+ bool same_key;
+
+ same_key = want->owner == rec->rm_owner &&
+ want->offset == rec->rm_offset &&
+ ((want->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == want->flags;
+ if (!same_key) {
+ want->has_rmap = true;
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+ }
+ return 0;
+}
+
+/*
+ * Given an extent and some owner info, can we find records overlapping
+ * the extent whose owner info does not match the given owner?
+ *
+ * The answer is stored in *has_rmap.  Returns 0 on success, or the negative
+ * error from the range query.
+ */
+int
+xfs_rmap_has_other_keys(
+ struct xfs_btree_cur *cur,
+ xfs_agblock_t bno,
+ xfs_extlen_t len,
+ struct xfs_owner_info *oinfo,
+ bool *has_rmap)
+{
+ struct xfs_rmap_irec low = {0};
+ struct xfs_rmap_irec high;
+ struct xfs_rmap_key_state rks;
+ int error;
+
+ xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags);
+ rks.has_rmap = false;
+
+ /* Query every record whose start block lies in [bno, bno + len - 1]. */
+ low.rm_startblock = bno;
+ memset(&high, 0xFF, sizeof(high));
+ high.rm_startblock = bno + len - 1;
+
+ error = xfs_rmap_query_range(cur, &low, &high,
+ xfs_rmap_has_other_keys_helper, &rks);
+ *has_rmap = rks.has_rmap;
+
+ /*
+ * The helper stops the walk with XFS_BTREE_QUERY_RANGE_ABORT as soon as
+ * it has an answer.  That is not a failure, so only negative errors are
+ * passed on (assumes the query hands the helper's abort code back to us).
+ */
+ if (error < 0)
+ return error;
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 380e53be98d5..43e506f67680 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -238,5 +238,9 @@ int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno,
int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno,
xfs_extlen_t len, struct xfs_owner_info *oinfo,
bool *has_rmap);
+int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+ xfs_extlen_t len, struct xfs_owner_info *oinfo,
+ bool *has_rmap);
+int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap);
#endif /* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 106be2d0bb88..369eeb7a52ec 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -90,6 +90,9 @@ xfs_rtbuf_get(
if (error)
return error;
+ if (nmap == 0 || !xfs_bmap_is_real_extent(&map))
+ return -EFSCORRUPTED;
+
ASSERT(map.br_startblock != NULLFSBLOCK);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map.br_startblock),
@@ -1033,14 +1036,17 @@ xfs_rtalloc_query_range(
int is_free;
int error = 0;
- if (low_rec->ar_startblock > high_rec->ar_startblock)
+ if (low_rec->ar_startext > high_rec->ar_startext)
return -EINVAL;
- else if (low_rec->ar_startblock == high_rec->ar_startblock)
+ if (low_rec->ar_startext >= mp->m_sb.sb_rextents ||
+ low_rec->ar_startext == high_rec->ar_startext)
return 0;
+ if (high_rec->ar_startext >= mp->m_sb.sb_rextents)
+ high_rec->ar_startext = mp->m_sb.sb_rextents - 1;
/* Iterate the bitmap, looking for discrepancies. */
- rtstart = low_rec->ar_startblock;
- rem = high_rec->ar_startblock - rtstart;
+ rtstart = low_rec->ar_startext;
+ rem = high_rec->ar_startext - rtstart;
while (rem) {
/* Is the first block free? */
error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
@@ -1050,13 +1056,13 @@ xfs_rtalloc_query_range(
/* How long does the extent go for? */
error = xfs_rtfind_forw(mp, tp, rtstart,
- high_rec->ar_startblock - 1, &rtend);
+ high_rec->ar_startext - 1, &rtend);
if (error)
break;
if (is_free) {
- rec.ar_startblock = rtstart;
- rec.ar_blockcount = rtend - rtstart + 1;
+ rec.ar_startext = rtstart;
+ rec.ar_extcount = rtend - rtstart + 1;
error = fn(tp, &rec, priv);
if (error)
@@ -1079,9 +1085,9 @@ xfs_rtalloc_query_all(
{
struct xfs_rtalloc_rec keys[2];
- keys[0].ar_startblock = 0;
- keys[1].ar_startblock = tp->t_mountp->m_sb.sb_rblocks;
- keys[0].ar_blockcount = keys[1].ar_blockcount = 0;
+ keys[0].ar_startext = 0;
+ keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1;
+ keys[0].ar_extcount = keys[1].ar_extcount = 0;
return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);
}
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index d9b94bd5f689..d485e14313c6 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -888,6 +888,109 @@ xfs_sync_sb(
return xfs_trans_commit(tp);
}
+/*
+ * Update all the secondary superblocks to match the new state of the primary.
+ * Because we are completely overwriting all the existing fields in the
+ * secondary superblock buffers, there is no need to read them in from disk.
+ * Just get a new buffer, stamp it and write it.
+ *
+ * The sb buffers need to be cached here so that we serialise against other
+ * operations that access the secondary superblocks, but we don't want to keep
+ * them in memory once they are written, so we mark each as a one-shot buffer.
+ */
+int
+xfs_update_secondary_sbs(
+ struct xfs_mount *mp)
+{
+ xfs_agnumber_t agno;
+ int saved_error = 0;
+ int error = 0;
+ LIST_HEAD (buffer_list);
+
+ /* update secondary superblocks. */
+ for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) {
+ struct xfs_buf *bp;
+
+ bp = xfs_buf_get(mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_SB_DADDR),
+ XFS_FSS_TO_BB(mp, 1), 0);
+ /*
+ * If we get an error reading or writing alternate superblocks,
+ * continue. xfs_repair chooses the "best" superblock based
+ * on most matches; if we break early, we'll leave more
+ * superblocks un-updated than updated, and xfs_repair may
+ * pick them over the properly-updated primary.
+ */
+ if (!bp) {
+ xfs_warn(mp,
+ "error allocating secondary superblock for ag %d",
+ agno);
+ if (!saved_error)
+ saved_error = -ENOMEM;
+ continue;
+ }
+
+ bp->b_ops = &xfs_sb_buf_ops;
+ xfs_buf_oneshot(bp);
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+ xfs_buf_delwri_queue(bp, &buffer_list);
+ xfs_buf_relse(bp);
+
+ /* don't hold too many buffers at once */
+ if (agno % 16)
+ continue;
+
+ error = xfs_buf_delwri_submit(&buffer_list);
+ if (error) {
+ xfs_warn(mp,
+ "write error %d updating a secondary superblock near ag %d",
+ error, agno);
+ if (!saved_error)
+ saved_error = error;
+ continue;
+ }
+ }
+ error = xfs_buf_delwri_submit(&buffer_list);
+ if (error) {
+ xfs_warn(mp,
+ "write error %d updating a secondary superblock near ag %d",
+ error, agno);
+ }
+
+ return saved_error ? saved_error : error;
+}
+
+/*
+ * Same behavior as xfs_sync_sb, except that it is always synchronous and it
+ * also writes the superblock buffer to disk sector 0 immediately.
+ */
+int
+xfs_sync_sb_buf(
+ struct xfs_mount *mp)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_log_sb(tp);
+ xfs_trans_bhold(tp, mp->m_sb_bp);
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out;
+ /*
+ * write out the sb buffer to get the changes to disk
+ */
+ error = xfs_bwrite(mp->m_sb_bp);
+out:
+ xfs_buf_relse(mp->m_sb_bp);
+ return error;
+}
+
int
xfs_fs_geometry(
struct xfs_sb *sbp,
@@ -972,3 +1075,47 @@ xfs_fs_geometry(
return 0;
}
+
+/* Read a secondary superblock. */
+int
+xfs_sb_read_secondary(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(agno != 0 && agno != NULLAGNUMBER);
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+ if (error)
+ return error;
+ xfs_buf_set_ref(bp, XFS_SSB_REF);
+ *bpp = bp;
+ return 0;
+}
+
+/* Get an uninitialised secondary superblock buffer. */
+int
+xfs_sb_get_secondary(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ struct xfs_buf **bpp)
+{
+ struct xfs_buf *bp;
+
+ ASSERT(agno != 0 && agno != NULLAGNUMBER);
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+ XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0);
+ if (!bp)
+ return -ENOMEM;
+ bp->b_ops = &xfs_sb_buf_ops;
+ xfs_buf_oneshot(bp);
+ *bpp = bp;
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 63dcd2a1a657..244e0162c49e 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -18,6 +18,13 @@
#ifndef __XFS_SB_H__
#define __XFS_SB_H__
+struct xfs_mount;
+struct xfs_sb;
+struct xfs_dsb;
+struct xfs_trans;
+struct xfs_fsop_geom;
+struct xfs_perag;
+
/*
* perag get/put wrappers for ref counting
*/
@@ -29,13 +36,22 @@ extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
extern void xfs_log_sb(struct xfs_trans *tp);
extern int xfs_sync_sb(struct xfs_mount *mp, bool wait);
+extern int xfs_sync_sb_buf(struct xfs_mount *mp);
extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
+extern int xfs_update_secondary_sbs(struct xfs_mount *mp);
+
#define XFS_FS_GEOM_MAX_STRUCT_VER (4)
extern int xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo,
int struct_version);
+extern int xfs_sb_read_secondary(struct xfs_mount *mp,
+ struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf **bpp);
+extern int xfs_sb_get_secondary(struct xfs_mount *mp,
+ struct xfs_trans *tp, xfs_agnumber_t agno,
+ struct xfs_buf **bpp);
#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index d0b84da0cb1e..ae99c260adb1 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -57,21 +57,6 @@ extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
extern const struct xfs_buf_ops xfs_rtbuf_ops;
-/*
- * This structure is used to track log items associated with
- * a transaction. It points to the log item and keeps some
- * flags to track the state of the log item. It also tracks
- * the amount of space needed to log the item it describes
- * once we get to commit processing (see xfs_trans_commit()).
- */
-struct xfs_log_item_desc {
- struct xfs_log_item *lid_item;
- struct list_head lid_trans;
- unsigned char lid_flags;
-};
-
-#define XFS_LID_DIRTY 0x1
-
/* log size calculation functions */
int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
int xfs_log_calc_minimum_size(struct xfs_mount *);
@@ -127,6 +112,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
#define XFS_ATTR_BTREE_REF 1
#define XFS_DQUOT_REF 1
#define XFS_REFC_BTREE_REF 1
+#define XFS_SSB_REF 0
/*
* Flags for xfs_trans_ichgtime().
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 3c560695c546..ea18449bd732 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -30,7 +30,7 @@ typedef int64_t xfs_fsize_t; /* bytes in a file */
typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */
typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */
-typedef int32_t xfs_rtword_t; /* word type for bitmap manipulations */
+typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
typedef int64_t xfs_lsn_t; /* log sequence number */
typedef int32_t xfs_tid_t; /* transaction identifier */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 018aabbd9394..1f71793f7db4 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -38,68 +38,6 @@
#include "scrub/common.h"
#include "scrub/trace.h"
-/*
- * Walk all the blocks in the AGFL. The fn function can return any negative
- * error code or XFS_BTREE_QUERY_RANGE_ABORT.
- */
-int
-xfs_scrub_walk_agfl(
- struct xfs_scrub_context *sc,
- int (*fn)(struct xfs_scrub_context *,
- xfs_agblock_t bno, void *),
- void *priv)
-{
- struct xfs_agf *agf;
- __be32 *agfl_bno;
- struct xfs_mount *mp = sc->mp;
- unsigned int flfirst;
- unsigned int fllast;
- int i;
- int error;
-
- agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, sc->sa.agfl_bp);
- flfirst = be32_to_cpu(agf->agf_flfirst);
- fllast = be32_to_cpu(agf->agf_fllast);
-
- /* Nothing to walk in an empty AGFL. */
- if (agf->agf_flcount == cpu_to_be32(0))
- return 0;
-
- /* first to last is a consecutive list. */
- if (fllast >= flfirst) {
- for (i = flfirst; i <= fllast; i++) {
- error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
- if (error)
- return error;
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return error;
- }
-
- return 0;
- }
-
- /* first to the end */
- for (i = flfirst; i < xfs_agfl_size(mp); i++) {
- error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
- if (error)
- return error;
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return error;
- }
-
- /* the start to last. */
- for (i = 0; i <= fllast; i++) {
- error = fn(sc, be32_to_cpu(agfl_bno[i]), priv);
- if (error)
- return error;
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return error;
- }
-
- return 0;
-}
-
/* Superblock */
/* Cross-reference with the other btrees. */
@@ -157,9 +95,7 @@ xfs_scrub_superblock(
if (agno == 0)
return 0;
- error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+ error = xfs_sb_read_secondary(mp, sc->tp, agno, &bp);
/*
* The superblock verifier can return several different error codes
* if it thinks the superblock doesn't look right. For a mount these
@@ -680,6 +616,7 @@ struct xfs_scrub_agfl_info {
unsigned int sz_entries;
unsigned int nr_entries;
xfs_agblock_t *entries;
+ struct xfs_scrub_context *sc;
};
/* Cross-reference with the other btrees. */
@@ -701,12 +638,12 @@ xfs_scrub_agfl_block_xref(
/* Scrub an AGFL block. */
STATIC int
xfs_scrub_agfl_block(
- struct xfs_scrub_context *sc,
+ struct xfs_mount *mp,
xfs_agblock_t agbno,
void *priv)
{
- struct xfs_mount *mp = sc->mp;
struct xfs_scrub_agfl_info *sai = priv;
+ struct xfs_scrub_context *sc = sai->sc;
xfs_agnumber_t agno = sc->sa.agno;
if (xfs_verify_agbno(mp, agno, agbno) &&
@@ -717,6 +654,9 @@ xfs_scrub_agfl_block(
xfs_scrub_agfl_block_xref(sc, agbno, priv);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return XFS_BTREE_QUERY_RANGE_ABORT;
+
return 0;
}
@@ -796,8 +736,10 @@ xfs_scrub_agfl(
goto out;
}
memset(&sai, 0, sizeof(sai));
+ sai.sc = sc;
sai.sz_entries = agflcount;
- sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS);
+ sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount,
+ KM_MAYFAIL);
if (!sai.entries) {
error = -ENOMEM;
goto out;
@@ -805,7 +747,12 @@ xfs_scrub_agfl(
/* Check the blocks in the AGFL. */
xfs_rmap_ag_owner(&sai.oinfo, XFS_RMAP_OWN_AG);
- error = xfs_scrub_walk_agfl(sc, xfs_scrub_agfl_block, &sai);
+ error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp),
+ sc->sa.agfl_bp, xfs_scrub_agfl_block, &sai);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+ error = 0;
+ goto out_free;
+ }
if (error)
goto out_free;
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
new file mode 100644
index 000000000000..8b91e9ebe1e7
--- /dev/null
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/* Superblock */
+
+/* Repair the superblock. */
+int
+xfs_repair_superblock(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_buf *bp;
+ xfs_agnumber_t agno;
+ int error;
+
+ /* Don't try to repair AG 0's sb; let xfs_repair deal with it. */
+ agno = sc->sm->sm_agno;
+ if (agno == 0)
+ return -EOPNOTSUPP;
+
+ error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp);
+ if (error)
+ return error;
+
+ /* Copy AG 0's superblock to this one. */
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+
+ /* Write this to disk. */
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
+ xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
+ return error;
+}
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 517c079d3f68..941a0a55224e 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -70,7 +70,7 @@ xfs_scrub_allocbt_xref_other(
pcur = &sc->sa.cnt_cur;
else
pcur = &sc->sa.bno_cur;
- if (!*pcur)
+ if (!*pcur || xfs_scrub_skip_xref(sc->sm))
return;
error = xfs_alloc_lookup_le(*pcur, agbno, len, &has_otherrec);
@@ -172,7 +172,7 @@ xfs_scrub_xref_is_used_space(
bool is_freesp;
int error;
- if (!sc->sa.bno_cur)
+ if (!sc->sa.bno_cur || xfs_scrub_skip_xref(sc->sm))
return;
error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp);
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 127575f0abfb..84b6d6b66578 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -126,8 +126,9 @@ xfs_scrub_xattr_listent(
if (args.valuelen != valuelen)
xfs_scrub_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK,
args.blkno);
-
fail_xref:
+ if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ context->seen_enough = 1;
return;
}
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 639d14b51e90..eeadb33a701c 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -51,7 +51,6 @@ xfs_scrub_setup_inode_bmap(
struct xfs_scrub_context *sc,
struct xfs_inode *ip)
{
- struct xfs_mount *mp = sc->mp;
int error;
error = xfs_scrub_get_inode(sc, ip);
@@ -75,7 +74,7 @@ xfs_scrub_setup_inode_bmap(
}
/* Got the inode, lock it and we're ready to go. */
- error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ error = xfs_scrub_trans_alloc(sc, 0);
if (error)
goto out;
sc->ilock_flags |= XFS_ILOCK_EXCL;
@@ -175,7 +174,7 @@ xfs_scrub_bmap_xref_rmap(
unsigned long long rmap_end;
uint64_t owner;
- if (!info->sc->sa.rmap_cur)
+ if (!info->sc->sa.rmap_cur || xfs_scrub_skip_xref(info->sc->sm))
return;
if (info->whichfork == XFS_COW_FORK)
@@ -684,7 +683,8 @@ xfs_scrub_bmap(
info.lastoff = 0;
ifp = XFS_IFORK_PTR(ip, whichfork);
for_each_xfs_iext(ifp, &icur, &irec) {
- if (xfs_scrub_should_terminate(sc, &error))
+ if (xfs_scrub_should_terminate(sc, &error) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
break;
if (isnullstartblock(irec.br_startblock))
continue;
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 54218168c8f9..2d29dceaa00e 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -442,7 +442,7 @@ xfs_scrub_btree_check_owner(
*/
if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
co = kmem_alloc(sizeof(struct check_owner),
- KM_MAYFAIL | KM_NOFS);
+ KM_MAYFAIL);
if (!co)
return -ENOMEM;
co->level = level;
@@ -455,6 +455,44 @@ xfs_scrub_btree_check_owner(
}
/*
+ * Check that this btree block has at least minrecs records or is one of the
+ * special blocks that don't require that.
+ */
+STATIC void
+xfs_scrub_btree_check_minrecs(
+ struct xfs_scrub_btree *bs,
+ int level,
+ struct xfs_btree_block *block)
+{
+ unsigned int numrecs;
+ int ok_level;
+
+ numrecs = be16_to_cpu(block->bb_numrecs);
+
+ /* At least minrecs records means the block is ok. */
+ if (numrecs >= bs->cur->bc_ops->get_minrecs(bs->cur, level))
+ return;
+
+ /*
+ * Certain btree blocks /can/ have fewer than minrecs records. Any
+ * block at or above the level of the highest dedicated btree block is
+ * allowed to violate this constraint.
+ *
+ * For a btree rooted in a block, the btree root can have fewer than
+ * minrecs records. If the btree is rooted in an inode and does not
+ * store records in the root, the direct children of the root and the
+ * root itself can have fewer than minrecs records.
+ */
+ ok_level = bs->cur->bc_nlevels - 1;
+ if (bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ ok_level--;
+ if (level >= ok_level)
+ return;
+
+ xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level);
+}
+
+/*
* Grab and scrub a btree block given a btree pointer. Returns block
* and buffer pointers (if applicable) if they're ok to use.
*/
@@ -491,6 +529,8 @@ xfs_scrub_btree_get_block(
if (*pbp)
xfs_scrub_buffer_recheck(bs->sc, *pbp);
+ xfs_scrub_btree_check_minrecs(bs, level, *pblock);
+
/*
* Check the block's owner; this function absorbs error codes
* for us.
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 8ed91d5c868d..41198a5f872c 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -44,11 +44,14 @@
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
+#include "xfs_attr.h"
+#include "xfs_reflink.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/btree.h"
+#include "scrub/repair.h"
/* Common code for the metadata scrubbers. */
@@ -539,6 +542,10 @@ xfs_scrub_ag_free(
xfs_trans_brelse(sc->tp, sa->agi_bp);
sa->agi_bp = NULL;
}
+ if (sa->pag) {
+ xfs_perag_put(sa->pag);
+ sa->pag = NULL;
+ }
sa->agno = NULLAGNUMBER;
}
@@ -566,15 +573,53 @@ xfs_scrub_ag_init(
return xfs_scrub_ag_btcur_init(sc, sa);
}
+/*
+ * Grab the per-ag structure if we haven't already gotten it. Teardown of the
+ * xfs_scrub_ag will release it for us.
+ */
+void
+xfs_scrub_perag_get(
+ struct xfs_mount *mp,
+ struct xfs_scrub_ag *sa)
+{
+ if (!sa->pag)
+ sa->pag = xfs_perag_get(mp, sa->agno);
+}
+
/* Per-scrubber setup functions */
+/*
+ * Grab an empty transaction so that we can re-grab locked buffers if
+ * one of our btrees turns out to be cyclic.
+ *
+ * If we're going to repair something, we need to ask for the largest possible
+ * log reservation so that we can handle the worst case scenario for metadata
+ * updates while rebuilding a metadata item. We also need to reserve as many
+ * blocks in the head transaction as we think we're going to need to rebuild
+ * the metadata object.
+ */
+int
+xfs_scrub_trans_alloc(
+ struct xfs_scrub_context *sc,
+ uint resblks)
+{
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+ return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
+ resblks, 0, 0, &sc->tp);
+
+ return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+}
+
/* Set us up with a transaction and an empty context. */
int
xfs_scrub_setup_fs(
struct xfs_scrub_context *sc,
struct xfs_inode *ip)
{
- return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp);
+ uint resblks;
+
+ resblks = xfs_repair_calc_ag_resblks(sc);
+ return xfs_scrub_trans_alloc(sc, resblks);
}
/* Set us up with AG headers and btree cursors. */
@@ -695,7 +740,6 @@ xfs_scrub_setup_inode_contents(
struct xfs_inode *ip,
unsigned int resblks)
{
- struct xfs_mount *mp = sc->mp;
int error;
error = xfs_scrub_get_inode(sc, ip);
@@ -705,7 +749,7 @@ xfs_scrub_setup_inode_contents(
/* Got the inode, lock it and we're ready to go. */
sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
xfs_ilock(sc->ip, sc->ilock_flags);
- error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ error = xfs_scrub_trans_alloc(sc, resblks);
if (error)
goto out;
sc->ilock_flags |= XFS_ILOCK_EXCL;
@@ -727,6 +771,10 @@ xfs_scrub_should_check_xref(
int *error,
struct xfs_btree_cur **curpp)
{
+ /* No point in xref if we already know we're corrupt. */
+ if (xfs_scrub_skip_xref(sc->sm))
+ return false;
+
if (*error == 0)
return true;
@@ -773,3 +821,80 @@ xfs_scrub_buffer_recheck(
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
trace_xfs_scrub_block_error(sc, bp->b_bn, fa);
}
+
+/*
+ * Scrub the attr/data forks of a metadata inode. The metadata inode must be
+ * pointed to by sc->ip and the ILOCK must be held.
+ */
+int
+xfs_scrub_metadata_inode_forks(
+ struct xfs_scrub_context *sc)
+{
+ __u32 smtype;
+ bool shared;
+ int error;
+
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return 0;
+
+ /* Metadata inodes don't live on the rt device. */
+ if (sc->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ /* They should never participate in reflink. */
+ if (xfs_is_reflink_inode(sc->ip)) {
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ /* They also should never have extended attributes. */
+ if (xfs_inode_hasattr(sc->ip)) {
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ /* Invoke the data fork scrubber. */
+ smtype = sc->sm->sm_type;
+ sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD;
+ error = xfs_scrub_bmap_data(sc);
+ sc->sm->sm_type = smtype;
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
+ /* Look for incorrect shared blocks. */
+ if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) {
+ error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip,
+ &shared);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0,
+ &error))
+ return error;
+ if (shared)
+ xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
+ }
+
+ return error;
+}
+
+/*
+ * Try to lock an inode in violation of the usual locking order rules. For
+ * example, trying to get the IOLOCK while in transaction context, or just
+ * plain breaking AG-order or inode-order inode locking rules. Either way,
+ * the only way to avoid an ABBA deadlock is to use trylock and back off if
+ * we can't.
+ */
+int
+xfs_scrub_ilock_inverted(
+ struct xfs_inode *ip,
+ uint lock_mode)
+{
+ int i;
+
+ for (i = 0; i < 20; i++) {
+ if (xfs_ilock_nowait(ip, lock_mode))
+ return 0;
+ delay(1);
+ }
+ return -EDEADLOCK;
+}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index deaf60400981..76bb2d1d808c 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -38,19 +38,7 @@ xfs_scrub_should_terminate(
return false;
}
-/*
- * Grab an empty transaction so that we can re-grab locked buffers if
- * one of our btrees turns out to be cyclic.
- */
-static inline int
-xfs_scrub_trans_alloc(
- struct xfs_scrub_metadata *sm,
- struct xfs_mount *mp,
- struct xfs_trans **tpp)
-{
- return xfs_trans_alloc_empty(mp, tpp);
-}
-
+int xfs_scrub_trans_alloc(struct xfs_scrub_context *sc, uint resblks);
bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
xfs_agblock_t bno, int *error);
bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork,
@@ -135,16 +123,13 @@ xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip)
void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa);
int xfs_scrub_ag_init(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
struct xfs_scrub_ag *sa);
+void xfs_scrub_perag_get(struct xfs_mount *mp, struct xfs_scrub_ag *sa);
int xfs_scrub_ag_read_headers(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
struct xfs_buf **agi, struct xfs_buf **agf,
struct xfs_buf **agfl);
void xfs_scrub_ag_btcur_free(struct xfs_scrub_ag *sa);
int xfs_scrub_ag_btcur_init(struct xfs_scrub_context *sc,
struct xfs_scrub_ag *sa);
-int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc,
- int (*fn)(struct xfs_scrub_context *, xfs_agblock_t bno,
- void *),
- void *priv);
int xfs_scrub_count_rmap_ownedby_ag(struct xfs_scrub_context *sc,
struct xfs_btree_cur *cur,
struct xfs_owner_info *oinfo,
@@ -157,4 +142,17 @@ int xfs_scrub_setup_inode_contents(struct xfs_scrub_context *sc,
struct xfs_inode *ip, unsigned int resblks);
void xfs_scrub_buffer_recheck(struct xfs_scrub_context *sc, struct xfs_buf *bp);
+/*
+ * Don't bother cross-referencing if we already found corruption or
+ * cross-referencing discrepancies.
+ */
+static inline bool xfs_scrub_skip_xref(struct xfs_scrub_metadata *sm)
+{
+ return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT);
+}
+
+int xfs_scrub_metadata_inode_forks(struct xfs_scrub_context *sc);
+int xfs_scrub_ilock_inverted(struct xfs_inode *ip, uint lock_mode);
+
#endif /* __XFS_SCRUB_COMMON_H__ */
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 38f29806eb54..1a4309b3e786 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -172,7 +172,7 @@ xfs_scrub_dir_actor(
error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL);
if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset,
&error))
- goto fail_xref;
+ goto out;
if (lookup_ino != ino) {
xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset);
goto out;
@@ -183,8 +183,13 @@ xfs_scrub_dir_actor(
if (error)
goto out;
out:
- return error;
-fail_xref:
+ /*
+ * A negative error code returned here is supposed to cause the
+ * dir_emit caller (xfs_readdir) to abort the directory iteration
+ * and return zero to xfs_scrub_directory.
+ */
+ if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ return -EFSCORRUPTED;
return error;
}
@@ -240,6 +245,9 @@ xfs_scrub_dir_rec(
}
xfs_scrub_buffer_recheck(ds->sc, bp);
+ if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_relse;
+
dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off);
/* Make sure we got a real directory entry. */
@@ -357,6 +365,9 @@ xfs_scrub_directory_data_bestfree(
/* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_buf;
+
/* Do the bestfrees correspond to actual free space? */
bf = d_ops->data_bestfree_p(bp->b_addr);
smallest_bestfree = UINT_MAX;
@@ -413,14 +424,18 @@ xfs_scrub_directory_data_bestfree(
/* Spot check this free entry */
tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
- if (tag != ((char *)dup - (char *)bp->b_addr))
+ if (tag != ((char *)dup - (char *)bp->b_addr)) {
xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ goto out_buf;
+ }
/*
* Either this entry is a bestfree or it's smaller than
* any of the bestfrees.
*/
xfs_scrub_directory_check_free_entry(sc, lblk, bf, dup);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out_buf;
/* Move on. */
newlen = be16_to_cpu(dup->length);
@@ -546,6 +561,8 @@ xfs_scrub_directory_leaf1_bestfree(
}
if (leafhdr.stale != stale)
xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
/* Check all the bestfree entries. */
for (i = 0; i < bestcount; i++, bestp++) {
@@ -556,9 +573,11 @@ xfs_scrub_directory_leaf1_bestfree(
i * args->geo->fsbcount, -1, &dbp);
if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
- continue;
+ break;
xfs_scrub_directory_check_freesp(sc, lblk, dbp, best);
xfs_trans_brelse(sc->tp, dbp);
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
}
out:
return error;
@@ -607,7 +626,7 @@ xfs_scrub_directory_free_bestfree(
-1, &dbp);
if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk,
&error))
- continue;
+ break;
xfs_scrub_directory_check_freesp(sc, lblk, dbp, best);
xfs_trans_brelse(sc->tp, dbp);
}
@@ -656,7 +675,7 @@ xfs_scrub_directory_blocks(
/* Iterate all the data extents in the directory... */
found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
- while (found) {
+ while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
/* Block directories only have a single block at offset 0. */
if (is_block &&
(got.br_startoff > 0 ||
@@ -719,7 +738,7 @@ xfs_scrub_directory_blocks(
/* Scan for free blocks */
lblk = free_lblk;
found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
- while (found) {
+ while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
/*
* Dirs can't have blocks mapped above 2^32.
* Single-block dirs shouldn't even be here.
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index 106ca4bd753f..00a834d3b56d 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -387,7 +387,8 @@ xfs_scrub_iallocbt_xref_rmap_btreeblks(
int error;
if (!sc->sa.ino_cur || !sc->sa.rmap_cur ||
- (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur))
+ (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur) ||
+ xfs_scrub_skip_xref(sc->sm))
return;
/* Check that we saw as many inobt blocks as the rmap says. */
@@ -424,7 +425,7 @@ xfs_scrub_iallocbt_xref_rmap_inodes(
xfs_filblks_t blocks;
int error;
- if (!sc->sa.rmap_cur)
+ if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
return;
/* Check that we saw as many inode blocks as the rmap knows about. */
@@ -496,7 +497,7 @@ xfs_scrub_xref_inode_check(
bool has_inodes;
int error;
- if (!(*icur))
+ if (!(*icur) || xfs_scrub_skip_xref(sc->sm))
return;
error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes);
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index df14930e4fc5..0c696f7018de 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -55,7 +55,6 @@ xfs_scrub_setup_inode(
struct xfs_scrub_context *sc,
struct xfs_inode *ip)
{
- struct xfs_mount *mp = sc->mp;
int error;
/*
@@ -68,7 +67,7 @@ xfs_scrub_setup_inode(
break;
case -EFSCORRUPTED:
case -EFSBADCRC:
- return xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ return xfs_scrub_trans_alloc(sc, 0);
default:
return error;
}
@@ -76,7 +75,7 @@ xfs_scrub_setup_inode(
/* Got the inode, lock it and we're ready to go. */
sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
xfs_ilock(sc->ip, sc->ilock_flags);
- error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp);
+ error = xfs_scrub_trans_alloc(sc, 0);
if (error)
goto out;
sc->ilock_flags |= XFS_ILOCK_EXCL;
@@ -449,7 +448,7 @@ xfs_scrub_inode_xref_finobt(
int has_record;
int error;
- if (!sc->sa.fino_cur)
+ if (!sc->sa.fino_cur || xfs_scrub_skip_xref(sc->sm))
return;
agino = XFS_INO_TO_AGINO(sc->mp, ino);
@@ -492,6 +491,9 @@ xfs_scrub_inode_xref_bmap(
xfs_filblks_t acount;
int error;
+ if (xfs_scrub_skip_xref(sc->sm))
+ return;
+
/* Walk all the extents to check nextents/naextents/nblocks. */
error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
&nextents, &count);
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index 1fb88c18d455..77c6b22c6bfd 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -147,6 +147,9 @@ xfs_scrub_parent_validate(
*try_again = false;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
/* '..' must not point to ourselves. */
if (sc->ip->i_ino == dnum) {
xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
@@ -211,7 +214,9 @@ xfs_scrub_parent_validate(
*/
xfs_iunlock(sc->ip, sc->ilock_flags);
sc->ilock_flags = 0;
- xfs_ilock(dp, XFS_IOLOCK_SHARED);
+ error = xfs_scrub_ilock_inverted(dp, XFS_IOLOCK_SHARED);
+ if (error)
+ goto out_rele;
/* Go looking for our dentry. */
error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink);
@@ -220,8 +225,10 @@ xfs_scrub_parent_validate(
/* Drop the parent lock, relock this inode. */
xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+ error = xfs_scrub_ilock_inverted(sc->ip, XFS_IOLOCK_EXCL);
+ if (error)
+ goto out_rele;
sc->ilock_flags = XFS_IOLOCK_EXCL;
- xfs_ilock(sc->ip, sc->ilock_flags);
/*
* If we're an unlinked directory, the parent /won't/ have a link
@@ -323,5 +330,13 @@ xfs_scrub_parent(
if (try_again && tries == 20)
xfs_scrub_set_incomplete(sc);
out:
+ /*
+ * If we failed to lock the parent inode even after a retry, just mark
+ * this scrub incomplete and return.
+ */
+ if (sc->try_harder && error == -EDEADLOCK) {
+ error = 0;
+ xfs_scrub_set_incomplete(sc);
+ }
return error;
}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 6ba465e6c885..15ae4d23d6ac 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -66,25 +66,43 @@ xfs_scrub_setup_quota(
struct xfs_inode *ip)
{
uint dqtype;
+ int error;
+
+ if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp))
+ return -ENOENT;
dqtype = xfs_scrub_quota_to_dqtype(sc);
if (dqtype == 0)
return -EINVAL;
+ sc->has_quotaofflock = true;
+ mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
if (!xfs_this_quota_on(sc->mp, dqtype))
return -ENOENT;
+ error = xfs_scrub_setup_fs(sc, ip);
+ if (error)
+ return error;
+ sc->ip = xfs_quota_inode(sc->mp, dqtype);
+ xfs_ilock(sc->ip, XFS_ILOCK_EXCL);
+ sc->ilock_flags = XFS_ILOCK_EXCL;
return 0;
}
/* Quotas. */
+struct xfs_scrub_quota_info {
+ struct xfs_scrub_context *sc;
+ xfs_dqid_t last_id;
+};
+
/* Scrub the fields in an individual quota item. */
-STATIC void
+STATIC int
xfs_scrub_quota_item(
- struct xfs_scrub_context *sc,
- uint dqtype,
struct xfs_dquot *dq,
- xfs_dqid_t id)
+ uint dqtype,
+ void *priv)
{
+ struct xfs_scrub_quota_info *sqi = priv;
+ struct xfs_scrub_context *sc = sqi->sc;
struct xfs_mount *mp = sc->mp;
struct xfs_disk_dquot *d = &dq->q_core;
struct xfs_quotainfo *qi = mp->m_quotainfo;
@@ -99,17 +117,18 @@ xfs_scrub_quota_item(
unsigned long long icount;
unsigned long long rcount;
xfs_ino_t fs_icount;
-
- offset = id / qi->qi_dqperchunk;
+ xfs_dqid_t id = be32_to_cpu(d->d_id);
/*
- * We fed $id and DQNEXT into the xfs_qm_dqget call, which means
- * that the actual dquot we got must either have the same id or
- * the next higher id.
+ * Except for the root dquot, the actual dquot we got must either have
+ * the same or higher id as we saw before.
*/
- if (id > be32_to_cpu(d->d_id))
+ offset = id / qi->qi_dqperchunk;
+ if (id && id <= sqi->last_id)
xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ sqi->last_id = id;
+
/* Did we get the dquot type we wanted? */
if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES))
xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
@@ -183,115 +202,85 @@ xfs_scrub_quota_item(
xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
if (id != 0 && rhard != 0 && rcount > rhard)
xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+
+ return 0;
}
-/* Scrub all of a quota type's items. */
-int
-xfs_scrub_quota(
+/* Check the quota's data fork. */
+STATIC int
+xfs_scrub_quota_data_fork(
struct xfs_scrub_context *sc)
{
struct xfs_bmbt_irec irec = { 0 };
- struct xfs_mount *mp = sc->mp;
- struct xfs_inode *ip;
- struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct xfs_dquot *dq;
+ struct xfs_iext_cursor icur;
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+ struct xfs_ifork *ifp;
xfs_fileoff_t max_dqid_off;
- xfs_fileoff_t off = 0;
- xfs_dqid_t id = 0;
- uint dqtype;
- int nimaps;
int error = 0;
- if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
- return -ENOENT;
-
- mutex_lock(&qi->qi_quotaofflock);
- dqtype = xfs_scrub_quota_to_dqtype(sc);
- if (!xfs_this_quota_on(sc->mp, dqtype)) {
- error = -ENOENT;
- goto out_unlock_quota;
- }
-
- /* Attach to the quota inode and set sc->ip so that reporting works. */
- ip = xfs_quota_inode(sc->mp, dqtype);
- sc->ip = ip;
+ /* Invoke the fork scrubber. */
+ error = xfs_scrub_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
- /* Look for problem extents. */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) {
- xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino);
- goto out_unlock_inode;
- }
+ /* Check for data fork problems that apply only to quota files. */
max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
- while (1) {
+ ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK);
+ for_each_xfs_iext(ifp, &icur, &irec) {
if (xfs_scrub_should_terminate(sc, &error))
break;
-
- off = irec.br_startoff + irec.br_blockcount;
- nimaps = 1;
- error = xfs_bmapi_read(ip, off, -1, &irec, &nimaps,
- XFS_BMAPI_ENTIRE);
- if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, off,
- &error))
- goto out_unlock_inode;
- if (!nimaps)
- break;
- if (irec.br_startblock == HOLESTARTBLOCK)
- continue;
-
- /* Check the extent record doesn't point to crap. */
- if (irec.br_startblock + irec.br_blockcount <=
- irec.br_startblock)
- xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
- irec.br_startoff);
- if (!xfs_verify_fsbno(mp, irec.br_startblock) ||
- !xfs_verify_fsbno(mp, irec.br_startblock +
- irec.br_blockcount - 1))
- xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
- irec.br_startoff);
-
/*
- * Unwritten extents or blocks mapped above the highest
+ * delalloc extents or blocks mapped above the highest
* quota id shouldn't happen.
*/
if (isnullstartblock(irec.br_startblock) ||
irec.br_startoff > max_dqid_off ||
- irec.br_startoff + irec.br_blockcount > max_dqid_off + 1)
- xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+ irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) {
+ xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK,
+ irec.br_startoff);
+ break;
+ }
}
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- goto out;
- /* Check all the quota items. */
- while (id < ((xfs_dqid_t)-1ULL)) {
- if (xfs_scrub_should_terminate(sc, &error))
- break;
+ return error;
+}
- error = xfs_qm_dqget(mp, NULL, id, dqtype, XFS_QMOPT_DQNEXT,
- &dq);
- if (error == -ENOENT)
- break;
- if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK,
- id * qi->qi_dqperchunk, &error))
- break;
+/* Scrub all of a quota type's items. */
+int
+xfs_scrub_quota(
+ struct xfs_scrub_context *sc)
+{
+ struct xfs_scrub_quota_info sqi;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ uint dqtype;
+ int error = 0;
- xfs_scrub_quota_item(sc, dqtype, dq, id);
+ dqtype = xfs_scrub_quota_to_dqtype(sc);
- id = be32_to_cpu(dq->q_core.d_id) + 1;
- xfs_qm_dqput(dq);
- if (!id)
- break;
- }
+ /* Look for problem extents. */
+ error = xfs_scrub_quota_data_fork(sc);
+ if (error)
+ goto out;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+ goto out;
+
+ /*
+ * Check all the quota items. Now that we've checked the quota inode
+ * data fork we have to drop ILOCK_EXCL to use the regular dquot
+ * functions.
+ */
+ xfs_iunlock(sc->ip, sc->ilock_flags);
+ sc->ilock_flags = 0;
+ sqi.sc = sc;
+ sqi.last_id = 0;
+ error = xfs_qm_dqiterate(mp, dqtype, xfs_scrub_quota_item, &sqi);
+ sc->ilock_flags = XFS_ILOCK_EXCL;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+ if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK,
+ sqi.last_id * qi->qi_dqperchunk, &error))
+ goto out;
out:
- /* We set sc->ip earlier, so make sure we clear it now. */
- sc->ip = NULL;
-out_unlock_quota:
- mutex_unlock(&qi->qi_quotaofflock);
return error;
-
-out_unlock_inode:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- goto out;
}
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 400f1561cd3d..324a5f159145 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -150,7 +150,7 @@ xfs_scrub_refcountbt_rmap_check(
* so we don't need insertion sort here.
*/
frag = kmem_alloc(sizeof(struct xfs_scrub_refcnt_frag),
- KM_MAYFAIL | KM_NOFS);
+ KM_MAYFAIL);
if (!frag)
return -ENOMEM;
memcpy(&frag->rm, rec, sizeof(frag->rm));
@@ -310,7 +310,7 @@ xfs_scrub_refcountbt_xref_rmap(
struct xfs_scrub_refcnt_frag *n;
int error;
- if (!sc->sa.rmap_cur)
+ if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
return;
/* Cross-reference with the rmapbt to confirm the refcount. */
@@ -404,7 +404,7 @@ xfs_scrub_refcount_xref_rmap(
xfs_filblks_t blocks;
int error;
- if (!sc->sa.rmap_cur)
+ if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
return;
/* Check that we saw as many refcbt blocks as the rmap knows about. */
@@ -460,7 +460,7 @@ xfs_scrub_xref_is_cow_staging(
int has_refcount;
int error;
- if (!sc->sa.refc_cur)
+ if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm))
return;
/* Find the CoW staging extent. */
@@ -504,7 +504,7 @@ xfs_scrub_xref_is_not_shared(
bool shared;
int error;
- if (!sc->sa.refc_cur)
+ if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm))
return;
error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared);
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
new file mode 100644
index 000000000000..e3e8fba1c99c
--- /dev/null
+++ b/fs/xfs/scrub/repair.c
@@ -0,0 +1,1089 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_extent_busy.h"
+#include "xfs_ag_resv.h"
+#include "xfs_trans_space.h"
+#include "xfs_quota.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/*
+ * Run the repair function for the metadata type being scrubbed.
+ *
+ * Returns -EAGAIN when the caller should re-run scrub: either the repair
+ * succeeded (in which case *fixed is set to true and the output flags are
+ * cleared), or the repair could not get its resources and should be retried
+ * with try_harder set.  Returns -EFSCORRUPTED if a try_harder attempt still
+ * could not get its resources; any other repair error is passed through.
+ */
+int
+xfs_repair_attempt(
+	struct xfs_inode		*ip,
+	struct xfs_scrub_context	*sc,
+	bool				*fixed)
+{
+	int				ret;
+
+	trace_xfs_repair_attempt(ip, sc->sm, 0);
+
+	/* Drop the scrub's AG btree cursors before repairing. */
+	xfs_scrub_ag_btcur_free(&sc->sa);
+
+	/* Repair whatever's broken. */
+	ASSERT(sc->ops->repair);
+	ret = sc->ops->repair(sc);
+	trace_xfs_repair_done(ip, sc->sm, ret);
+
+	if (ret == 0) {
+		/*
+		 * The repair worked.  Clear the output flags and ask for a
+		 * second scrub so that userspace learns whether the problem
+		 * is really gone.
+		 */
+		sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+		*fixed = true;
+		return -EAGAIN;
+	}
+
+	/* Anything other than a locking/resource failure goes straight up. */
+	if (ret != -EDEADLOCK && ret != -EAGAIN)
+		return ret;
+
+	/*
+	 * We already tried harder and still couldn't grab everything we
+	 * needed, so the corruption remains; report that to userspace.
+	 */
+	if (sc->try_harder)
+		return -EFSCORRUPTED;
+
+	/* Tell the caller to try again having grabbed all the locks. */
+	sc->try_harder = true;
+	return -EAGAIN;
+}
+
+/*
+ * Complain about unfixable problems in the filesystem. We don't log
+ * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
+ * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
+ * administrator isn't running xfs_scrub in no-repairs mode.
+ *
+ * Use this helper function because _ratelimited silently declares a static
+ * structure to track rate limiting information.
+ *
+ * The only effect is a rate-limited alert issued against @mp that tells the
+ * administrator to unmount and run xfs_repair; nothing is returned.
+ */
+void
+xfs_repair_failure(
+ struct xfs_mount *mp)
+{
+ xfs_alert_ratelimited(mp,
+"Corruption not fixed during online repair. Unmount and run xfs_repair.");
+}
+
+/*
+ * Repair probe -- userspace calls this to find out whether we are willing to
+ * repair a given mountpoint.  Nothing is examined or changed; the only
+ * possible failure is the error reported by the termination check.
+ */
+int
+xfs_repair_probe(
+	struct xfs_scrub_context	*sc)
+{
+	int				error = 0;
+
+	/* Hand back the termination error if we were told to stop. */
+	return xfs_scrub_should_terminate(sc, &error) ? error : 0;
+}
+
+/*
+ * Roll the scrub transaction while keeping hold of the AGI, AGF and AGFL
+ * buffers, then attach those buffers to the new transaction.  Returns zero
+ * on success or the error from rolling the transaction.
+ */
+int
+xfs_repair_roll_ag_trans(
+	struct xfs_scrub_context	*sc)
+{
+	int				error;
+
+	/* Hold the AG header buffers so that they survive the roll. */
+	xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
+	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+	xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
+
+	error = xfs_trans_roll(&sc->tp);
+	if (error) {
+		/*
+		 * The roll failed, so drop the holds again.  Teardown will
+		 * release the buffers on our way out of the kernel.
+		 */
+		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
+		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
+		xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
+		return error;
+	}
+
+	/* Attach the AG headers to the new transaction. */
+	xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
+	xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
+	xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
+
+	return 0;
+}
+
+/*
+ * Decide whether an AG has room to rebuild a btree of @nr_blocks blocks.
+ * Neither the rmapbt nor the metadata reservation may be critical, and the
+ * AG's free block count must exceed what the @type reservation still needs
+ * plus the blocks for the new btree.
+ */
+bool
+xfs_repair_ag_has_space(
+	struct xfs_perag		*pag,
+	xfs_extlen_t			nr_blocks,
+	enum xfs_ag_resv_type		type)
+{
+	if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT))
+		return false;
+	if (xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
+		return false;
+
+	return pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
+}
+
+/*
+ * Work out how many blocks to reserve for an AG repair: the worst case
+ * number of blocks needed to rebuild any one kind of per-AG btree.  Returns
+ * zero when userspace did not ask for repairs.
+ */
+xfs_extlen_t
+xfs_repair_calc_ag_resblks(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_scrub_metadata	*sm = sc->sm;
+	struct xfs_perag		*pag;
+	struct xfs_buf			*bp;
+	xfs_agino_t			icount;
+	xfs_extlen_t			aglen;
+	xfs_extlen_t			usedlen;
+	xfs_extlen_t			freelen;
+	xfs_extlen_t			bnobt_sz;
+	xfs_extlen_t			inobt_sz;
+	xfs_extlen_t			rmapbt_sz = 0;
+	xfs_extlen_t			refcbt_sz = 0;
+	xfs_extlen_t			resblks;
+	int				error;
+
+	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+		return 0;
+
+	/* Prefer the in-core inode count when it has been initialized. */
+	pag = xfs_perag_get(mp, sm->sm_agno);
+	icount = pag->pagi_init ? pag->pagi_count : 0;
+
+	/*
+	 * With no usable in-core count, read the AGI and look again; if the
+	 * read fails, fall back to a worst case estimate.
+	 */
+	if (icount == 0) {
+		error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
+		if (!error) {
+			icount = pag->pagi_count;
+			xfs_buf_relse(bp);
+		} else {
+			icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
+		}
+	}
+
+	/* Block counters come from the AGF, or worst case if unreadable. */
+	error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
+	if (!error) {
+		aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
+		freelen = pag->pagf_freeblks;
+		usedlen = aglen - freelen;
+		xfs_buf_relse(bp);
+	} else {
+		aglen = mp->m_sb.sb_agblocks;
+		freelen = aglen;
+		usedlen = aglen;
+	}
+	xfs_perag_put(pag);
+
+	trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
+			freelen, usedlen);
+
+	/*
+	 * Size each kind of btree for the worst case.  The bnobt/cntbt and
+	 * the inobt/finobt can only be rebuilt as pairs, hence the doubling.
+	 */
+	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
+
+	if (xfs_sb_version_hassparseinodes(&mp->m_sb))
+		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+				XFS_INODES_PER_HOLEMASK_BIT);
+	else
+		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
+				XFS_INODES_PER_CHUNK);
+	if (xfs_sb_version_hasfinobt(&mp->m_sb))
+		inobt_sz *= 2;
+
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
+
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+		/*
+		 * Without reflink there cannot be more rmap records than
+		 * used blocks.  With reflink a single AG block may have
+		 * several rmap records and we cannot know how many exist,
+		 * so start with what we hope is a generous over-estimate.
+		 */
+		if (xfs_sb_version_hasreflink(&mp->m_sb))
+			rmapbt_sz = xfs_rmapbt_calc_size(mp,
+					(unsigned long long)aglen * 2);
+		else
+			rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
+	}
+
+	trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
+			inobt_sz, rmapbt_sz, refcbt_sz);
+
+	/* The reservation is the largest of the four estimates. */
+	resblks = max(bnobt_sz, inobt_sz);
+	resblks = max(resblks, rmapbt_sz);
+	resblks = max(resblks, refcbt_sz);
+	return resblks;
+}
+
+/*
+ * Allocate a single block in the AG being repaired and return it in @fsbno.
+ * AGFL and rmapbt reservations are served from the AG free list; every
+ * other reservation type goes through the extent allocator with @oinfo as
+ * the owner.  Returns -ENOSPC if no block could be found.
+ */
+int
+xfs_repair_alloc_ag_block(
+	struct xfs_scrub_context	*sc,
+	struct xfs_owner_info		*oinfo,
+	xfs_fsblock_t			*fsbno,
+	enum xfs_ag_resv_type		resv)
+{
+	struct xfs_alloc_arg		args = {0};
+	xfs_agblock_t			bno;
+	int				error;
+
+	if (resv == XFS_AG_RESV_AGFL || resv == XFS_AG_RESV_RMAPBT) {
+		/* Pull a block straight off the free list. */
+		error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
+		if (error)
+			return error;
+		if (bno == NULLAGBLOCK)
+			return -ENOSPC;
+
+		xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno, 1, false);
+		*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
+		if (resv == XFS_AG_RESV_RMAPBT)
+			xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno);
+		return 0;
+	}
+
+	/* Everything else asks the allocator for one block in this AG. */
+	args.tp = sc->tp;
+	args.mp = sc->mp;
+	args.oinfo = *oinfo;
+	args.fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, 0);
+	args.minlen = 1;
+	args.maxlen = 1;
+	args.prod = 1;
+	args.type = XFS_ALLOCTYPE_THIS_AG;
+	args.resv = resv;
+
+	error = xfs_alloc_vextent(&args);
+	if (error)
+		return error;
+	if (args.fsbno == NULLFSBLOCK)
+		return -ENOSPC;
+
+	ASSERT(args.len == 1);
+	*fsbno = args.fsbno;
+	return 0;
+}
+
+/*
+ * Initialize a new AG btree root block with zero entries.
+ *
+ * Gets a buffer for block @fsb (which must be in the AG being repaired),
+ * zeroes it, formats it as an empty @btnum btree block, logs the whole
+ * block, sets the buffer ops to @ops and returns the buffer in @bpp.
+ * Returns zero on success or -ENOMEM if no buffer could be obtained.
+ */
+int
+xfs_repair_init_btblock(
+	struct xfs_scrub_context	*sc,
+	xfs_fsblock_t			fsb,
+	struct xfs_buf			**bpp,
+	xfs_btnum_t			btnum,
+	const struct xfs_buf_ops	*ops)
+{
+	struct xfs_trans		*tp = sc->tp;
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_buf			*bp;
+
+	trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
+			XFS_FSB_TO_AGBNO(mp, fsb), btnum);
+
+	ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
+	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
+			XFS_FSB_TO_BB(mp, 1), 0);
+	/* Don't dereference the buffer if we failed to get one. */
+	if (!bp)
+		return -ENOMEM;
+
+	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+	xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno, 0);
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
+	/*
+	 * b_length counts basic blocks, but the logging range is given as
+	 * first and last byte offsets, so convert to bytes to log the whole
+	 * block rather than just its first few bytes.
+	 */
+	xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
+	bp->b_ops = ops;
+	*bpp = bp;
+
+	return 0;
+}
+
+/*
+ * Reconstructing per-AG Btrees
+ *
+ * When a space btree is corrupt, we don't bother trying to fix it. Instead,
+ * we scan secondary space metadata to derive the records that should be in
+ * the damaged btree, initialize a fresh btree root, and insert the records.
+ * Note that for rebuilding the rmapbt we scan all the primary data to
+ * generate the new records.
+ *
+ * However, that leaves the matter of removing all the metadata describing the
+ * old broken structure. For primary metadata we use the rmap data to collect
+ * every extent with a matching rmap owner (exlist); we then iterate all other
+ * metadata structures with the same rmap owner to collect the extents that
+ * cannot be removed (sublist). We then subtract sublist from exlist to
+ * derive the blocks that were used by the old btree. These blocks can be
+ * reaped.
+ *
+ * For rmapbt reconstructions we must use different tactics for extent
+ * collection. First we iterate all primary metadata (this excludes the old
+ * rmapbt, obviously) to generate new rmap records. The gaps in the rmap
+ * records are collected as exlist. The bnobt records are collected as
+ * sublist. As with the other btrees we subtract sublist from exlist, and the
+ * result (since the rmapbt lives in the free space) is the set of blocks used
+ * by the old rmapbt.
+ */
+
+/*
+ * Remember a dead btree extent for later disposal by appending a new record
+ * for [@fsbno, @fsbno + @len) to the tail of @exlist.  Returns -ENOMEM if
+ * the record cannot be allocated.
+ */
+int
+xfs_repair_collect_btree_extent(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_extent_list	*exlist,
+	xfs_fsblock_t			fsbno,
+	xfs_extlen_t			len)
+{
+	struct xfs_repair_extent	*new_rex;
+
+	trace_xfs_repair_collect_btree_extent(sc->mp,
+			XFS_FSB_TO_AGNO(sc->mp, fsbno),
+			XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
+
+	new_rex = kmem_alloc(sizeof(*new_rex), KM_MAYFAIL);
+	if (new_rex == NULL)
+		return -ENOMEM;
+
+	new_rex->fsbno = fsbno;
+	new_rex->len = len;
+	INIT_LIST_HEAD(&new_rex->list);
+	list_add_tail(&new_rex->list, &exlist->list);
+
+	return 0;
+}
+
+/*
+ * Throw away every extent record on @exlist.  This is for the case where the
+ * rebuild hit an error: the transaction will be cancelled, the fs will shut
+ * down, and the administrator has to unmount and run repair, so all that is
+ * left to do is free the memory associated with the list.
+ */
+void
+xfs_repair_cancel_btree_extents(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_extent_list	*exlist)
+{
+	struct xfs_repair_extent	*cur;
+	struct xfs_repair_extent	*next;
+
+	/* Safe iteration, because each record is unlinked and freed. */
+	for_each_xfs_repair_extent_safe(cur, next, exlist) {
+		list_del(&cur->list);
+		kmem_free(cur);
+	}
+}
+
+/*
+ * list_sort comparison function: order two btree extents by ascending
+ * starting block.  Returns 1, -1 or 0; @priv is unused.
+ */
+static int
+xfs_repair_btree_extent_cmp(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_repair_extent	*left;
+	struct xfs_repair_extent	*right;
+
+	left = list_entry(a, struct xfs_repair_extent, list);
+	right = list_entry(b, struct xfs_repair_extent, list);
+
+	if (left->fsbno == right->fsbno)
+		return 0;
+	return left->fsbno > right->fsbno ? 1 : -1;
+}
+
+/*
+ * Remove all the blocks mentioned in @sublist from the extents in @exlist.
+ *
+ * The intent is that callers will iterate the rmapbt for all of its records
+ * for a given owner to generate @exlist; and iterate all the blocks of the
+ * metadata structures that are not being rebuilt and have the same rmapbt
+ * owner to generate @sublist. This routine subtracts all the extents
+ * mentioned in sublist from all the extents linked in @exlist, which leaves
+ * @exlist as the list of blocks that are not accounted for, which we assume
+ * are the dead blocks of the old metadata structure. The blocks mentioned in
+ * @exlist can be reaped.
+ *
+ * Both lists are sorted in place by starting block.  Returns zero on
+ * success or -ENOMEM if an extent had to be split and the new record could
+ * not be allocated; in that case @exlist is left partially processed.
+ */
+#define LEFT_ALIGNED	(1 << 0)
+#define RIGHT_ALIGNED	(1 << 1)
+int
+xfs_repair_subtract_extents(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_extent_list	*exlist,
+	struct xfs_repair_extent_list	*sublist)
+{
+	struct list_head		*lp;
+	struct xfs_repair_extent	*ex;
+	struct xfs_repair_extent	*newex;
+	struct xfs_repair_extent	*subex;
+	xfs_fsblock_t			sub_fsb;
+	xfs_extlen_t			sub_len;
+	int				state;
+	int				error = 0;
+
+	/* Nothing to subtract, or nothing to subtract from. */
+	if (list_empty(&exlist->list) || list_empty(&sublist->list))
+		return 0;
+
+	list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp);
+	list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp);
+
+	/*
+	 * Now that we've sorted both lists, we iterate exlist once, rolling
+	 * forward through sublist and/or exlist as necessary until we find an
+	 * overlap or reach the end of either list. We do not reset lp to the
+	 * head of exlist nor do we reset subex to the head of sublist. The
+	 * list traversal is similar to merge sort, but we're deleting
+	 * instead. In this manner we avoid O(n^2) operations.
+	 */
+	subex = list_first_entry(&sublist->list, struct xfs_repair_extent,
+			list);
+	lp = exlist->list.next;
+	while (lp != &exlist->list) {
+		ex = list_entry(lp, struct xfs_repair_extent, list);
+
+		/*
+		 * Advance subex and/or ex until we find a pair that
+		 * intersect or we run out of extents.
+		 */
+		while (subex->fsbno + subex->len <= ex->fsbno) {
+			if (list_is_last(&subex->list, &sublist->list))
+				goto out;
+			subex = list_next_entry(subex, list);
+		}
+		if (subex->fsbno >= ex->fsbno + ex->len) {
+			lp = lp->next;
+			continue;
+		}
+
+		/*
+		 * Trim subex to the part that overlaps ex.  The right edge
+		 * must be clamped to the end of ex, not merely to ex->len:
+		 * a subex that starts inside ex and runs past its end would
+		 * otherwise be treated as a "middle" deletion and produce a
+		 * new extent whose length underflows.
+		 */
+		sub_fsb = subex->fsbno;
+		sub_len = subex->len;
+		if (subex->fsbno < ex->fsbno) {
+			sub_len -= ex->fsbno - subex->fsbno;
+			sub_fsb = ex->fsbno;
+		}
+		if (sub_fsb + sub_len > ex->fsbno + ex->len)
+			sub_len = ex->fsbno + ex->len - sub_fsb;
+
+		state = 0;
+		if (sub_fsb == ex->fsbno)
+			state |= LEFT_ALIGNED;
+		if (sub_fsb + sub_len == ex->fsbno + ex->len)
+			state |= RIGHT_ALIGNED;
+		switch (state) {
+		case LEFT_ALIGNED:
+			/*
+			 * Coincides with only the left.  Stay on ex; the
+			 * next pass will move subex forward.
+			 */
+			ex->fsbno += sub_len;
+			ex->len -= sub_len;
+			break;
+		case RIGHT_ALIGNED:
+			/* Coincides with only the right. */
+			ex->len -= sub_len;
+			lp = lp->next;
+			break;
+		case LEFT_ALIGNED | RIGHT_ALIGNED:
+			/* Total overlap, just delete ex. */
+			lp = lp->next;
+			list_del(&ex->list);
+			kmem_free(ex);
+			break;
+		case 0:
+			/*
+			 * Deleting from the middle: add the new right extent
+			 * and then shrink the left extent.
+			 */
+			newex = kmem_alloc(sizeof(struct xfs_repair_extent),
+					KM_MAYFAIL);
+			if (!newex) {
+				error = -ENOMEM;
+				goto out;
+			}
+			INIT_LIST_HEAD(&newex->list);
+			newex->fsbno = sub_fsb + sub_len;
+			newex->len = ex->fsbno + ex->len - newex->fsbno;
+			list_add(&newex->list, &ex->list);
+			ex->len = sub_fsb - ex->fsbno;
+			/* Continue with the new right-hand extent. */
+			lp = lp->next;
+			break;
+		default:
+			ASSERT(0);
+			break;
+		}
+	}
+
+out:
+	return error;
+}
+#undef LEFT_ALIGNED
+#undef RIGHT_ALIGNED
+
+/*
+ * Disposal of Blocks from Old per-AG Btrees
+ *
+ * Now that we've constructed a new btree to replace the damaged one, we want
+ * to dispose of the blocks that (we think) the old btree was using.
+ * Previously, we used the rmapbt to collect the extents (exlist) with the
+ * rmap owner corresponding to the tree we rebuilt, collected extents for any
+ * blocks with the same rmap owner that are owned by another data structure
+ * (sublist), and subtracted sublist from exlist. In theory the extents
+ * remaining in exlist are the old btree's blocks.
+ *
+ * Unfortunately, it's possible that the btree was crosslinked with other
+ * blocks on disk. The rmap data can tell us if there are multiple owners, so
+ * if the rmapbt says there is an owner of this block other than @oinfo, then
+ * the block is crosslinked. Remove the reverse mapping and continue.
+ *
+ * If there is one rmap record, we can free the block, which removes the
+ * reverse mapping but doesn't add the block to the free space. Our repair
+ * strategy is to hope the other metadata objects crosslinked on this block
+ * will be rebuilt (atop different blocks), thereby removing all the cross
+ * links.
+ *
+ * If there are no rmap records at all, we also free the block. If the btree
+ * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
+ * supposed to be a rmap record and everything is ok. For other btrees there
+ * had to have been an rmap entry for the block to have ended up on @exlist,
+ * so if it's gone now there's something wrong and the fs will shut down.
+ *
+ * Note: If there are multiple rmap records with only the same rmap owner as
+ * the btree we're trying to rebuild and the block is indeed owned by another
+ * data structure with the same rmap owner, then the block will be in sublist
+ * and therefore doesn't need disposal. If there are multiple rmap records
+ * with only the same rmap owner but the block is not owned by something with
+ * the same rmap owner, the block will be freed.
+ *
+ * The caller is responsible for locking the AG headers for the entire rebuild
+ * operation so that nothing else can sneak in and change the AG state while
+ * we're not looking. We also assume that the caller already invalidated any
+ * buffers associated with @exlist.
+ */
+
+/*
+ * Invalidate any in-core buffers for the per-AG btree blocks listed in
+ * @exlist.  This function is not intended for use with file data repairs;
+ * we have bunmapi for that.  Always returns zero.
+ */
+int
+xfs_repair_invalidate_blocks(
+	struct xfs_scrub_context	*sc,
+	struct xfs_repair_extent_list	*exlist)
+{
+	struct xfs_repair_extent	*rex;
+	struct xfs_repair_extent	*n;
+	struct xfs_buf			*bp;
+	xfs_fsblock_t			fsbno;
+	xfs_agblock_t			i;
+
+	/*
+	 * The buffer cache can only be asked about one buffer at a time, so
+	 * walk every extent block by block and look for an incore buffer
+	 * covering exactly that block.  AG headers and post-EOFS blocks are
+	 * skipped because we never own those, and a buffer that cannot be
+	 * TRYLOCKed is assumed to be owned by someone else.
+	 */
+	for_each_xfs_repair_extent_safe(rex, n, exlist) {
+		for (i = 0; i < rex->len; i++) {
+			fsbno = rex->fsbno + i;
+
+			/* Skip AG headers and post-EOFS blocks */
+			if (!xfs_verify_fsbno(sc->mp, fsbno))
+				continue;
+
+			bp = xfs_buf_incore(sc->mp->m_ddev_targp,
+					XFS_FSB_TO_DADDR(sc->mp, fsbno),
+					XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK);
+			if (!bp)
+				continue;
+
+			xfs_trans_bjoin(sc->tp, bp);
+			xfs_trans_binval(sc->tp, bp);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Ensure the freelist of the AG being repaired is the correct size.  Pass
+ * @can_shrink as false to forbid the allocator from shrinking the list.
+ */
+int
+xfs_repair_fix_freelist(
+	struct xfs_scrub_context	*sc,
+	bool				can_shrink)
+{
+	struct xfs_alloc_arg		args = {
+		.mp		= sc->mp,
+		.tp		= sc->tp,
+		.agno		= sc->sa.agno,
+		.alignment	= 1,
+		.pag		= sc->sa.pag,
+	};
+	int				flags = 0;
+
+	if (!can_shrink)
+		flags = XFS_ALLOC_FLAG_NOSHRINK;
+
+	return xfs_alloc_fix_freelist(&args, flags);
+}
+
+/*
+ * Put block @agbno of the AG being repaired back on the AGFL.  Returns zero
+ * on success or the first error from fixing the freelist, adding the rmap,
+ * or inserting the block into the AGFL.
+ */
+STATIC int
+xfs_repair_put_freelist(
+	struct xfs_scrub_context	*sc,
+	xfs_agblock_t			agbno)
+{
+	struct xfs_owner_info		ag_owner;
+	int				error;
+
+	/* Make sure there's space on the freelist. */
+	error = xfs_repair_fix_freelist(sc, true);
+	if (error)
+		goto out;
+
+	/*
+	 * We are "freeing" a lost block onto the AGFL, so it needs an
+	 * OWN_AG rmap before it is merged or else other parts will break.
+	 */
+	xfs_rmap_ag_owner(&ag_owner, XFS_RMAP_OWN_AG);
+	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
+			&ag_owner);
+	if (error)
+		goto out;
+
+	/* Put the block on the AGFL. */
+	error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
+			agbno, 0);
+	if (error)
+		goto out;
+
+	/* Mark the block busy, skipping discard. */
+	xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
+			XFS_EXTENT_BUSY_SKIP_DISCARD);
+out:
+	return error;
+}
+
+/*
+ * Dispose of a single metadata block.
+ *
+ * @sc: scrub context.
+ * @fsbno: filesystem block number of the block to dispose of.
+ * @oinfo: owner information handed to the rmap and free-extent calls.
+ * @resv: AG reservation type; XFS_AG_RESV_AGFL sends a block with no other
+ * rmappings to the AGFL instead of freeing it.
+ *
+ * On success the function finishes by calling xfs_trans_roll_inode() when
+ * sc->ip is set, or xfs_repair_roll_ag_trans() otherwise, and returns that
+ * call's result.  On failure the error code is returned; an AGF buffer that
+ * was read here (rather than borrowed from @sc) is released first.
+ */
+STATIC int
+xfs_repair_dispose_btree_block(
+ struct xfs_scrub_context *sc,
+ xfs_fsblock_t fsbno,
+ struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type resv)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agf_bp = NULL;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ bool has_other_rmap;
+ int error;
+
+ agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+
+ /*
+ * If we are repairing per-inode metadata, we need to read in the AGF
+ * buffer. Otherwise, we're repairing a per-AG structure, so reuse
+ * the AGF buffer that the setup functions already grabbed.
+ */
+ if (sc->ip) {
+ error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
+ if (error)
+ return error;
+ if (!agf_bp)
+ return -ENOMEM;
+ } else {
+ agf_bp = sc->sa.agf_bp;
+ }
+ cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno);
+
+ /*
+ * Can we find any other rmappings?  The cursor is only needed for
+ * this query and is deleted right after it.
+ */
+ error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
+ if (error)
+ goto out_cur;
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ /*
+ * If there are other rmappings, this block is cross linked and must
+ * not be freed. Remove the reverse mapping and move on. Otherwise,
+ * we were the only owner of the block, so free the extent, which will
+ * also remove the rmap.
+ *
+ * XXX: XFS doesn't support detecting the case where a single block
+ * metadata structure is crosslinked with a multi-block structure
+ * because the buffer cache doesn't detect aliasing problems, so we
+ * can't fix 100% of crosslinking problems (yet). The verifiers will
+ * blow up on writeout, the filesystem will shut down, and the admin
+ * gets to run xfs_repair.
+ */
+ if (has_other_rmap)
+ error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo);
+ else if (resv == XFS_AG_RESV_AGFL)
+ error = xfs_repair_put_freelist(sc, agbno);
+ else
+ error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
+ if (agf_bp != sc->sa.agf_bp)
+ xfs_trans_brelse(sc->tp, agf_bp);
+ if (error)
+ return error;
+
+ if (sc->ip)
+ return xfs_trans_roll_inode(&sc->tp, sc->ip);
+ return xfs_repair_roll_ag_trans(sc);
+
+out_cur:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ if (agf_bp != sc->sa.agf_bp)
+ xfs_trans_brelse(sc->tp, agf_bp);
+ return error;
+}
+
+/*
+ * Dispose of btree blocks from an old per-AG btree.
+ *
+ * Walks @exlist and disposes of each extent one block at a time via
+ * xfs_repair_dispose_btree_block(), advancing the extent record's start and
+ * shrinking its length as it goes; a fully processed extent is unlinked and
+ * freed.  Both the success path and the error path end by calling
+ * xfs_repair_cancel_btree_extents() on @exlist.  The filesystem is asserted
+ * to have an rmapbt.  Returns 0 or the first error encountered.
+ */
+int
+xfs_repair_reap_btree_extents(
+ struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type)
+{
+ struct xfs_repair_extent *rex;
+ struct xfs_repair_extent *n;
+ int error = 0;
+
+ ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb));
+
+ /* Dispose of every block from the old btree. */
+ for_each_xfs_repair_extent_safe(rex, n, exlist) {
+ ASSERT(sc->ip != NULL ||
+ XFS_FSB_TO_AGNO(sc->mp, rex->fsbno) == sc->sa.agno);
+
+ trace_xfs_repair_dispose_btree_extent(sc->mp,
+ XFS_FSB_TO_AGNO(sc->mp, rex->fsbno),
+ XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno), rex->len);
+
+ for (; rex->len > 0; rex->len--, rex->fsbno++) {
+ error = xfs_repair_dispose_btree_block(sc, rex->fsbno,
+ oinfo, type);
+ if (error)
+ goto out;
+ }
+ list_del(&rex->list);
+ kmem_free(rex);
+ }
+
+out:
+ xfs_repair_cancel_btree_extents(sc, exlist);
+ return error;
+}
+
+/*
+ * Finding per-AG Btree Roots for AGF/AGI Reconstruction
+ *
+ * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
+ * the AG headers by using the rmap data to rummage through the AG looking for
+ * btree roots. This is not guaranteed to work if the AG is heavily damaged
+ * or the rmap data are corrupt.
+ *
+ * Callers of xfs_repair_find_ag_btree_roots must lock the AGF and AGFL
+ * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
+ * AGI is being rebuilt. They must maintain these locks until it's safe for
+ * other threads to change the btrees' shapes. The caller provides
+ * information about the btrees to look for by passing in an array of
+ * xfs_repair_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
+ * The (root, height) fields will be set on return if anything is found. The
+ * last element of the array should have a NULL buf_ops to mark the end of the
+ * array.
+ *
+ * For every rmapbt record matching any of the rmap owners in btree_info,
+ * read each block referenced by the rmap record. If the block is a btree
+ * block from this filesystem matching any of the magic numbers and has a
+ * level higher than what we've already seen, remember the block and the
+ * height of the tree required to have such a block. When the call completes,
+ * we return the highest block we've found for each btree description; those
+ * should be the roots.
+ */
+
+struct xfs_repair_findroot {
+ struct xfs_scrub_context *sc; /* scrub context of the repair */
+ struct xfs_buf *agfl_bp; /* AGFL buffer given to xfs_agfl_walk(); the caller may pass NULL */
+ struct xfs_agf *agf; /* AGF structure taken from the caller's AGF buffer */
+ struct xfs_repair_find_ag_btree *btree_info; /* btree descriptions; a NULL buf_ops ends the array */
+};
+
+/*
+ * AGFL walk callback: is this AGFL entry the block we are looking for?
+ *
+ * @priv points at the AG block number being searched for.  Returns
+ * XFS_BTREE_QUERY_RANGE_ABORT when @bno matches it, which the caller treats
+ * as "the block is on the AGFL", and 0 otherwise.
+ */
+STATIC int
+xfs_repair_findroot_agfl_walk(
+	struct xfs_mount	*mp,
+	xfs_agblock_t		bno,
+	void			*priv)
+{
+	xfs_agblock_t		*wanted = priv;
+
+	if (bno != *wanted)
+		return 0;
+	return XFS_BTREE_QUERY_RANGE_ABORT;
+}
+
+/*
+ * Does this block match the btree information passed in?
+ *
+ * @ri: findroot state (scrub context, AGF, AGFL buffer).
+ * @fab: btree description being matched; ->root and ->height are updated
+ * when this block qualifies.
+ * @owner: rmap owner of the record that referenced this block.
+ * @agbno: AG block number to examine.
+ * @found_it: set to true when @fab was updated; otherwise left untouched.
+ *
+ * Returns 0 whether or not the block matched (including when the block is
+ * on the AGFL or fails the read verifier), or the error from the AGFL walk
+ * or the buffer read.
+ */
+STATIC int
+xfs_repair_findroot_block(
+ struct xfs_repair_findroot *ri,
+ struct xfs_repair_find_ag_btree *fab,
+ uint64_t owner,
+ xfs_agblock_t agbno,
+ bool *found_it)
+{
+ struct xfs_mount *mp = ri->sc->mp;
+ struct xfs_buf *bp;
+ struct xfs_btree_block *btblock;
+ xfs_daddr_t daddr;
+ int error;
+
+ daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
+
+ /*
+ * Blocks in the AGFL have stale contents that might just happen to
+ * have a matching magic and uuid. We don't want to pull these blocks
+ * in as part of a tree root, so we have to filter out the AGFL stuff
+ * here. If the AGFL looks insane we'll just refuse to repair.
+ */
+ if (owner == XFS_RMAP_OWN_AG) {
+ error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
+ xfs_repair_findroot_agfl_walk, &agbno);
+ if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+ return 0;
+ if (error)
+ return error;
+ }
+
+ error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
+ mp->m_bsize, 0, &bp, NULL);
+ if (error)
+ return error;
+
+ /*
+ * Does this look like a block matching our fs and higher than any
+ * other block we've found so far? If so, reattach buffer verifiers
+ * so the AIL won't complain if the buffer is also dirty.  The uuid
+ * is only compared on filesystems with CRCs enabled.
+ */
+ btblock = XFS_BUF_TO_BLOCK(bp);
+ if (be32_to_cpu(btblock->bb_magic) != fab->magic)
+ goto out;
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+ goto out;
+ bp->b_ops = fab->buf_ops;
+
+ /* Ignore this block if it's lower in the tree than we've seen. */
+ if (fab->root != NULLAGBLOCK &&
+ xfs_btree_get_level(btblock) < fab->height)
+ goto out;
+
+ /*
+ * Make sure we pass the verifiers.  A verifier failure is not
+ * reported as an error; the block is simply not recorded.
+ */
+ bp->b_ops->verify_read(bp);
+ if (bp->b_error)
+ goto out;
+ fab->root = agbno;
+ fab->height = xfs_btree_get_level(btblock) + 1;
+ *found_it = true;
+
+ trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno,
+ be32_to_cpu(btblock->bb_magic), fab->height - 1);
+out:
+ xfs_trans_brelse(ri->sc->tp, bp);
+ return error;
+}
+
+/*
+ * rmap query callback: check every block covered by this rmap record against
+ * each btree description whose rmap owner equals the record's owner.
+ *
+ * @priv is the struct xfs_repair_findroot for this search.  For a given
+ * block, the scan of the description array stops at the first description
+ * that accepts the block.  Returns 0, or the first error reported by
+ * xfs_repair_findroot_block().
+ */
+STATIC int
+xfs_repair_findroot_rmap(
+	struct xfs_btree_cur		*cur,
+	struct xfs_rmap_irec		*rec,
+	void				*priv)
+{
+	struct xfs_repair_findroot	*ri = priv;
+	struct xfs_repair_find_ag_btree	*fab;
+	xfs_agblock_t			off;
+	bool				matched;
+	int				error;
+
+	/* Ignore anything that isn't AG metadata. */
+	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
+		return 0;
+
+	/* Otherwise try each block against each candidate btree type. */
+	for (off = 0; off < rec->rm_blockcount; off++) {
+		matched = false;
+		fab = ri->btree_info;
+		while (fab->buf_ops && !matched) {
+			if (rec->rm_owner == fab->rmap_owner) {
+				error = xfs_repair_findroot_block(ri, fab,
+						rec->rm_owner,
+						rec->rm_startblock + off,
+						&matched);
+				if (error)
+					return error;
+			}
+			fab++;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Find the roots of the per-AG btrees described in btree_info.
+ *
+ * @agf_bp must be locked, and so must @agfl_bp when it is not NULL.  Every
+ * description's (root, height) is reset to (NULLAGBLOCK, 0) before the rmap
+ * btree is queried; the query callback then fills in whatever it finds.
+ * Returns the result of xfs_rmap_query_all().
+ */
+int
+xfs_repair_find_ag_btree_roots(
+	struct xfs_scrub_context	*sc,
+	struct xfs_buf			*agf_bp,
+	struct xfs_repair_find_ag_btree	*btree_info,
+	struct xfs_buf			*agfl_bp)
+{
+	struct xfs_repair_findroot	ri;
+	struct xfs_repair_find_ag_btree	*fab = btree_info;
+	struct xfs_btree_cur		*cur;
+	int				error;
+
+	ASSERT(xfs_buf_islocked(agf_bp));
+	ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));
+
+	ri.sc = sc;
+	ri.btree_info = btree_info;
+	ri.agf = XFS_BUF_TO_AGF(agf_bp);
+	ri.agfl_bp = agfl_bp;
+
+	/* Clear the output fields of every description in the array. */
+	while (fab->buf_ops) {
+		ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
+		ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
+		fab->root = NULLAGBLOCK;
+		fab->height = 0;
+		fab++;
+	}
+
+	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.agno);
+	error = xfs_rmap_query_all(cur, xfs_repair_findroot_rmap, &ri);
+	if (error)
+		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	else
+		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+	return error;
+}
+
+/*
+ * Force a quotacheck the next time we mount.
+ *
+ * Looks up the flag that xfs_quota_chkd_flag() returns for @dqtype.  If the
+ * mount's m_qflags does not have it set, nothing happens.  Otherwise the flag
+ * is cleared from m_qflags and, under m_sb_lock, from the in-core
+ * superblock's sb_qflags, and the superblock is logged in the scrub
+ * transaction.
+ */
+void
+xfs_repair_force_quotacheck(
+	struct xfs_scrub_context	*sc,
+	uint				dqtype)
+{
+	struct xfs_mount		*mp = sc->mp;
+	uint				chkd = xfs_quota_chkd_flag(dqtype);
+
+	if ((mp->m_qflags & chkd) == 0)
+		return;
+
+	mp->m_qflags &= ~chkd;
+
+	spin_lock(&mp->m_sb_lock);
+	mp->m_sb.sb_qflags &= ~chkd;
+	spin_unlock(&mp->m_sb_lock);
+
+	xfs_log_sb(sc->tp);
+}
+
+/*
+ * Attach dquots to the inode under repair, or schedule a quotacheck.
+ *
+ * The dquot code must not allocate an on-disk dquot block here because we
+ * are already in transaction context with the inode locked, so the attach is
+ * requested without allocation; the on-disk dquot should exist already.
+ *
+ * -EFSBADCRC, -EFSCORRUPTED and -ENOENT are logged, a quotacheck is forced
+ * for each enabled quota type whose dquot is still not attached, and 0 is
+ * returned.  -ESRCH is also turned into 0.  Any other value, including 0,
+ * is returned unchanged.
+ */
+int
+xfs_repair_ino_dqattach(
+	struct xfs_scrub_context	*sc)
+{
+	struct xfs_mount		*mp = sc->mp;
+	struct xfs_inode		*ip = sc->ip;
+	int				error;
+
+	error = xfs_qm_dqattach_locked(ip, false);
+
+	if (error == -ESRCH)
+		return 0;
+
+	/* Only corrupt or missing quota data triggers a quotacheck. */
+	if (error != -EFSBADCRC && error != -EFSCORRUPTED &&
+	    error != -ENOENT)
+		return error;
+
+	xfs_err_ratelimited(mp,
+"inode %llu repair encountered quota error %d, quotacheck forced.",
+			(unsigned long long)ip->i_ino, error);
+
+	if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot)
+		xfs_repair_force_quotacheck(sc, XFS_DQ_USER);
+	if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot)
+		xfs_repair_force_quotacheck(sc, XFS_DQ_GROUP);
+	if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot)
+		xfs_repair_force_quotacheck(sc, XFS_DQ_PROJ);
+
+	return 0;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
new file mode 100644
index 000000000000..f2b0895294db
--- /dev/null
+++ b/fs/xfs/scrub/repair.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (C) 2018 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef __XFS_SCRUB_REPAIR_H__
+#define __XFS_SCRUB_REPAIR_H__
+
+static inline int xfs_repair_notsupported(struct xfs_scrub_context *sc)
+{
+ return -EOPNOTSUPP; /* placeholder ->repair handler: always reports "not supported" */
+}
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+
+/* Repair helpers */
+
+int xfs_repair_attempt(struct xfs_inode *ip, struct xfs_scrub_context *sc,
+ bool *fixed);
+void xfs_repair_failure(struct xfs_mount *mp);
+int xfs_repair_roll_ag_trans(struct xfs_scrub_context *sc);
+bool xfs_repair_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
+ enum xfs_ag_resv_type type);
+xfs_extlen_t xfs_repair_calc_ag_resblks(struct xfs_scrub_context *sc);
+int xfs_repair_alloc_ag_block(struct xfs_scrub_context *sc,
+ struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno,
+ enum xfs_ag_resv_type resv);
+int xfs_repair_init_btblock(struct xfs_scrub_context *sc, xfs_fsblock_t fsb,
+ struct xfs_buf **bpp, xfs_btnum_t btnum,
+ const struct xfs_buf_ops *ops);
+
+struct xfs_repair_extent {
+ struct list_head list; /* link in an xfs_repair_extent_list */
+ xfs_fsblock_t fsbno; /* first filesystem block of the extent */
+ xfs_extlen_t len; /* length of the extent, in blocks */
+};
+
+struct xfs_repair_extent_list {
+ struct list_head list; /* head of the chain of xfs_repair_extent records */
+};
+
+static inline void
+xfs_repair_init_extent_list(
+ struct xfs_repair_extent_list *exlist)
+{
+ INIT_LIST_HEAD(&exlist->list); /* initialize the list head of a new extent list */
+}
+
+#define for_each_xfs_repair_extent_safe(rbe, n, exlist) \
+ list_for_each_entry_safe((rbe), (n), &(exlist)->list, list)
+int xfs_repair_collect_btree_extent(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *btlist, xfs_fsblock_t fsbno,
+ xfs_extlen_t len);
+void xfs_repair_cancel_btree_extents(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *btlist);
+int xfs_repair_subtract_extents(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ struct xfs_repair_extent_list *sublist);
+int xfs_repair_fix_freelist(struct xfs_scrub_context *sc, bool can_shrink);
+int xfs_repair_invalidate_blocks(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *btlist);
+int xfs_repair_reap_btree_extents(struct xfs_scrub_context *sc,
+ struct xfs_repair_extent_list *exlist,
+ struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+
+struct xfs_repair_find_ag_btree {
+ /* in: rmap owner of the btree we're looking for */
+ uint64_t rmap_owner;
+
+ /* in: buffer ops; a NULL pointer here marks the end of the array */
+ const struct xfs_buf_ops *buf_ops;
+
+ /* in: magic number of the btree */
+ uint32_t magic;
+
+ /*
+ * out: the highest btree block found and the tree height.  The
+ * search resets these to NULLAGBLOCK and 0 before it starts, so
+ * those values on return mean nothing was found.
+ */
+ xfs_agblock_t root;
+ unsigned int height;
+};
+
+int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc,
+ struct xfs_buf *agf_bp,
+ struct xfs_repair_find_ag_btree *btree_info,
+ struct xfs_buf *agfl_bp);
+void xfs_repair_force_quotacheck(struct xfs_scrub_context *sc, uint dqtype);
+int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc);
+
+/* Metadata repairers */
+
+int xfs_repair_probe(struct xfs_scrub_context *sc);
+int xfs_repair_superblock(struct xfs_scrub_context *sc);
+
+#else
+
+static inline int xfs_repair_attempt(
+ struct xfs_inode *ip,
+ struct xfs_scrub_context *sc,
+ bool *fixed)
+{
+ return -EOPNOTSUPP; /* stub for builds without CONFIG_XFS_ONLINE_REPAIR */
+}
+
+static inline void xfs_repair_failure(struct xfs_mount *mp) {} /* no-op stub */
+
+static inline xfs_extlen_t
+xfs_repair_calc_ag_resblks(
+ struct xfs_scrub_context *sc)
+{
+ ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)); /* the REPAIR flag is not expected in this build */
+ return 0; /* stub: reports zero blocks */
+}
+
+#define xfs_repair_probe xfs_repair_notsupported
+#define xfs_repair_superblock xfs_repair_notsupported
+
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_REPAIR_H__ */
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index 8f2a7c3ff455..b376a9a77c04 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -66,7 +66,7 @@ xfs_scrub_rmapbt_xref_refc(
bool is_unwritten;
int error;
- if (!sc->sa.refc_cur)
+ if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm))
return;
non_inode = XFS_RMAP_NON_INODE_OWNER(irec->rm_owner);
@@ -207,7 +207,7 @@ xfs_scrub_xref_check_owner(
bool has_rmap;
int error;
- if (!sc->sa.rmap_cur)
+ if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
return;
error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo,
@@ -250,7 +250,7 @@ xfs_scrub_xref_has_no_owner(
bool has_rmap;
int error;
- if (!sc->sa.rmap_cur)
+ if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm))
return;
error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap);
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 39c41dfe08ee..40f462a11ea5 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -66,11 +66,15 @@ xfs_scrub_rtbitmap_rec(
void *priv)
{
struct xfs_scrub_context *sc = priv;
+ xfs_rtblock_t startblock;
+ xfs_rtblock_t blockcount;
- if (rec->ar_startblock + rec->ar_blockcount <= rec->ar_startblock ||
- !xfs_verify_rtbno(sc->mp, rec->ar_startblock) ||
- !xfs_verify_rtbno(sc->mp, rec->ar_startblock +
- rec->ar_blockcount - 1))
+ startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize;
+ blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize;
+
+ if (startblock + blockcount <= startblock ||
+ !xfs_verify_rtbno(sc->mp, startblock) ||
+ !xfs_verify_rtbno(sc->mp, startblock + blockcount - 1))
xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
return 0;
}
@@ -82,6 +86,11 @@ xfs_scrub_rtbitmap(
{
int error;
+ /* Invoke the fork scrubber. */
+ error = xfs_scrub_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
error = xfs_rtalloc_query_all(sc->tp, xfs_scrub_rtbitmap_rec, sc);
if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
goto out;
@@ -95,8 +104,35 @@ int
xfs_scrub_rtsummary(
struct xfs_scrub_context *sc)
{
+ struct xfs_inode *rsumip = sc->mp->m_rsumip;
+ struct xfs_inode *old_ip = sc->ip;
+ uint old_ilock_flags = sc->ilock_flags;
+ int error = 0;
+
+ /*
+ * We ILOCK'd the rt bitmap ip in the setup routine, now lock the
+ * rt summary ip in compliance with the rt inode locking rules.
+ *
+ * Since we switch sc->ip to rsumip we have to save the old ilock
+ * flags so that we don't mix up the inode state that @sc tracks.
+ */
+ sc->ip = rsumip;
+ sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM;
+ xfs_ilock(sc->ip, sc->ilock_flags);
+
+ /* Invoke the fork scrubber. */
+ error = xfs_scrub_metadata_inode_forks(sc);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ goto out;
+
/* XXX: implement this some day */
- return -ENOENT;
+ xfs_scrub_set_incomplete(sc);
+out:
+ /* Switch back to the rtbitmap inode and lock flags. */
+ xfs_iunlock(sc->ip, sc->ilock_flags);
+ sc->ilock_flags = old_ilock_flags;
+ sc->ip = old_ip;
+ return error;
}
@@ -107,11 +143,23 @@ xfs_scrub_xref_is_used_rt_space(
xfs_rtblock_t fsbno,
xfs_extlen_t len)
{
+ xfs_rtblock_t startext;
+ xfs_rtblock_t endext;
+ xfs_rtblock_t extcount;
bool is_free;
int error;
+ if (xfs_scrub_skip_xref(sc->sm))
+ return;
+
+ startext = fsbno;
+ endext = fsbno + len - 1;
+ do_div(startext, sc->mp->m_sb.sb_rextsize);
+ if (do_div(endext, sc->mp->m_sb.sb_rextsize))
+ endext++;
+ extcount = endext - startext;
xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
- error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, fsbno, len,
+ error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount,
&is_free);
if (!xfs_scrub_should_check_xref(sc, &error, NULL))
goto out_unlock;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 26c75967a072..36db098ba583 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -42,11 +42,18 @@
#include "xfs_refcount_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/btree.h"
+#include "scrub/repair.h"
/*
* Online Scrub and Repair
@@ -120,6 +127,24 @@
* XCORRUPT flag; btree query function errors are noted by setting the
* XFAIL flag and deleting the cursor to prevent further attempts to
* cross-reference with a defective btree.
+ *
+ * If a piece of metadata proves corrupt or suboptimal, the userspace
+ * program can ask the kernel to apply some tender loving care (TLC) to
+ * the metadata object by setting the REPAIR flag and re-calling the
+ * scrub ioctl. "Corruption" is defined by metadata violating the
+ * on-disk specification; operations cannot continue if the violation is
+ * left untreated. It is possible for XFS to continue if an object is
+ * "suboptimal", however performance may be degraded. Repairs are
+ * usually performed by rebuilding the metadata entirely out of
+ * redundant metadata. Optimizing, on the other hand, can sometimes be
+ * done without rebuilding entire structures.
+ *
+ * Generally speaking, the repair code has the following code structure:
+ * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock.
+ * The first check helps us figure out if we need to rebuild or simply
+ * optimize the structure so that the rebuild knows what to do. The
+ * second check evaluates the completeness of the repair; that is what
+ * is reported to userspace.
*/
/*
@@ -155,7 +180,10 @@ xfs_scrub_teardown(
{
xfs_scrub_ag_free(sc, &sc->sa);
if (sc->tp) {
- xfs_trans_cancel(sc->tp);
+ if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+ error = xfs_trans_commit(sc->tp);
+ else
+ xfs_trans_cancel(sc->tp);
sc->tp = NULL;
}
if (sc->ip) {
@@ -166,6 +194,8 @@ xfs_scrub_teardown(
iput(VFS_I(sc->ip));
sc->ip = NULL;
}
+ if (sc->has_quotaofflock)
+ mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
if (sc->buf) {
kmem_free(sc->buf);
sc->buf = NULL;
@@ -180,126 +210,150 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
.type = ST_NONE,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_probe,
+ .repair = xfs_repair_probe,
},
[XFS_SCRUB_TYPE_SB] = { /* superblock */
.type = ST_PERAG,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_superblock,
+ .repair = xfs_repair_superblock,
},
[XFS_SCRUB_TYPE_AGF] = { /* agf */
.type = ST_PERAG,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_agf,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_AGFL]= { /* agfl */
.type = ST_PERAG,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_agfl,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_AGI] = { /* agi */
.type = ST_PERAG,
.setup = xfs_scrub_setup_fs,
.scrub = xfs_scrub_agi,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_allocbt,
.scrub = xfs_scrub_bnobt,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_allocbt,
.scrub = xfs_scrub_cntbt,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_INOBT] = { /* inobt */
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_iallocbt,
.scrub = xfs_scrub_inobt,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_FINOBT] = { /* finobt */
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_iallocbt,
.scrub = xfs_scrub_finobt,
.has = xfs_sb_version_hasfinobt,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_rmapbt,
.scrub = xfs_scrub_rmapbt,
.has = xfs_sb_version_hasrmapbt,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */
.type = ST_PERAG,
.setup = xfs_scrub_setup_ag_refcountbt,
.scrub = xfs_scrub_refcountbt,
.has = xfs_sb_version_hasreflink,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_INODE] = { /* inode record */
.type = ST_INODE,
.setup = xfs_scrub_setup_inode,
.scrub = xfs_scrub_inode,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */
.type = ST_INODE,
.setup = xfs_scrub_setup_inode_bmap,
.scrub = xfs_scrub_bmap_data,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */
.type = ST_INODE,
.setup = xfs_scrub_setup_inode_bmap,
.scrub = xfs_scrub_bmap_attr,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */
.type = ST_INODE,
.setup = xfs_scrub_setup_inode_bmap,
.scrub = xfs_scrub_bmap_cow,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_DIR] = { /* directory */
.type = ST_INODE,
.setup = xfs_scrub_setup_directory,
.scrub = xfs_scrub_directory,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */
.type = ST_INODE,
.setup = xfs_scrub_setup_xattr,
.scrub = xfs_scrub_xattr,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */
.type = ST_INODE,
.setup = xfs_scrub_setup_symlink,
.scrub = xfs_scrub_symlink,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */
.type = ST_INODE,
.setup = xfs_scrub_setup_parent,
.scrub = xfs_scrub_parent,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_FS,
.setup = xfs_scrub_setup_rt,
.scrub = xfs_scrub_rtbitmap,
.has = xfs_sb_version_hasrealtime,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_FS,
.setup = xfs_scrub_setup_rt,
.scrub = xfs_scrub_rtsummary,
.has = xfs_sb_version_hasrealtime,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */
.type = ST_FS,
.setup = xfs_scrub_setup_quota,
.scrub = xfs_scrub_quota,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */
.type = ST_FS,
.setup = xfs_scrub_setup_quota,
.scrub = xfs_scrub_quota,
+ .repair = xfs_repair_notsupported,
},
[XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */
.type = ST_FS,
.setup = xfs_scrub_setup_quota,
.scrub = xfs_scrub_quota,
+ .repair = xfs_repair_notsupported,
},
};
@@ -379,15 +433,54 @@ xfs_scrub_validate_inputs(
if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
goto out;
- /* We don't know how to repair anything yet. */
- if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
- goto out;
+ /*
+ * We only want to repair read-write v5+ filesystems. Defer the check
+ * for ops->repair until after our scrub confirms that we need to
+ * perform repairs so that we avoid failing due to not supporting
+ * repairing an object that doesn't need repairs.
+ */
+ if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) {
+ error = -EOPNOTSUPP;
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ goto out;
+
+ error = -EROFS;
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ goto out;
+ }
error = 0;
out:
return error;
}
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+/*
+ * Post-scrub reporting when online repair is built in.
+ *
+ * Userspace asked us to repair something, we repaired it, rescanned it, and
+ * the rescan says it's still broken.  Scream about this in the system logs.
+ * Nothing is reported unless the REPAIR input flag is set and one of the
+ * CORRUPT/XCORRUPT output flags is set.
+ */
+static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc)
+{
+	if (!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
+		return;
+	if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+				  XFS_SCRUB_OFLAG_XCORRUPT)))
+		return;
+
+	xfs_repair_failure(sc->mp);
+}
+#else
+/*
+ * Post-scrub reporting when online repair is not built in.
+ *
+ * Userspace asked us to scrub something, it's broken, and we have no way of
+ * fixing it.  Scream in the logs.
+ */
+static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc)
+{
+	if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+				  XFS_SCRUB_OFLAG_XCORRUPT)))
+		return;
+
+	xfs_alert_ratelimited(sc->mp,
+			"Corruption detected during scrub.");
+}
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
/* Dispatch metadata scrubbing. */
int
xfs_scrub_metadata(
@@ -397,6 +490,7 @@ xfs_scrub_metadata(
struct xfs_scrub_context sc;
struct xfs_mount *mp = ip->i_mount;
bool try_harder = false;
+ bool already_fixed = false;
int error = 0;
BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
@@ -446,10 +540,44 @@ retry_op:
} else if (error)
goto out_teardown;
- if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
- XFS_SCRUB_OFLAG_XCORRUPT))
- xfs_alert_ratelimited(mp, "Corruption detected during scrub.");
+ if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) {
+ bool needs_fix;
+
+ /* Let debug users force us into the repair routines. */
+ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
+ sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+
+ needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT |
+ XFS_SCRUB_OFLAG_PREEN));
+ /*
+ * If userspace asked for a repair but it wasn't necessary,
+ * report that back to userspace.
+ */
+ if (!needs_fix) {
+ sc.sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
+ goto out_nofix;
+ }
+
+ /*
+ * If it's broken, userspace wants us to fix it, and we haven't
+ * already tried to fix it, then attempt a repair.
+ */
+ error = xfs_repair_attempt(ip, &sc, &already_fixed);
+ if (error == -EAGAIN) {
+ if (sc.try_harder)
+ try_harder = true;
+ error = xfs_scrub_teardown(&sc, ip, 0);
+ if (error) {
+ xfs_repair_failure(mp);
+ goto out;
+ }
+ goto retry_op;
+ }
+ }
+out_nofix:
+ xfs_scrub_postmortem(&sc);
out_teardown:
error = xfs_scrub_teardown(&sc, ip, error);
out:
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 0d92af86f67a..636424d5e2ee 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -38,6 +38,9 @@ struct xfs_scrub_meta_ops {
/* Examine metadata for errors. */
int (*scrub)(struct xfs_scrub_context *);
+ /* Repair or optimize the metadata. */
+ int (*repair)(struct xfs_scrub_context *);
+
/* Decide if we even have this piece of metadata. */
bool (*has)(struct xfs_sb *);
@@ -48,6 +51,7 @@ struct xfs_scrub_meta_ops {
/* Buffer pointers and btree cursors for an entire AG. */
struct xfs_scrub_ag {
xfs_agnumber_t agno;
+ struct xfs_perag *pag;
/* AG btree roots */
struct xfs_buf *agf_bp;
@@ -73,6 +77,7 @@ struct xfs_scrub_context {
void *buf;
uint ilock_flags;
bool try_harder;
+ bool has_quotaofflock;
/* State tracking for single-AG operations. */
struct xfs_scrub_ag sa;
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 5d2b1c241be5..794d56bb1af8 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -69,6 +69,8 @@ DEFINE_EVENT(xfs_scrub_class, name, \
DEFINE_SCRUB_EVENT(xfs_scrub_start);
DEFINE_SCRUB_EVENT(xfs_scrub_done);
DEFINE_SCRUB_EVENT(xfs_scrub_deadlock_retry);
+DEFINE_SCRUB_EVENT(xfs_repair_attempt);
+DEFINE_SCRUB_EVENT(xfs_repair_done);
TRACE_EVENT(xfs_scrub_op_error,
TP_PROTO(struct xfs_scrub_context *sc, xfs_agnumber_t agno,
@@ -492,6 +494,262 @@ TRACE_EVENT(xfs_scrub_xref_error,
__entry->ret_ip)
);
+/* repair tracepoints */
+#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
+
+DECLARE_EVENT_CLASS(xfs_repair_extent_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len),
+ TP_ARGS(mp, agno, agbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len)
+);
+#define DEFINE_REPAIR_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_repair_extent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len), \
+ TP_ARGS(mp, agno, agbno, len))
+DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_dispose_btree_extent);
+DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_collect_btree_extent);
+DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_agfl_insert);
+
+DECLARE_EVENT_CLASS(xfs_repair_rmap_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ uint64_t owner, uint64_t offset, unsigned int flags),
+ TP_ARGS(mp, agno, agbno, len, owner, offset, flags),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(uint64_t, owner)
+ __field(uint64_t, offset)
+ __field(unsigned int, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->owner = owner;
+ __entry->offset = offset;
+ __entry->flags = flags;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->len,
+ __entry->owner,
+ __entry->offset,
+ __entry->flags)
+);
+#define DEFINE_REPAIR_RMAP_EVENT(name) \
+DEFINE_EVENT(xfs_repair_rmap_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len, \
+ uint64_t owner, uint64_t offset, unsigned int flags), \
+ TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
+DEFINE_REPAIR_RMAP_EVENT(xfs_repair_alloc_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xfs_repair_ialloc_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xfs_repair_rmap_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xfs_repair_bmap_extent_fn);
+
+TRACE_EVENT(xfs_repair_refcount_extent_fn,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ struct xfs_refcount_irec *irec),
+ TP_ARGS(mp, agno, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, startblock)
+ __field(xfs_extlen_t, blockcount)
+ __field(xfs_nlink_t, refcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startblock = irec->rc_startblock;
+ __entry->blockcount = irec->rc_blockcount;
+ __entry->refcount = irec->rc_refcount;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startblock,
+ __entry->blockcount,
+ __entry->refcount)
+)
+
+TRACE_EVENT(xfs_repair_init_btblock,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+ xfs_btnum_t btnum),
+ TP_ARGS(mp, agno, agbno, btnum),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(uint32_t, btnum)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->btnum = btnum;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u btnum %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->btnum)
+)
+TRACE_EVENT(xfs_repair_findroot_block,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+ uint32_t magic, uint16_t level),
+ TP_ARGS(mp, agno, agbno, magic, level),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(uint32_t, magic)
+ __field(uint16_t, level)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->agbno = agbno;
+ __entry->magic = magic;
+ __entry->level = level;
+ ),
+ TP_printk("dev %d:%d agno %u agbno %u magic 0x%x level %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->magic,
+ __entry->level)
+)
+TRACE_EVENT(xfs_repair_calc_ag_resblks,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t icount, xfs_agblock_t aglen, xfs_agblock_t freelen,
+ xfs_agblock_t usedlen),
+ TP_ARGS(mp, agno, icount, aglen, freelen, usedlen),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, icount)
+ __field(xfs_agblock_t, aglen)
+ __field(xfs_agblock_t, freelen)
+ __field(xfs_agblock_t, usedlen)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->icount = icount;
+ __entry->aglen = aglen;
+ __entry->freelen = freelen;
+ __entry->usedlen = usedlen;
+ ),
+ TP_printk("dev %d:%d agno %d icount %u aglen %u freelen %u usedlen %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->icount,
+ __entry->aglen,
+ __entry->freelen,
+ __entry->usedlen)
+)
+TRACE_EVENT(xfs_repair_calc_ag_resblks_btsize,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t bnobt_sz, xfs_agblock_t inobt_sz,
+ xfs_agblock_t rmapbt_sz, xfs_agblock_t refcbt_sz),
+ TP_ARGS(mp, agno, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, bnobt_sz)
+ __field(xfs_agblock_t, inobt_sz)
+ __field(xfs_agblock_t, rmapbt_sz)
+ __field(xfs_agblock_t, refcbt_sz)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->bnobt_sz = bnobt_sz;
+ __entry->inobt_sz = inobt_sz;
+ __entry->rmapbt_sz = rmapbt_sz;
+ __entry->refcbt_sz = refcbt_sz;
+ ),
+ TP_printk("dev %d:%d agno %d bno %u ino %u rmap %u refcount %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->bnobt_sz,
+ __entry->inobt_sz,
+ __entry->rmapbt_sz,
+ __entry->refcbt_sz)
+)
+TRACE_EVENT(xfs_repair_reset_counters,
+ TP_PROTO(struct xfs_mount *mp),
+ TP_ARGS(mp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ ),
+ TP_printk("dev %d:%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev))
+)
+
+TRACE_EVENT(xfs_repair_ialloc_insert,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agino_t startino, uint16_t holemask, uint8_t count,
+ uint8_t freecount, uint64_t freemask),
+ TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(uint16_t, holemask)
+ __field(uint8_t, count)
+ __field(uint8_t, freecount)
+ __field(uint64_t, freemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = startino;
+ __entry->holemask = holemask;
+ __entry->count = count;
+ __entry->freecount = freecount;
+ __entry->freemask = freemask;
+ ),
+ TP_printk("dev %d:%d agno %d startino %u holemask 0x%x count %u freecount %u freemask 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startino,
+ __entry->holemask,
+ __entry->count,
+ __entry->freecount,
+ __entry->freemask)
+)
+
+#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
+
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 102463543db3..ca6903726689 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1378,10 +1378,9 @@ xfs_vm_bmap(
struct address_space *mapping,
sector_t block)
{
- struct inode *inode = (struct inode *)mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(mapping->host);
- trace_xfs_vm_bmap(XFS_I(inode));
+ trace_xfs_vm_bmap(ip);
/*
* The swap code (ab-)uses ->bmap to get a block mapping and then
@@ -1394,9 +1393,7 @@ xfs_vm_bmap(
*/
if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
return 0;
-
- filemap_write_and_wait(mapping);
- return generic_block_bmap(mapping, block, xfs_get_blocks);
+ return iomap_bmap(mapping, block, &xfs_iomap_ops);
}
STATIC int
@@ -1475,6 +1472,16 @@ xfs_vm_set_page_dirty(
return newly_dirty;
}
+static int
+xfs_iomap_swapfile_activate(
+ struct swap_info_struct *sis,
+ struct file *swap_file,
+ sector_t *span)
+{
+ sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file));
+ return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops);
+}
+
const struct address_space_operations xfs_address_space_operations = {
.readpage = xfs_vm_readpage,
.readpages = xfs_vm_readpages,
@@ -1488,6 +1495,7 @@ const struct address_space_operations xfs_address_space_operations = {
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
+ .swap_activate = xfs_iomap_swapfile_activate,
};
const struct address_space_operations xfs_dax_aops = {
@@ -1495,4 +1503,5 @@ const struct address_space_operations xfs_dax_aops = {
.direct_IO = noop_direct_IO,
.set_page_dirty = noop_set_page_dirty,
.invalidatepage = noop_invalidatepage,
+ .swap_activate = xfs_iomap_swapfile_activate,
};
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 2203465e63ea..618bb71535c8 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -160,7 +160,7 @@ STATIC void
xfs_bui_item_unlock(
struct xfs_log_item *lip)
{
- if (lip->li_flags & XFS_LI_ABORTED)
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
xfs_bui_release(BUI_ITEM(lip));
}
@@ -305,7 +305,7 @@ xfs_bud_item_unlock(
{
struct xfs_bud_log_item *budp = BUD_ITEM(lip);
- if (lip->li_flags & XFS_LI_ABORTED) {
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
xfs_bui_release(budp->bud_buip);
kmem_zone_free(xfs_bud_zone, budp);
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 8cd8c412f52d..06badcbadeb4 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -848,7 +848,7 @@ xfs_free_eofblocks(
/*
* Attach the dquots to the inode up front.
*/
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -871,8 +871,8 @@ xfs_free_eofblocks(
* contents of the file are flushed to disk then the files
* may be full of holes (ie NULL files bug).
*/
- error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
- XFS_ISIZE(ip));
+ error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK,
+ XFS_ISIZE(ip), XFS_BMAPI_NODISCARD);
if (error) {
/*
* If we get an error at this point we simply don't
@@ -918,7 +918,7 @@ xfs_alloc_file_space(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -1169,7 +1169,7 @@ xfs_free_file_space(
trace_xfs_free_file_space(ip);
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return error;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 55661cbdb51b..5179ab9e3d6a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -549,17 +549,31 @@ xfs_buf_hash_destroy(
}
/*
- * Look up, and creates if absent, a lockable buffer for
- * a given range of an inode. The buffer is returned
- * locked. No I/O is implied by this call.
+ * Look up a buffer in the buffer cache and return it referenced and locked
+ * in @found_bp.
+ *
+ * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
+ * cache.
+ *
+ * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
+ * -EAGAIN if we fail to lock it.
+ *
+ * Return values are:
+ * -EFSCORRUPTED if we have been supplied with an invalid address
+ * -EAGAIN on trylock failure
+ * -ENOENT if we fail to find a match and @new_bp was NULL
+ * 0, with @found_bp:
+ * - @new_bp if we inserted it into the cache
+ * - the buffer we found and locked.
*/
-xfs_buf_t *
-_xfs_buf_find(
+static int
+xfs_buf_find(
struct xfs_buftarg *btp,
struct xfs_buf_map *map,
int nmaps,
xfs_buf_flags_t flags,
- xfs_buf_t *new_bp)
+ struct xfs_buf *new_bp,
+ struct xfs_buf **found_bp)
{
struct xfs_perag *pag;
xfs_buf_t *bp;
@@ -567,6 +581,8 @@ _xfs_buf_find(
xfs_daddr_t eofs;
int i;
+ *found_bp = NULL;
+
for (i = 0; i < nmaps; i++)
cmap.bm_len += map[i].bm_len;
@@ -580,16 +596,11 @@ _xfs_buf_find(
*/
eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
- /*
- * XXX (dgc): we should really be returning -EFSCORRUPTED here,
- * but none of the higher level infrastructure supports
- * returning a specific error on buffer lookup failures.
- */
xfs_alert(btp->bt_mount,
"%s: daddr 0x%llx out of range, EOFS 0x%llx",
__func__, cmap.bm_bn, eofs);
WARN_ON(1);
- return NULL;
+ return -EFSCORRUPTED;
}
pag = xfs_perag_get(btp->bt_mount,
@@ -604,19 +615,20 @@ _xfs_buf_find(
}
/* No match found */
- if (new_bp) {
- /* the buffer keeps the perag reference until it is freed */
- new_bp->b_pag = pag;
- rhashtable_insert_fast(&pag->pag_buf_hash,
- &new_bp->b_rhash_head,
- xfs_buf_hash_params);
- spin_unlock(&pag->pag_buf_lock);
- } else {
+ if (!new_bp) {
XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
+ return -ENOENT;
}
- return new_bp;
+
+ /* the buffer keeps the perag reference until it is freed */
+ new_bp->b_pag = pag;
+ rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
+ xfs_buf_hash_params);
+ spin_unlock(&pag->pag_buf_lock);
+ *found_bp = new_bp;
+ return 0;
found:
spin_unlock(&pag->pag_buf_lock);
@@ -626,7 +638,7 @@ found:
if (flags & XBF_TRYLOCK) {
xfs_buf_rele(bp);
XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
- return NULL;
+ return -EAGAIN;
}
xfs_buf_lock(bp);
XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
@@ -646,6 +658,24 @@ found:
trace_xfs_buf_find(bp, flags, _RET_IP_);
XFS_STATS_INC(btp->bt_mount, xb_get_locked);
+ *found_bp = bp;
+ return 0;
+}
+
+struct xfs_buf *
+xfs_buf_incore(
+ struct xfs_buftarg *target,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ xfs_buf_flags_t flags)
+{
+ struct xfs_buf *bp;
+ int error;
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+
+ error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
+ if (error)
+ return NULL;
return bp;
}
@@ -665,9 +695,27 @@ xfs_buf_get_map(
struct xfs_buf *new_bp;
int error = 0;
- bp = _xfs_buf_find(target, map, nmaps, flags, NULL);
- if (likely(bp))
+ error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
+
+ switch (error) {
+ case 0:
+ /* cache hit */
goto found;
+ case -EAGAIN:
+ /* cache hit, trylock failure, caller handles failure */
+ ASSERT(flags & XBF_TRYLOCK);
+ return NULL;
+ case -ENOENT:
+ /* cache miss, go for insert */
+ break;
+ case -EFSCORRUPTED:
+ default:
+ /*
+ * None of the higher layers understand failure types
+ * yet, so return NULL to signal a fatal lookup error.
+ */
+ return NULL;
+ }
new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
if (unlikely(!new_bp))
@@ -679,8 +727,8 @@ xfs_buf_get_map(
return NULL;
}
- bp = _xfs_buf_find(target, map, nmaps, flags, new_bp);
- if (!bp) {
+ error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
+ if (error) {
xfs_buf_free(new_bp);
return NULL;
}
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index edced162a674..f5f2b71c2fde 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -218,20 +218,9 @@ typedef struct xfs_buf {
} xfs_buf_t;
/* Finding and Reading Buffers */
-struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target,
- struct xfs_buf_map *map, int nmaps,
- xfs_buf_flags_t flags, struct xfs_buf *new_bp);
-
-static inline struct xfs_buf *
-xfs_incore(
- struct xfs_buftarg *target,
- xfs_daddr_t blkno,
- size_t numblks,
- xfs_buf_flags_t flags)
-{
- DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return _xfs_buf_find(target, &map, 1, flags, NULL);
-}
+struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target,
+ xfs_daddr_t blkno, size_t numblks,
+ xfs_buf_flags_t flags);
struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target,
struct xfs_buf_map *map, int nmaps,
@@ -358,6 +347,18 @@ extern void xfs_buf_terminate(void);
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref);
+/*
+ * If the buffer is already on the LRU or has an LRU reference count above 1,
+ * do nothing. Otherwise set its LRU reference count to 0 so it will be
+ * tossed from the cache when released.
+ */
+static inline void xfs_buf_oneshot(struct xfs_buf *bp)
+{
+ if (!list_empty(&bp->b_lru) || atomic_read(&bp->b_lru_ref) > 1)
+ return;
+ atomic_set(&bp->b_lru_ref, 0);
+}
+
static inline int xfs_buf_ispinned(struct xfs_buf *bp)
{
return atomic_read(&bp->b_pin_count);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 82ad270e390e..c2311379d1c3 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -438,7 +438,7 @@ xfs_buf_item_unpin(
* xfs_trans_uncommit() will try to reference the
* buffer which we no longer have a hold on.
*/
- if (lip->li_desc)
+ if (!list_empty(&lip->li_trans))
xfs_trans_del_item(lip);
/*
@@ -568,13 +568,15 @@ xfs_buf_item_unlock(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- bool aborted = !!(lip->li_flags & XFS_LI_ABORTED);
+ bool aborted;
bool hold = !!(bip->bli_flags & XFS_BLI_HOLD);
bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY);
#if defined(DEBUG) || defined(XFS_WARN)
bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED);
#endif
+ aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags);
+
/* Clear the buffer's association with this transaction. */
bp->b_transp = NULL;
@@ -743,8 +745,10 @@ xfs_buf_item_init(
* nothing to do here so return.
*/
ASSERT(bp->b_target->bt_mount == mp);
- if (bip != NULL) {
+ if (bip) {
ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+ ASSERT(!bp->b_transp);
+ ASSERT(bip->bli_buf == bp);
return 0;
}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index a7daef9e16bf..2567391489bd 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -288,49 +288,43 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
}
/*
- * Allocate a block and fill it with dquots.
- * This is called when the bmapi finds a hole.
+ * Ensure that the given in-core dquot has a buffer on disk backing it, and
+ * return the buffer. This is called when the bmapi finds a hole.
*/
STATIC int
-xfs_qm_dqalloc(
- xfs_trans_t **tpp,
- xfs_mount_t *mp,
- xfs_dquot_t *dqp,
- xfs_inode_t *quotip,
- xfs_fileoff_t offset_fsb,
- xfs_buf_t **O_bpp)
+xfs_dquot_disk_alloc(
+ struct xfs_trans **tpp,
+ struct xfs_dquot *dqp,
+ struct xfs_buf **bpp)
{
- xfs_fsblock_t firstblock;
- struct xfs_defer_ops dfops;
- xfs_bmbt_irec_t map;
- int nmaps, error;
- xfs_buf_t *bp;
- xfs_trans_t *tp = *tpp;
-
- ASSERT(tp != NULL);
+ struct xfs_bmbt_irec map;
+ struct xfs_defer_ops dfops;
+ struct xfs_mount *mp = (*tpp)->t_mountp;
+ struct xfs_buf *bp;
+ struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags);
+ xfs_fsblock_t firstblock;
+ int nmaps = 1;
+ int error;
trace_xfs_dqalloc(dqp);
- /*
- * Initialize the bmap freelist prior to calling bmapi code.
- */
xfs_defer_init(&dfops, &firstblock);
xfs_ilock(quotip, XFS_ILOCK_EXCL);
- /*
- * Return if this type of quotas is turned off while we didn't
- * have an inode lock
- */
if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
+ /*
+ * Return if this type of quotas is turned off while we didn't
+ * have an inode lock
+ */
xfs_iunlock(quotip, XFS_ILOCK_EXCL);
return -ESRCH;
}
- xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
- nmaps = 1;
- error = xfs_bmapi_write(tp, quotip, offset_fsb,
- XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
- &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
- &map, &nmaps, &dfops);
+ /* Create the block mapping. */
+ xfs_trans_ijoin(*tpp, quotip, XFS_ILOCK_EXCL);
+ error = xfs_bmapi_write(*tpp, quotip, dqp->q_fileoffset,
+ XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
+ &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp),
+ &map, &nmaps, &dfops);
if (error)
goto error0;
ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
@@ -344,10 +338,8 @@ xfs_qm_dqalloc(
dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
/* now we can just get the buffer (there's nothing to read yet) */
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
- dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen,
- 0);
+ bp = xfs_trans_get_buf(*tpp, mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0);
if (!bp) {
error = -ENOMEM;
goto error1;
@@ -358,37 +350,45 @@ xfs_qm_dqalloc(
* Make a chunk of dquots out of this buffer and log
* the entire thing.
*/
- xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id),
+ xfs_qm_init_dquot_blk(*tpp, mp, be32_to_cpu(dqp->q_core.d_id),
dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
+ xfs_buf_set_ref(bp, XFS_DQUOT_REF);
/*
- * xfs_defer_finish() may commit the current transaction and
- * start a second transaction if the freelist is not empty.
+ * Hold the buffer and join it to the dfops so that we'll still own
+ * the buffer when we return to the caller. The buffer disposal on
+ * error must be paid attention to very carefully, as it has been
+ * broken since commit efa092f3d4c6 "[XFS] Fixes a bug in the quota
+ * code when allocating a new dquot record" in 2005, and the later
+ * conversion to xfs_defer_ops in commit 310a75a3c6c747 failed to keep
+ * the buffer locked across the _defer_finish call. We can now do
+ * this correctly with xfs_defer_bjoin.
*
- * Since we still want to modify this buffer, we need to
- * ensure that the buffer is not released on commit of
- * the first transaction and ensure the buffer is added to the
- * second transaction.
+ * Above, we allocated a disk block for the dquot information and
+ * used get_buf to initialize the dquot. If the _defer_bjoin fails,
+ * the buffer is still locked to *tpp, so we must _bhold_release and
+ * then _trans_brelse the buffer. If the _defer_finish fails, the old
+ * transaction is gone but the new buffer is not joined or held to any
+ * transaction, so we must _buf_relse it.
*
- * If there is only one transaction then don't stop the buffer
- * from being released when it commits later on.
+ * If everything succeeds, the caller of this function is returned a
+ * buffer that is locked and held to the transaction. The caller
+ * is responsible for unlocking any buffer passed back, either
+ * manually or by committing the transaction.
*/
-
- xfs_trans_bhold(tp, bp);
-
+ xfs_trans_bhold(*tpp, bp);
+ error = xfs_defer_bjoin(&dfops, bp);
+ if (error) {
+ xfs_trans_bhold_release(*tpp, bp);
+ xfs_trans_brelse(*tpp, bp);
+ goto error1;
+ }
error = xfs_defer_finish(tpp, &dfops);
- if (error)
+ if (error) {
+ xfs_buf_relse(bp);
goto error1;
-
- /* Transaction was committed? */
- if (*tpp != tp) {
- tp = *tpp;
- xfs_trans_bjoin(tp, bp);
- } else {
- xfs_trans_bhold_release(tp, bp);
}
-
- *O_bpp = bp;
+ *bpp = bp;
return 0;
error1:
@@ -398,32 +398,24 @@ error0:
}
/*
- * Maps a dquot to the buffer containing its on-disk version.
- * This returns a ptr to the buffer containing the on-disk dquot
- * in the bpp param, and a ptr to the on-disk dquot within that buffer
+ * Read in the in-core dquot's on-disk metadata and return the buffer.
+ * Returns ENOENT to signal a hole.
*/
STATIC int
-xfs_qm_dqtobp(
- xfs_trans_t **tpp,
- xfs_dquot_t *dqp,
- xfs_disk_dquot_t **O_ddpp,
- xfs_buf_t **O_bpp,
- uint flags)
+xfs_dquot_disk_read(
+ struct xfs_mount *mp,
+ struct xfs_dquot *dqp,
+ struct xfs_buf **bpp)
{
struct xfs_bmbt_irec map;
- int nmaps = 1, error;
struct xfs_buf *bp;
- struct xfs_inode *quotip;
- struct xfs_mount *mp = dqp->q_mount;
- xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
- struct xfs_trans *tp = (tpp ? *tpp : NULL);
+ struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags);
uint lock_mode;
-
- quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags);
- dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
+ int nmaps = 1;
+ int error;
lock_mode = xfs_ilock_data_map_shared(quotip);
- if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
+ if (!xfs_this_quota_on(mp, dqp->dq_flags)) {
/*
* Return if this type of quotas is turned off while we
* didn't have the quota inode lock.
@@ -436,81 +428,48 @@ xfs_qm_dqtobp(
* Find the block map; no allocations yet
*/
error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
- XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
-
+ XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
xfs_iunlock(quotip, lock_mode);
if (error)
return error;
ASSERT(nmaps == 1);
- ASSERT(map.br_blockcount == 1);
+ ASSERT(map.br_blockcount >= 1);
+ ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+ if (map.br_startblock == HOLESTARTBLOCK)
+ return -ENOENT;
+
+ trace_xfs_dqtobp_read(dqp);
/*
- * Offset of dquot in the (fixed sized) dquot chunk.
+ * store the blkno etc so that we don't have to do the
+ * mapping all the time
*/
- dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
- sizeof(xfs_dqblk_t);
-
- ASSERT(map.br_startblock != DELAYSTARTBLOCK);
- if (map.br_startblock == HOLESTARTBLOCK) {
- /*
- * We don't allocate unless we're asked to
- */
- if (!(flags & XFS_QMOPT_DQALLOC))
- return -ENOENT;
-
- ASSERT(tp);
- error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
- dqp->q_fileoffset, &bp);
- if (error)
- return error;
- tp = *tpp;
- } else {
- trace_xfs_dqtobp_read(dqp);
+ dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
- /*
- * store the blkno etc so that we don't have to do the
- * mapping all the time
- */
- dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
-
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen,
- 0, &bp, &xfs_dquot_buf_ops);
- if (error) {
- ASSERT(bp == NULL);
- return error;
- }
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+ &xfs_dquot_buf_ops);
+ if (error) {
+ ASSERT(bp == NULL);
+ return error;
}
ASSERT(xfs_buf_islocked(bp));
- *O_bpp = bp;
- *O_ddpp = bp->b_addr + dqp->q_bufoffset;
+ xfs_buf_set_ref(bp, XFS_DQUOT_REF);
+ *bpp = bp;
return 0;
}
-
-/*
- * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
- * and release the buffer immediately.
- *
- * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed.
- */
-int
-xfs_qm_dqread(
+/* Allocate and initialize everything we need for an incore dquot. */
+STATIC struct xfs_dquot *
+xfs_dquot_alloc(
struct xfs_mount *mp,
xfs_dqid_t id,
- uint type,
- uint flags,
- struct xfs_dquot **O_dqpp)
+ uint type)
{
struct xfs_dquot *dqp;
- struct xfs_disk_dquot *ddqp;
- struct xfs_buf *bp;
- struct xfs_trans *tp = NULL;
- int error;
dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP);
@@ -520,6 +479,12 @@ xfs_qm_dqread(
INIT_LIST_HEAD(&dqp->q_lru);
mutex_init(&dqp->q_qlock);
init_waitqueue_head(&dqp->q_pinwait);
+ dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
+ /*
+ * Offset of dquot in the (fixed sized) dquot chunk.
+ */
+ dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
+ sizeof(xfs_dqblk_t);
/*
* Because we want to use a counting completion, complete
@@ -548,35 +513,22 @@ xfs_qm_dqread(
break;
}
- XFS_STATS_INC(mp, xs_qm_dquot);
-
- trace_xfs_dqread(dqp);
+ xfs_qm_dquot_logitem_init(dqp);
- if (flags & XFS_QMOPT_DQALLOC) {
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
- XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
- if (error)
- goto error0;
- }
+ XFS_STATS_INC(mp, xs_qm_dquot);
+ return dqp;
+}
- /*
- * get a pointer to the on-disk dquot and the buffer containing it
- * dqp already knows its own type (GROUP/USER).
- */
- error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags);
- if (error) {
- /*
- * This can happen if quotas got turned off (ESRCH),
- * or if the dquot didn't exist on disk and we ask to
- * allocate (ENOENT).
- */
- trace_xfs_dqread_fail(dqp);
- goto error1;
- }
+/* Copy the in-core quota fields in from the on-disk buffer. */
+STATIC void
+xfs_dquot_from_disk(
+ struct xfs_dquot *dqp,
+ struct xfs_buf *bp)
+{
+ struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset;
/* copy everything from disk dquot to the incore dquot */
memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
- xfs_qm_dquot_logitem_init(dqp);
/*
* Reservation counters are defined as reservation plus current usage
@@ -588,40 +540,90 @@ xfs_qm_dqread(
/* initialize the dquot speculative prealloc thresholds */
xfs_dquot_set_prealloc_limits(dqp);
+}
- /* Mark the buf so that this will stay incore a little longer */
- xfs_buf_set_ref(bp, XFS_DQUOT_REF);
+/* Allocate and initialize the dquot buffer for this in-core dquot. */
+static int
+xfs_qm_dqread_alloc(
+ struct xfs_mount *mp,
+ struct xfs_dquot *dqp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_trans *tp;
+ struct xfs_buf *bp;
+ int error;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc,
+ XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp);
+ if (error)
+ goto err;
+
+ error = xfs_dquot_disk_alloc(&tp, dqp, &bp);
+ if (error)
+ goto err_cancel;
+
+ error = xfs_trans_commit(tp);
+ if (error) {
+ /*
+ * Buffer was held to the transaction, so we have to unlock it
+ * manually here because we're not passing it back.
+ */
+ xfs_buf_relse(bp);
+ goto err;
+ }
+ *bpp = bp;
+ return 0;
+
+err_cancel:
+ xfs_trans_cancel(tp);
+err:
+ return error;
+}
+
+/*
+ * Read in the ondisk dquot using xfs_dquot_disk_read(), copy it to an incore
+ * version, and release the buffer immediately. If @can_alloc is true, fill
+ * any holes in the on-disk metadata.
+ */
+static int
+xfs_qm_dqread(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ bool can_alloc,
+ struct xfs_dquot **dqpp)
+{
+ struct xfs_dquot *dqp;
+ struct xfs_buf *bp;
+ int error;
+
+ dqp = xfs_dquot_alloc(mp, id, type);
+ trace_xfs_dqread(dqp);
+
+ /* Try to read the buffer, allocating if necessary. */
+ error = xfs_dquot_disk_read(mp, dqp, &bp);
+ if (error == -ENOENT && can_alloc)
+ error = xfs_qm_dqread_alloc(mp, dqp, &bp);
+ if (error)
+ goto err;
/*
- * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
- * So we need to release with xfs_trans_brelse().
- * The strategy here is identical to that of inodes; we lock
- * the dquot in xfs_qm_dqget() before making it accessible to
- * others. This is because dquots, like inodes, need a good level of
- * concurrency, and we don't want to take locks on the entire buffers
- * for dquot accesses.
- * Note also that the dquot buffer may even be dirty at this point, if
- * this particular dquot was repaired. We still aren't afraid to
- * brelse it because we have the changes incore.
+ * At this point we should have a clean locked buffer. Copy the data
+ * to the incore dquot and release the buffer since the incore dquot
+ * has its own locking protocol so we needn't tie up the buffer any
+ * further.
*/
ASSERT(xfs_buf_islocked(bp));
- xfs_trans_brelse(tp, bp);
+ xfs_dquot_from_disk(dqp, bp);
- if (tp) {
- error = xfs_trans_commit(tp);
- if (error)
- goto error0;
- }
-
- *O_dqpp = dqp;
+ xfs_buf_relse(bp);
+ *dqpp = dqp;
return error;
-error1:
- if (tp)
- xfs_trans_cancel(tp);
-error0:
+err:
+ trace_xfs_dqread_fail(dqp);
xfs_qm_dqdestroy(dqp);
- *O_dqpp = NULL;
+ *dqpp = NULL;
return error;
}
@@ -679,77 +681,230 @@ xfs_dq_get_next_id(
}
/*
- * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
- * a locked dquot, doing an allocation (if requested) as needed.
- * When both an inode and an id are given, the inode's id takes precedence.
- * That is, if the id changes while we don't hold the ilock inside this
- * function, the new dquot is returned, not necessarily the one requested
- * in the id argument.
+ * Look up the dquot in the in-core cache. If found, the dquot is returned
+ * locked and ready to go.
+ */
+static struct xfs_dquot *
+xfs_qm_dqget_cache_lookup(
+ struct xfs_mount *mp,
+ struct xfs_quotainfo *qi,
+ struct radix_tree_root *tree,
+ xfs_dqid_t id)
+{
+ struct xfs_dquot *dqp;
+
+restart:
+ mutex_lock(&qi->qi_tree_lock);
+ dqp = radix_tree_lookup(tree, id);
+ if (!dqp) {
+ mutex_unlock(&qi->qi_tree_lock);
+ XFS_STATS_INC(mp, xs_qm_dqcachemisses);
+ return NULL;
+ }
+
+ xfs_dqlock(dqp);
+ if (dqp->dq_flags & XFS_DQ_FREEING) {
+ xfs_dqunlock(dqp);
+ mutex_unlock(&qi->qi_tree_lock);
+ trace_xfs_dqget_freeing(dqp);
+ delay(1);
+ goto restart;
+ }
+
+ dqp->q_nrefs++;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ trace_xfs_dqget_hit(dqp);
+ XFS_STATS_INC(mp, xs_qm_dqcachehits);
+ return dqp;
+}
+
+/*
+ * Try to insert a new dquot into the in-core cache. If an error occurs the
+ * caller should throw away the dquot and start over. Otherwise, the dquot
+ * is returned locked (and held by the cache) as if there had been a cache
+ * hit.
+ */
+static int
+xfs_qm_dqget_cache_insert(
+ struct xfs_mount *mp,
+ struct xfs_quotainfo *qi,
+ struct radix_tree_root *tree,
+ xfs_dqid_t id,
+ struct xfs_dquot *dqp)
+{
+ int error;
+
+ mutex_lock(&qi->qi_tree_lock);
+ error = radix_tree_insert(tree, id, dqp);
+ if (unlikely(error)) {
+ /* Duplicate found! Caller must try again. */
+ WARN_ON(error != -EEXIST);
+ mutex_unlock(&qi->qi_tree_lock);
+ trace_xfs_dqget_dup(dqp);
+ return error;
+ }
+
+ /* Return a locked dquot to the caller, with a reference taken. */
+ xfs_dqlock(dqp);
+ dqp->q_nrefs = 1;
+
+ qi->qi_dquots++;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ return 0;
+}
+
+/*
+ * Check the input parameters shared by the dqget entry points.
+ *
+ * Returns 0 if quotas are running and the requested quota type is enabled on
+ * this mount, -ESRCH if quotas or that particular quota type are off, or
+ * -EINVAL (with a one-time warning) if @type is not a known quota type.
+ */
+static int
+xfs_qm_dqget_checks(
+ struct xfs_mount *mp,
+ uint type)
+{
+ /* Callers should never get here with quotas shut down; complain once. */
+ if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp)))
+ return -ESRCH;
+
+ switch (type) {
+ case XFS_DQ_USER:
+ if (!XFS_IS_UQUOTA_ON(mp))
+ return -ESRCH;
+ return 0;
+ case XFS_DQ_GROUP:
+ if (!XFS_IS_GQUOTA_ON(mp))
+ return -ESRCH;
+ return 0;
+ case XFS_DQ_PROJ:
+ if (!XFS_IS_PQUOTA_ON(mp))
+ return -ESRCH;
+ return 0;
+ default:
+ /*
+ * An unknown quota type is a caller bug. The condition must be
+ * nonzero for the warning to fire; WARN_ON_ONCE(0) is a no-op.
+ */
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+}
+
+/*
+ * Given the file system, id, and type (UDQUOT/GDQUOT), return a locked
+ * dquot, doing an allocation (if requested) as needed.
*/
int
xfs_qm_dqget(
- xfs_mount_t *mp,
- xfs_inode_t *ip, /* locked inode (optional) */
- xfs_dqid_t id, /* uid/projid/gid depending on type */
- uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */
- uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
- xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ bool can_alloc,
+ struct xfs_dquot **O_dqpp)
{
struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
struct xfs_dquot *dqp;
int error;
- ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
- (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
- (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
- return -ESRCH;
+ error = xfs_qm_dqget_checks(mp, type);
+ if (error)
+ return error;
+
+restart:
+ dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id);
+ if (dqp) {
+ *O_dqpp = dqp;
+ return 0;
}
- ASSERT(type == XFS_DQ_USER ||
- type == XFS_DQ_PROJ ||
- type == XFS_DQ_GROUP);
- if (ip) {
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(xfs_inode_dquot(ip, type) == NULL);
+ error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
+ if (error)
+ return error;
+
+ error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp);
+ if (error) {
+ /*
+ * Duplicate found. Just throw away the new dquot and start
+ * over.
+ */
+ xfs_qm_dqdestroy(dqp);
+ XFS_STATS_INC(mp, xs_qm_dquot_dups);
+ goto restart;
}
-restart:
- mutex_lock(&qi->qi_tree_lock);
- dqp = radix_tree_lookup(tree, id);
- if (dqp) {
- xfs_dqlock(dqp);
- if (dqp->dq_flags & XFS_DQ_FREEING) {
- xfs_dqunlock(dqp);
- mutex_unlock(&qi->qi_tree_lock);
- trace_xfs_dqget_freeing(dqp);
- delay(1);
- goto restart;
- }
+ trace_xfs_dqget_miss(dqp);
+ *O_dqpp = dqp;
+ return 0;
+}
- /* uninit / unused quota found in radix tree, keep looking */
- if (flags & XFS_QMOPT_DQNEXT) {
- if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
- xfs_dqunlock(dqp);
- mutex_unlock(&qi->qi_tree_lock);
- error = xfs_dq_get_next_id(mp, type, &id);
- if (error)
- return error;
- goto restart;
- }
- }
+/*
+ * Given a dquot id and type, read and initialize a dquot from the on-disk
+ * metadata. This function is only for use during quota initialization so
+ * it ignores the dquot cache assuming that the dquot shrinker isn't set up.
+ * The caller is responsible for _qm_dqdestroy'ing the returned dquot.
+ */
+int
+xfs_qm_dqget_uncached(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ struct xfs_dquot **dqpp)
+{
+ int error;
- dqp->q_nrefs++;
- mutex_unlock(&qi->qi_tree_lock);
+ error = xfs_qm_dqget_checks(mp, type);
+ if (error)
+ return error;
+
+ return xfs_qm_dqread(mp, id, type, 0, dqpp);
+}
+
+/* Return the quota id for a given inode and type. */
+xfs_dqid_t
+xfs_qm_id_for_quotatype(
+ struct xfs_inode *ip,
+ uint type)
+{
+ switch (type) {
+ case XFS_DQ_USER:
+ return ip->i_d.di_uid;
+ case XFS_DQ_GROUP:
+ return ip->i_d.di_gid;
+ case XFS_DQ_PROJ:
+ return xfs_get_projid(ip);
+ }
+ ASSERT(0);
+ return 0;
+}
+
+/*
+ * Return the dquot for a given inode and type. If @can_alloc is true, then
+ * allocate blocks if needed. The inode's ILOCK must be held and it must not
+ * have already had a dquot of that type attached.
+ */
+int
+xfs_qm_dqget_inode(
+ struct xfs_inode *ip,
+ uint type,
+ bool can_alloc,
+ struct xfs_dquot **O_dqpp)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, type);
+ struct xfs_dquot *dqp;
+ xfs_dqid_t id;
+ int error;
- trace_xfs_dqget_hit(dqp);
- XFS_STATS_INC(mp, xs_qm_dqcachehits);
+ error = xfs_qm_dqget_checks(mp, type);
+ if (error)
+ return error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(xfs_inode_dquot(ip, type) == NULL);
+
+ id = xfs_qm_id_for_quotatype(ip, type);
+
+restart:
+ dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id);
+ if (dqp) {
*O_dqpp = dqp;
return 0;
}
- mutex_unlock(&qi->qi_tree_lock);
- XFS_STATS_INC(mp, xs_qm_dqcachemisses);
/*
* Dquot cache miss. We don't want to keep the inode lock across
@@ -758,87 +913,81 @@ restart:
* lock here means dealing with a chown that can happen before
* we re-acquire the lock.
*/
- if (ip)
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
- error = xfs_qm_dqread(mp, id, type, flags, &dqp);
-
- if (ip)
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- /* If we are asked to find next active id, keep looking */
- if (error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) {
- error = xfs_dq_get_next_id(mp, type, &id);
- if (!error)
- goto restart;
- }
-
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
- if (ip) {
- /*
- * A dquot could be attached to this inode by now, since
- * we had dropped the ilock.
- */
- if (xfs_this_quota_on(mp, type)) {
- struct xfs_dquot *dqp1;
-
- dqp1 = xfs_inode_dquot(ip, type);
- if (dqp1) {
- xfs_qm_dqdestroy(dqp);
- dqp = dqp1;
- xfs_dqlock(dqp);
- goto dqret;
- }
- } else {
- /* inode stays locked on return */
+ /*
+ * A dquot could be attached to this inode by now, since we had
+ * dropped the ilock.
+ */
+ if (xfs_this_quota_on(mp, type)) {
+ struct xfs_dquot *dqp1;
+
+ dqp1 = xfs_inode_dquot(ip, type);
+ if (dqp1) {
xfs_qm_dqdestroy(dqp);
- return -ESRCH;
+ dqp = dqp1;
+ xfs_dqlock(dqp);
+ goto dqret;
}
+ } else {
+ /* inode stays locked on return */
+ xfs_qm_dqdestroy(dqp);
+ return -ESRCH;
}
- mutex_lock(&qi->qi_tree_lock);
- error = radix_tree_insert(tree, id, dqp);
- if (unlikely(error)) {
- WARN_ON(error != -EEXIST);
-
+ error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp);
+ if (error) {
/*
* Duplicate found. Just throw away the new dquot and start
* over.
*/
- mutex_unlock(&qi->qi_tree_lock);
- trace_xfs_dqget_dup(dqp);
xfs_qm_dqdestroy(dqp);
XFS_STATS_INC(mp, xs_qm_dquot_dups);
goto restart;
}
- /*
- * We return a locked dquot to the caller, with a reference taken
- */
- xfs_dqlock(dqp);
- dqp->q_nrefs = 1;
+dqret:
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ trace_xfs_dqget_miss(dqp);
+ *O_dqpp = dqp;
+ return 0;
+}
- qi->qi_dquots++;
- mutex_unlock(&qi->qi_tree_lock);
+/*
+ * Starting at @id and progressing upwards, look for an initialized incore
+ * dquot, lock it, and return it.
+ */
+int
+xfs_qm_dqget_next(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ struct xfs_dquot **dqpp)
+{
+ struct xfs_dquot *dqp;
+ int error = 0;
- /* If we are asked to find next active id, keep looking */
- if (flags & XFS_QMOPT_DQNEXT) {
- if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
- xfs_qm_dqput(dqp);
- error = xfs_dq_get_next_id(mp, type, &id);
- if (error)
- return error;
- goto restart;
+ *dqpp = NULL;
+ for (; !error; error = xfs_dq_get_next_id(mp, type, &id)) {
+ error = xfs_qm_dqget(mp, id, type, false, &dqp);
+ if (error == -ENOENT)
+ continue;
+ else if (error != 0)
+ break;
+
+ if (!XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+ *dqpp = dqp;
+ return 0;
}
+
+ xfs_qm_dqput(dqp);
}
- dqret:
- ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
- trace_xfs_dqget_miss(dqp);
- *O_dqpp = dqp;
- return 0;
+ return error;
}
/*
@@ -913,9 +1062,9 @@ xfs_qm_dqflush_done(
* since it's cheaper, and then we recheck while
* holding the lock before removing the dquot from the AIL.
*/
- if ((lip->li_flags & XFS_LI_IN_AIL) &&
+ if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) &&
((lip->li_lsn == qip->qli_flush_lsn) ||
- (lip->li_flags & XFS_LI_FAILED))) {
+ test_bit(XFS_LI_FAILED, &lip->li_flags))) {
/* xfs_trans_ail_delete() drops the AIL lock. */
spin_lock(&ailp->ail_lock);
@@ -926,8 +1075,7 @@ xfs_qm_dqflush_done(
* Clear the failed state since we are about to drop the
* flush lock
*/
- if (lip->li_flags & XFS_LI_FAILED)
- xfs_clear_li_failed(lip);
+ xfs_clear_li_failed(lip);
spin_unlock(&ailp->ail_lock);
}
}
@@ -953,6 +1101,7 @@ xfs_qm_dqflush(
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_buf *bp;
+ struct xfs_dqblk *dqb;
struct xfs_disk_dquot *ddqp;
xfs_failaddr_t fa;
int error;
@@ -996,12 +1145,13 @@ xfs_qm_dqflush(
/*
* Calculate the location of the dquot inside the buffer.
*/
- ddqp = bp->b_addr + dqp->q_bufoffset;
+ dqb = bp->b_addr + dqp->q_bufoffset;
+ ddqp = &dqb->dd_diskdq;
/*
- * A simple sanity check in case we got a corrupted dquot..
+ * A simple sanity check in case we got a corrupted dquot.
*/
- fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 0);
+ fa = xfs_dqblk_verify(mp, dqb, be32_to_cpu(ddqp->d_id), 0);
if (fa) {
xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
be32_to_cpu(ddqp->d_id), fa);
@@ -1032,8 +1182,6 @@ xfs_qm_dqflush(
* of a dquot without an up-to-date CRC getting to disk.
*/
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp;
-
dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn);
xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
@@ -1119,3 +1267,35 @@ xfs_qm_exit(void)
kmem_zone_destroy(xfs_qm_dqtrxzone);
kmem_zone_destroy(xfs_qm_dqzone);
}
+
+/*
+ * Iterate every dquot of a particular type. The caller must ensure that the
+ * particular quota type is active. iter_fn can return negative error codes,
+ * or XFS_BTREE_QUERY_RANGE_ABORT to indicate that it wants to stop iterating.
+ */
+int
+xfs_qm_dqiterate(
+ struct xfs_mount *mp,
+ uint dqtype,
+ xfs_qm_dqiterate_fn iter_fn,
+ void *priv)
+{
+ struct xfs_dquot *dq;
+ xfs_dqid_t id = 0;
+ int error;
+
+ do {
+ error = xfs_qm_dqget_next(mp, id, dqtype, &dq);
+ if (error == -ENOENT)
+ return 0;
+ if (error)
+ return error;
+
+ error = iter_fn(dq, dqtype, priv);
+ id = be32_to_cpu(dq->q_core.d_id);
+ xfs_qm_dqput(dq);
+ id++;
+ } while (error == 0 && id != 0);
+
+ return error;
+}
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 2f536f33cd26..bdd6bd921528 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -160,8 +160,6 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
-extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
- uint, struct xfs_dquot **);
extern void xfs_qm_dqdestroy(xfs_dquot_t *);
extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
@@ -169,8 +167,19 @@ extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
xfs_disk_dquot_t *);
extern void xfs_qm_adjust_dqlimits(struct xfs_mount *,
struct xfs_dquot *);
-extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
- xfs_dqid_t, uint, uint, xfs_dquot_t **);
+extern xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip,
+ uint type);
+extern int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id,
+ uint type, bool can_alloc,
+ struct xfs_dquot **dqpp);
+extern int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type,
+ bool can_alloc,
+ struct xfs_dquot **dqpp);
+extern int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
+ uint type, struct xfs_dquot **dqpp);
+extern int xfs_qm_dqget_uncached(struct xfs_mount *mp,
+ xfs_dqid_t id, uint type,
+ struct xfs_dquot **dqpp);
extern void xfs_qm_dqput(xfs_dquot_t *);
extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
@@ -185,4 +194,9 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
return dqp;
}
+typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, uint dqtype,
+ void *priv);
+int xfs_qm_dqiterate(struct xfs_mount *mp, uint dqtype,
+ xfs_qm_dqiterate_fn iter_fn, void *priv);
+
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 4b331e354da7..8eb7415474d6 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -173,7 +173,7 @@ xfs_qm_dquot_logitem_push(
* The buffer containing this item failed to be written back
* previously. Resubmit the buffer for IO
*/
- if (lip->li_flags & XFS_LI_FAILED) {
+ if (test_bit(XFS_LI_FAILED, &lip->li_flags)) {
if (!xfs_buf_trylock(bp))
return XFS_ITEM_LOCKED;
@@ -209,10 +209,7 @@ xfs_qm_dquot_logitem_push(
spin_unlock(&lip->li_ailp->ail_lock);
error = xfs_qm_dqflush(dqp, &bp);
- if (error) {
- xfs_warn(dqp->q_mount, "%s: push error %d on dqp "PTR_FMT,
- __func__, error, dqp);
- } else {
+ if (!error) {
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index a63f5083f497..7975634cb8fe 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -61,6 +61,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_LOG_BAD_CRC,
XFS_RANDOM_LOG_ITEM_PIN,
XFS_RANDOM_BUF_LRU_REF,
+ XFS_RANDOM_FORCE_SCRUB_REPAIR,
};
struct xfs_errortag_attr {
@@ -167,6 +168,7 @@ XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES);
XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
+XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -201,6 +203,7 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
XFS_ERRORTAG_ATTR_LIST(log_item_pin),
XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
+ XFS_ERRORTAG_ATTR_LIST(force_repair),
NULL,
};
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index b5b1e567b9f4..a889b550979a 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -168,7 +168,7 @@ STATIC void
xfs_efi_item_unlock(
struct xfs_log_item *lip)
{
- if (lip->li_flags & XFS_LI_ABORTED)
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
xfs_efi_release(EFI_ITEM(lip));
}
@@ -402,7 +402,7 @@ xfs_efd_item_unlock(
{
struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
- if (lip->li_flags & XFS_LI_ABORTED) {
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
xfs_efi_release(efdp->efd_efip);
xfs_efd_item_free(efdp);
}
@@ -542,7 +542,7 @@ xfs_efi_recover(
for (i = 0; i < efip->efi_format.efi_nextents; i++) {
extp = &efip->efi_format.efi_extents[i];
error = xfs_trans_free_extent(tp, efdp, extp->ext_start,
- extp->ext_len, &oinfo);
+ extp->ext_len, &oinfo, false);
if (error)
goto abort_error;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e70fb8ccecea..bed07dfbb85e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -312,7 +312,7 @@ restart:
if (error <= 0)
return error;
- error = xfs_break_layouts(inode, iolock);
+ error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
if (error)
return error;
@@ -414,6 +414,12 @@ xfs_dio_write_end_io(
if (size <= 0)
return size;
+ /*
+ * Capture amount written on completion as we can't reliably account
+ * for it on submission.
+ */
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);
+
if (flags & IOMAP_DIO_COW) {
error = xfs_reflink_end_cow(ip, offset, size);
if (error)
@@ -599,7 +605,16 @@ xfs_file_dax_write(
}
out:
xfs_iunlock(ip, iolock);
- return error ? error : ret;
+ if (error)
+ return error;
+
+ if (ret > 0) {
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
+
+ /* Handle various SYNC-type writes */
+ ret = generic_write_sync(iocb, ret);
+ }
+ return ret;
}
STATIC ssize_t
@@ -669,6 +684,12 @@ write_retry:
out:
if (iolock)
xfs_iunlock(ip, iolock);
+
+ if (ret > 0) {
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
+ /* Handle various SYNC-type writes */
+ ret = generic_write_sync(iocb, ret);
+ }
return ret;
}
@@ -693,8 +714,9 @@ xfs_file_write_iter(
return -EIO;
if (IS_DAX(inode))
- ret = xfs_file_dax_write(iocb, from);
- else if (iocb->ki_flags & IOCB_DIRECT) {
+ return xfs_file_dax_write(iocb, from);
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
* write *only* in the case that we're doing a reflink
@@ -702,20 +724,74 @@ xfs_file_write_iter(
* allow an operation to fall back to buffered mode.
*/
ret = xfs_file_dio_aio_write(iocb, from);
- if (ret == -EREMCHG)
- goto buffered;
- } else {
-buffered:
- ret = xfs_file_buffered_aio_write(iocb, from);
+ if (ret != -EREMCHG)
+ return ret;
}
- if (ret > 0) {
- XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
+ return xfs_file_buffered_aio_write(iocb, from);
+}
- /* Handle various SYNC-type writes */
- ret = generic_write_sync(iocb, ret);
- }
- return ret;
+/*
+ * Wait step used by xfs_break_dax_layouts(): drop the MMAPLOCK, call
+ * schedule(), then retake the MMAPLOCK. *did_unlock is set to true so the
+ * caller can tell that the lock was cycled; xfs_break_layouts() passes its
+ * retry flag through here, so cycling the lock makes it redo its checks.
+ */
+static void
+xfs_wait_dax_page(
+ struct inode *inode,
+ bool *did_unlock)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ /* Flag the unlock before actually dropping the lock. */
+ *did_unlock = true;
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+ schedule();
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+}
+
+/*
+ * If dax_layout_busy_page() reports a busy page in this inode's mapping,
+ * sleep interruptibly until that page's refcount reads 1, running
+ * xfs_wait_dax_page() (which cycles the MMAPLOCK) as the wait step.
+ *
+ * The caller must hold XFS_MMAPLOCK_EXCL (asserted below). @iolock is not
+ * used by this function. *did_unlock is only written by the wait step.
+ *
+ * Returns 0 immediately if no busy page is found, otherwise the result of
+ * the wait.
+ * NOTE(review): a nonzero result presumably means the sleep was interrupted
+ * by a signal -- confirm against ___wait_var_event().
+ */
+static int
+xfs_break_dax_layouts(
+ struct inode *inode,
+ uint iolock,
+ bool *did_unlock)
+{
+ struct page *page;
+
+ ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
+
+ page = dax_layout_busy_page(inode->i_mapping);
+ if (!page)
+ return 0;
+
+ /* Wait condition is "refcount == 1"; the last argument is the wait step. */
+ return ___wait_var_event(&page->_refcount,
+ atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
+ 0, 0, xfs_wait_dax_page(inode, did_unlock));
+}
+
+/*
+ * Break layouts on @inode for the given @reason, repeating until a pass
+ * completes without a helper setting the retry flag.
+ *
+ * BREAK_UNMAP first waits out busy DAX pages and then, if that neither
+ * failed nor asked for a retry, goes on to break leased layouts as well.
+ * BREAK_WRITE only breaks leased layouts. Any other @reason warns once and
+ * returns -EINVAL.
+ *
+ * The caller must hold the IOLOCK (asserted below). @iolock is handed by
+ * pointer to xfs_break_leased_layouts().
+ *
+ * Returns 0 on success or the first error reported by a helper.
+ */
+int
+xfs_break_layouts(
+ struct inode *inode,
+ uint *iolock,
+ enum layout_break_reason reason)
+{
+ bool retry;
+ int error;
+
+ ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
+
+ do {
+ retry = false;
+ switch (reason) {
+ case BREAK_UNMAP:
+ error = xfs_break_dax_layouts(inode, *iolock, &retry);
+ /* On error bail out; on retry redo the DAX check first. */
+ if (error || retry)
+ break;
+ /* fall through */
+ case BREAK_WRITE:
+ error = xfs_break_leased_layouts(inode, iolock, &retry);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ error = -EINVAL;
+ }
+ } while (error == 0 && retry);
+
+ return error;
}
#define XFS_FALLOC_FL_SUPPORTED \
@@ -734,7 +810,7 @@ xfs_file_fallocate(
struct xfs_inode *ip = XFS_I(inode);
long error;
enum xfs_prealloc_flags flags = 0;
- uint iolock = XFS_IOLOCK_EXCL;
+ uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
loff_t new_size = 0;
bool do_file_insert = false;
@@ -744,13 +820,10 @@ xfs_file_fallocate(
return -EOPNOTSUPP;
xfs_ilock(ip, iolock);
- error = xfs_break_layouts(inode, &iolock);
+ error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
goto out_unlock;
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
- iolock |= XFS_MMAPLOCK_EXCL;
-
if (mode & FALLOC_FL_PUNCH_HOLE) {
error = xfs_free_file_space(ip, offset, len);
if (error)
@@ -1007,7 +1080,7 @@ xfs_file_llseek(
* page_lock (MM)
* i_lock (XFS - extent map serialisation)
*/
-static int
+static vm_fault_t
__xfs_filemap_fault(
struct vm_fault *vmf,
enum page_entry_size pe_size,
@@ -1015,7 +1088,7 @@ __xfs_filemap_fault(
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
- int ret;
+ vm_fault_t ret;
trace_xfs_filemap_fault(ip, pe_size, write_fault);
@@ -1044,7 +1117,7 @@ __xfs_filemap_fault(
return ret;
}
-static int
+static vm_fault_t
xfs_filemap_fault(
struct vm_fault *vmf)
{
@@ -1054,7 +1127,7 @@ xfs_filemap_fault(
(vmf->flags & FAULT_FLAG_WRITE));
}
-static int
+static vm_fault_t
xfs_filemap_huge_fault(
struct vm_fault *vmf,
enum page_entry_size pe_size)
@@ -1067,7 +1140,7 @@ xfs_filemap_huge_fault(
(vmf->flags & FAULT_FLAG_WRITE));
}
-static int
+static vm_fault_t
xfs_filemap_page_mkwrite(
struct vm_fault *vmf)
{
@@ -1079,7 +1152,7 @@ xfs_filemap_page_mkwrite(
* on write faults. In reality, it needs to serialise against truncate and
* prepare memory for writing, so handle it as a standard write fault.
*/
-static int
+static vm_fault_t
xfs_filemap_pfn_mkwrite(
struct vm_fault *vmf)
{
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 43cfc07996a4..0299febece9c 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -465,10 +465,9 @@ xfs_getfsmap_rtdev_rtbitmap_helper(
struct xfs_rmap_irec irec;
xfs_daddr_t rec_daddr;
- rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock);
-
- irec.rm_startblock = rec->ar_startblock;
- irec.rm_blockcount = rec->ar_blockcount;
+ irec.rm_startblock = rec->ar_startext * mp->m_sb.sb_rextsize;
+ rec_daddr = XFS_FSB_TO_BB(mp, irec.rm_startblock);
+ irec.rm_blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize;
irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */
irec.rm_offset = 0;
irec.rm_flags = 0;
@@ -534,8 +533,11 @@ xfs_getfsmap_rtdev_rtbitmap_query(
xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
- alow.ar_startblock = info->low.rm_startblock;
- ahigh.ar_startblock = info->high.rm_startblock;
+ alow.ar_startext = info->low.rm_startblock;
+ ahigh.ar_startext = info->high.rm_startblock;
+ do_div(alow.ar_startext, tp->t_mountp->m_sb.sb_rextsize);
+ if (do_div(ahigh.ar_startext, tp->t_mountp->m_sb.sb_rextsize))
+ ahigh.ar_startext++;
error = xfs_rtalloc_query_range(tp, &alow, &ahigh,
xfs_getfsmap_rtdev_rtbitmap_helper, info);
if (error)
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 523792768080..bc7ef18da243 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,85 +24,42 @@
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
-#include "xfs_da_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_inode.h"
#include "xfs_trans.h"
-#include "xfs_inode_item.h"
#include "xfs_error.h"
#include "xfs_btree.h"
-#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
-#include "xfs_rmap_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_fsops.h"
-#include "xfs_itable.h"
#include "xfs_trans_space.h"
#include "xfs_rtalloc.h"
#include "xfs_trace.h"
#include "xfs_log.h"
-#include "xfs_filestream.h"
-#include "xfs_rmap.h"
+#include "xfs_ag.h"
#include "xfs_ag_resv.h"
/*
- * File system operations
+ * growfs operations
*/
-
-static struct xfs_buf *
-xfs_growfs_get_hdr_buf(
- struct xfs_mount *mp,
- xfs_daddr_t blkno,
- size_t numblks,
- int flags,
- const struct xfs_buf_ops *ops)
-{
- struct xfs_buf *bp;
-
- bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
- if (!bp)
- return NULL;
-
- xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- bp->b_bn = blkno;
- bp->b_maps[0].bm_bn = blkno;
- bp->b_ops = ops;
-
- return bp;
-}
-
static int
xfs_growfs_data_private(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_growfs_data_t *in) /* growfs data input struct */
{
- xfs_agf_t *agf;
- struct xfs_agfl *agfl;
- xfs_agi_t *agi;
- xfs_agnumber_t agno;
- xfs_extlen_t agsize;
- xfs_extlen_t tmpsize;
- xfs_alloc_rec_t *arec;
xfs_buf_t *bp;
- int bucket;
- int dpct;
- int error, saved_error = 0;
+ int error;
xfs_agnumber_t nagcount;
xfs_agnumber_t nagimax = 0;
xfs_rfsblock_t nb, nb_mod;
xfs_rfsblock_t new;
- xfs_rfsblock_t nfree;
xfs_agnumber_t oagcount;
- int pct;
xfs_trans_t *tp;
+ LIST_HEAD (buffer_list);
+ struct aghdr_init_data id = {};
nb = in->newblocks;
- pct = in->imaxpct;
- if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
+ if (nb < mp->m_sb.sb_dblocks)
return -EINVAL;
if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
return error;
- dpct = pct - mp->m_sb.sb_imax_pct;
error = xfs_buf_read_uncached(mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
@@ -135,376 +92,45 @@ xfs_growfs_data_private(
return error;
/*
- * Write new AG headers to disk. Non-transactional, but written
- * synchronously so they are completed prior to the growfs transaction
- * being logged.
+ * Write new AG headers to disk. Non-transactional, but need to be
+ * written and completed prior to the growfs transaction being logged.
+ * To do this, we use a delayed write buffer list and wait for
+ * submission and IO completion of the list as a whole. This allows the
+ * IO subsystem to merge all the AG headers in a single AG into a single
+ * IO and hide most of the latency of the IO from us.
+ *
+ * This also means that if we get an error whilst building the buffer
+ * list to write, we can cancel the entire list without having written
+ * anything.
*/
- nfree = 0;
- for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
- __be32 *agfl_bno;
-
- /*
- * AG freespace header block
- */
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0,
- &xfs_agf_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- agf = XFS_BUF_TO_AGF(bp);
- agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
- agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
- agf->agf_seqno = cpu_to_be32(agno);
- if (agno == nagcount - 1)
- agsize =
- nb -
- (agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
+ INIT_LIST_HEAD(&id.buffer_list);
+ for (id.agno = nagcount - 1;
+ id.agno >= oagcount;
+ id.agno--, new -= id.agsize) {
+
+ if (id.agno == nagcount - 1)
+ id.agsize = nb -
+ (id.agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
else
- agsize = mp->m_sb.sb_agblocks;
- agf->agf_length = cpu_to_be32(agsize);
- agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
- agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
- agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
- agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
- if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
- agf->agf_roots[XFS_BTNUM_RMAPi] =
- cpu_to_be32(XFS_RMAP_BLOCK(mp));
- agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
- agf->agf_rmap_blocks = cpu_to_be32(1);
- }
-
- agf->agf_flfirst = cpu_to_be32(1);
- agf->agf_fllast = 0;
- agf->agf_flcount = 0;
- tmpsize = agsize - mp->m_ag_prealloc_blocks;
- agf->agf_freeblks = cpu_to_be32(tmpsize);
- agf->agf_longest = cpu_to_be32(tmpsize);
- if (xfs_sb_version_hascrc(&mp->m_sb))
- uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
- agf->agf_refcount_root = cpu_to_be32(
- xfs_refc_block(mp));
- agf->agf_refcount_level = cpu_to_be32(1);
- agf->agf_refcount_blocks = cpu_to_be32(1);
- }
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
-
- /*
- * AG freelist header block
- */
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0,
- &xfs_agfl_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- agfl = XFS_BUF_TO_AGFL(bp);
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC);
- agfl->agfl_seqno = cpu_to_be32(agno);
- uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid);
- }
-
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
- for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++)
- agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
-
- /*
- * AG inode header block
- */
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0,
- &xfs_agi_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- agi = XFS_BUF_TO_AGI(bp);
- agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
- agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
- agi->agi_seqno = cpu_to_be32(agno);
- agi->agi_length = cpu_to_be32(agsize);
- agi->agi_count = 0;
- agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp));
- agi->agi_level = cpu_to_be32(1);
- agi->agi_freecount = 0;
- agi->agi_newino = cpu_to_be32(NULLAGINO);
- agi->agi_dirino = cpu_to_be32(NULLAGINO);
- if (xfs_sb_version_hascrc(&mp->m_sb))
- uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid);
- if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
- agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
- agi->agi_free_level = cpu_to_be32(1);
- }
- for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
- agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
+ id.agsize = mp->m_sb.sb_agblocks;
- /*
- * BNO btree root block
- */
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0,
- &xfs_allocbt_buf_ops);
-
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, agno, 0);
-
- arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
- arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
- arec->ar_blockcount = cpu_to_be32(
- agsize - be32_to_cpu(arec->ar_startblock));
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
-
- /*
- * CNT btree root block
- */
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0,
- &xfs_allocbt_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, agno, 0);
-
- arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
- arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
- arec->ar_blockcount = cpu_to_be32(
- agsize - be32_to_cpu(arec->ar_startblock));
- nfree += be32_to_cpu(arec->ar_blockcount);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
-
- /* RMAP btree root block */
- if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
- struct xfs_rmap_rec *rrec;
- struct xfs_btree_block *block;
-
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_RMAP_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0,
- &xfs_rmapbt_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 0,
- agno, 0);
- block = XFS_BUF_TO_BLOCK(bp);
-
-
- /*
- * mark the AG header regions as static metadata The BNO
- * btree block is the first block after the headers, so
- * it's location defines the size of region the static
- * metadata consumes.
- *
- * Note: unlike mkfs, we never have to account for log
- * space when growing the data regions
- */
- rrec = XFS_RMAP_REC_ADDR(block, 1);
- rrec->rm_startblock = 0;
- rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp));
- rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS);
- rrec->rm_offset = 0;
- be16_add_cpu(&block->bb_numrecs, 1);
-
- /* account freespace btree root blocks */
- rrec = XFS_RMAP_REC_ADDR(block, 2);
- rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp));
- rrec->rm_blockcount = cpu_to_be32(2);
- rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
- rrec->rm_offset = 0;
- be16_add_cpu(&block->bb_numrecs, 1);
-
- /* account inode btree root blocks */
- rrec = XFS_RMAP_REC_ADDR(block, 3);
- rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp));
- rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) -
- XFS_IBT_BLOCK(mp));
- rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT);
- rrec->rm_offset = 0;
- be16_add_cpu(&block->bb_numrecs, 1);
-
- /* account for rmap btree root */
- rrec = XFS_RMAP_REC_ADDR(block, 4);
- rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp));
- rrec->rm_blockcount = cpu_to_be32(1);
- rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG);
- rrec->rm_offset = 0;
- be16_add_cpu(&block->bb_numrecs, 1);
-
- /* account for refc btree root */
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
- rrec = XFS_RMAP_REC_ADDR(block, 5);
- rrec->rm_startblock = cpu_to_be32(
- xfs_refc_block(mp));
- rrec->rm_blockcount = cpu_to_be32(1);
- rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
- rrec->rm_offset = 0;
- be16_add_cpu(&block->bb_numrecs, 1);
- }
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
- }
-
- /*
- * INO btree root block
- */
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0,
- &xfs_inobt_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
-
- /*
- * FINO btree root block
- */
- if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0,
- &xfs_inobt_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO,
- 0, 0, agno, 0);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
- }
-
- /*
- * refcount btree root block
- */
- if (xfs_sb_version_hasreflink(&mp->m_sb)) {
- bp = xfs_growfs_get_hdr_buf(mp,
- XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0,
- &xfs_refcountbt_buf_ops);
- if (!bp) {
- error = -ENOMEM;
- goto error0;
- }
-
- xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC,
- 0, 0, agno, 0);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error)
- goto error0;
- }
- }
- xfs_trans_agblocks_delta(tp, nfree);
- /*
- * There are new blocks in the old last a.g.
- */
- if (new) {
- struct xfs_owner_info oinfo;
-
- /*
- * Change the agi length.
- */
- error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
- if (error) {
- goto error0;
- }
- ASSERT(bp);
- agi = XFS_BUF_TO_AGI(bp);
- be32_add_cpu(&agi->agi_length, new);
- ASSERT(nagcount == oagcount ||
- be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks);
- xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH);
- /*
- * Change agf length.
- */
- error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp);
+ error = xfs_ag_init_headers(mp, &id);
if (error) {
- goto error0;
+ xfs_buf_delwri_cancel(&id.buffer_list);
+ goto out_trans_cancel;
}
- ASSERT(bp);
- agf = XFS_BUF_TO_AGF(bp);
- be32_add_cpu(&agf->agf_length, new);
- ASSERT(be32_to_cpu(agf->agf_length) ==
- be32_to_cpu(agi->agi_length));
+ }
+ error = xfs_buf_delwri_submit(&id.buffer_list);
+ if (error)
+ goto out_trans_cancel;
- xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH);
+ xfs_trans_agblocks_delta(tp, id.nfree);
- /*
- * Free the new space.
- *
- * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that
- * this doesn't actually exist in the rmap btree.
- */
- xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL);
- error = xfs_rmap_free(tp, bp, agno,
- be32_to_cpu(agf->agf_length) - new,
- new, &oinfo);
- if (error)
- goto error0;
- error = xfs_free_extent(tp,
- XFS_AGB_TO_FSB(mp, agno,
- be32_to_cpu(agf->agf_length) - new),
- new, &oinfo, XFS_AG_RESV_NONE);
+ /* If there are new blocks in the old last AG, extend it. */
+ if (new) {
+ error = xfs_ag_extend_space(mp, tp, &id, new);
if (error)
- goto error0;
+ goto out_trans_cancel;
}
/*
@@ -517,10 +143,8 @@ xfs_growfs_data_private(
if (nb > mp->m_sb.sb_dblocks)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
nb - mp->m_sb.sb_dblocks);
- if (nfree)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
- if (dpct)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+ if (id.nfree)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
xfs_trans_set_sync(tp);
error = xfs_trans_commit(tp);
if (error)
@@ -529,12 +153,6 @@ xfs_growfs_data_private(
/* New allocation groups fully initialized, so update mount struct */
if (nagimax)
mp->m_maxagi = nagimax;
- if (mp->m_sb.sb_imax_pct) {
- uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
- do_div(icount, 100);
- mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
- } else
- mp->m_maxicount = 0;
xfs_set_low_space_thresholds(mp);
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
@@ -545,73 +163,24 @@ xfs_growfs_data_private(
if (new) {
struct xfs_perag *pag;
- pag = xfs_perag_get(mp, agno);
+ pag = xfs_perag_get(mp, id.agno);
error = xfs_ag_resv_free(pag);
xfs_perag_put(pag);
if (error)
- goto out;
+ return error;
}
- /* Reserve AG metadata blocks. */
+ /*
+ * Reserve AG metadata blocks. ENOSPC here does not mean there was a
+ * growfs failure, just that there still isn't space for new user data
+ * after the grow has been run.
+ */
error = xfs_fs_reserve_ag_blocks(mp);
- if (error && error != -ENOSPC)
- goto out;
-
- /* update secondary superblocks. */
- for (agno = 1; agno < nagcount; agno++) {
+ if (error == -ENOSPC)
error = 0;
- /*
- * new secondary superblocks need to be zeroed, not read from
- * disk as the contents of the new area we are growing into is
- * completely unknown.
- */
- if (agno < oagcount) {
- error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp,
- &xfs_sb_buf_ops);
- } else {
- bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
- XFS_FSS_TO_BB(mp, 1), 0);
- if (bp) {
- bp->b_ops = &xfs_sb_buf_ops;
- xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
- } else
- error = -ENOMEM;
- }
-
- /*
- * If we get an error reading or writing alternate superblocks,
- * continue. xfs_repair chooses the "best" superblock based
- * on most matches; if we break early, we'll leave more
- * superblocks un-updated than updated, and xfs_repair may
- * pick them over the properly-updated primary.
- */
- if (error) {
- xfs_warn(mp,
- "error %d reading secondary superblock for ag %d",
- error, agno);
- saved_error = error;
- continue;
- }
- xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
-
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- if (error) {
- xfs_warn(mp,
- "write error %d updating secondary superblock for ag %d",
- error, agno);
- saved_error = error;
- continue;
- }
- }
-
- out:
- return saved_error ? saved_error : error;
+ return error;
- error0:
+out_trans_cancel:
xfs_trans_cancel(tp);
return error;
}
@@ -638,25 +207,71 @@ xfs_growfs_log_private(
return -ENOSYS;
}
+/*
+ * Change the superblock sb_imax_pct field to @imaxpct in a transaction of
+ * its own, separate from any physical grow of the filesystem.
+ *
+ * Returns -EINVAL if @imaxpct is greater than 100, the error from
+ * xfs_trans_alloc() if the transaction cannot be set up, and otherwise the
+ * result of xfs_trans_commit().
+ */
+static int
+xfs_growfs_imaxpct(
+ struct xfs_mount *mp,
+ __u32 imaxpct)
+{
+ struct xfs_trans *tp;
+ int dpct;
+ int error;
+
+ if (imaxpct > 100)
+ return -EINVAL;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
+ XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ /*
+ * The change is handed to xfs_trans_mod_sb() as a delta from the current
+ * value. Keep it in a signed int: it is negative when the percentage is
+ * being lowered.
+ */
+ dpct = imaxpct - mp->m_sb.sb_imax_pct;
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+ xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp);
+}
+
/*
* protected versions of growfs function acquire and release locks on the mount
* point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG,
* XFS_IOC_FSGROWFSRT
*/
-
-
int
xfs_growfs_data(
- xfs_mount_t *mp,
- xfs_growfs_data_t *in)
+ struct xfs_mount *mp,
+ struct xfs_growfs_data *in)
{
- int error;
+ int error = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!mutex_trylock(&mp->m_growlock))
return -EWOULDBLOCK;
- error = xfs_growfs_data_private(mp, in);
+
+ /* update imaxpct separately to the physical grow of the filesystem */
+ if (in->imaxpct != mp->m_sb.sb_imax_pct) {
+ error = xfs_growfs_imaxpct(mp, in->imaxpct);
+ if (error)
+ goto out_error;
+ }
+
+ if (in->newblocks != mp->m_sb.sb_dblocks) {
+ error = xfs_growfs_data_private(mp, in);
+ if (error)
+ goto out_error;
+ }
+
+ /* Post growfs calculations needed to reflect new state in operations */
+ if (mp->m_sb.sb_imax_pct) {
+ uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
+ do_div(icount, 100);
+ mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
+ } else
+ mp->m_maxicount = 0;
+
+ /* Update secondary superblocks now the physical grow has completed */
+ error = xfs_update_secondary_sbs(mp);
+
+out_error:
/*
* Increment the generation unconditionally, the error could be from
* updating the secondary superblocks, in which case the new size
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 3e1cc3001bcb..fdde17a2333c 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -47,6 +47,7 @@ xfs_param_t xfs_params = {
struct xfs_globals xfs_globals = {
.log_recovery_delay = 0, /* no delay by default */
+ .mount_delay = 0, /* no delay by default */
#ifdef XFS_ASSERT_FATAL
.bug_on_assert = true, /* assert failures BUG() */
#else
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 9a18f69f6e96..164350d91efc 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -107,7 +107,8 @@ xfs_inode_free_callback(
xfs_idestroy_fork(ip, XFS_COW_FORK);
if (ip->i_itemp) {
- ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
+ ASSERT(!test_bit(XFS_LI_IN_AIL,
+ &ip->i_itemp->ili_item.li_flags));
xfs_inode_item_destroy(ip);
ip->i_itemp = NULL;
}
@@ -309,6 +310,46 @@ xfs_reinit_inode(
}
/*
+ * If we are allocating a new inode, then check what was returned is
+ * actually a free, empty inode. If we are not allocating an inode,
+ * then check we didn't find a free inode.
+ *
+ * Returns:
+ * 0 if the inode free state matches the lookup context
+ * -ENOENT if the inode is free and we are not allocating
+ * -EFSCORRUPTED if there is any state mismatch at all
+ */
+static int
+xfs_iget_check_free_state(
+	struct xfs_inode	*ip,
+	int			flags)
+{
+	/*
+	 * Plain lookup (not allocating): a zero mode means the inode is
+	 * free, so tell the caller it does not exist.
+	 */
+	if (!(flags & XFS_IGET_CREATE))
+		return VFS_I(ip)->i_mode == 0 ? -ENOENT : 0;
+
+	/* Allocating: what we were handed must still be marked free ... */
+	if (VFS_I(ip)->i_mode != 0) {
+		xfs_warn(ip->i_mount,
+"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
+			ip->i_ino, VFS_I(ip)->i_mode);
+		return -EFSCORRUPTED;
+	}
+
+	/* ... and must not have any blocks accounted to it. */
+	if (ip->i_d.di_nblocks != 0) {
+		xfs_warn(ip->i_mount,
+"Corruption detected! Free inode 0x%llx has blocks allocated!",
+			ip->i_ino);
+		return -EFSCORRUPTED;
+	}
+
+	return 0;
+}
+
+/*
* Check the validity of the inode we just found it the cache
*/
static int
@@ -357,12 +398,12 @@ xfs_iget_cache_hit(
}
/*
- * If lookup is racing with unlink return an error immediately.
+ * Check the inode free state is valid. This also detects lookup
+ * racing with unlinks.
*/
- if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) {
- error = -ENOENT;
+ error = xfs_iget_check_free_state(ip, flags);
+ if (error)
goto out_error;
- }
/*
* If IRECLAIMABLE is set, we've torn down the VFS inode already.
@@ -485,29 +526,12 @@ xfs_iget_cache_miss(
/*
- * If we are allocating a new inode, then check what was returned is
- * actually a free, empty inode. If we are not allocating an inode,
- * the check we didn't find a free inode.
+ * Check the inode free state is valid. This also detects lookup
+ * racing with unlinks.
*/
- if (flags & XFS_IGET_CREATE) {
- if (VFS_I(ip)->i_mode != 0) {
- xfs_warn(mp,
-"Corruption detected! Free inode 0x%llx not marked free on disk",
- ino);
- error = -EFSCORRUPTED;
- goto out_destroy;
- }
- if (ip->i_d.di_nblocks != 0) {
- xfs_warn(mp,
-"Corruption detected! Free inode 0x%llx has blocks allocated!",
- ino);
- error = -EFSCORRUPTED;
- goto out_destroy;
- }
- } else if (VFS_I(ip)->i_mode == 0) {
- error = -ENOENT;
+ error = xfs_iget_check_free_state(ip, flags);
+ if (error)
goto out_destroy;
- }
/*
* Preload the radix tree so we can insert safely under the
@@ -1802,3 +1826,21 @@ xfs_inode_clear_cowblocks_tag(
return __xfs_inode_clear_blocks_tag(ip,
trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
}
+
+/*
+ * Disable post-EOF and CoW block auto-reclamation by cancelling the mount's
+ * m_eofblocks_work and m_cowblocks_work delayed work items.
+ */
+void
+xfs_icache_disable_reclaim(
+ struct xfs_mount *mp)
+{
+ cancel_delayed_work_sync(&mp->m_eofblocks_work);
+ cancel_delayed_work_sync(&mp->m_cowblocks_work);
+}
+
+/*
+ * Enable post-EOF and CoW block auto-reclamation by calling
+ * xfs_queue_eofblocks() and xfs_queue_cowblocks() for this mount.
+ */
+void
+xfs_icache_enable_reclaim(
+ struct xfs_mount *mp)
+{
+ xfs_queue_eofblocks(mp);
+ xfs_queue_cowblocks(mp);
+}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index d4a77588eca1..d69a0f5a6a73 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -131,4 +131,7 @@ xfs_fs_eofblocks_from_user(
int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_ino_t ino, bool *inuse);
+void xfs_icache_disable_reclaim(struct xfs_mount *mp);
+void xfs_icache_enable_reclaim(struct xfs_mount *mp);
+
#endif
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 865ad1373e5e..5da9599156ed 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -91,7 +91,7 @@ xfs_icreate_item_unlock(
{
struct xfs_icreate_item *icp = ICR_ITEM(lip);
- if (icp->ic_item.li_flags & XFS_LI_ABORTED)
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
kmem_zone_free(xfs_icreate_zone, icp);
return;
}
@@ -184,5 +184,5 @@ xfs_icreate_log(
xfs_trans_add_item(tp, &icp->ic_item);
tp->t_flags |= XFS_TRANS_DIRTY;
- icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags);
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2b70c8b4cee2..05207a64dd53 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -498,7 +498,7 @@ again:
if (!try_lock) {
for (j = (i - 1); j >= 0 && !try_lock; j--) {
lp = (xfs_log_item_t *)ips[j]->i_itemp;
- if (lp && (lp->li_flags & XFS_LI_IN_AIL))
+ if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
try_lock++;
}
}
@@ -598,7 +598,7 @@ xfs_lock_two_inodes(
* and try again.
*/
lp = (xfs_log_item_t *)ip0->i_itemp;
- if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+ if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
xfs_iunlock(ip0, ip0_mode);
if ((++attempts % 5) == 0)
@@ -791,6 +791,18 @@ xfs_ialloc(
ASSERT(*ialloc_context == NULL);
/*
+ * Protect against obviously corrupt allocation btree records. Later
+ * xfs_iget checks will catch re-allocation of other active in-memory
+ * and on-disk inodes. If we don't catch reallocating the parent inode
+ * here we will deadlock in xfs_iget() so we have to do these checks
+ * first.
+ */
+ if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
+ xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
+ return -EFSCORRUPTED;
+ }
+
+ /*
* Get the in-core inode with the lock held exclusively.
* This is because we're setting fields here we need
* to prevent others from looking at until we're done.
@@ -1196,6 +1208,7 @@ xfs_create(
unlock_dp_on_error = true;
xfs_defer_init(&dfops, &first_block);
+ tp->t_agfl_dfops = &dfops;
/*
* Reserve disk quota and the inode.
@@ -1411,11 +1424,11 @@ xfs_link(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- error = xfs_qm_dqattach(sip, 0);
+ error = xfs_qm_dqattach(sip);
if (error)
goto std_return;
- error = xfs_qm_dqattach(tdp, 0);
+ error = xfs_qm_dqattach(tdp);
if (error)
goto std_return;
@@ -1451,6 +1464,7 @@ xfs_link(
}
xfs_defer_init(&dfops, &first_block);
+ tp->t_agfl_dfops = &dfops;
/*
* Handle initial link state of O_TMPFILE inode
@@ -1534,11 +1548,12 @@ xfs_itruncate_clear_reflink_flags(
* dirty on error so that transactions can be easily aborted if possible.
*/
int
-xfs_itruncate_extents(
+xfs_itruncate_extents_flags(
struct xfs_trans **tpp,
struct xfs_inode *ip,
int whichfork,
- xfs_fsize_t new_size)
+ xfs_fsize_t new_size,
+ int flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp = *tpp;
@@ -1561,6 +1576,8 @@ xfs_itruncate_extents(
trace_xfs_itruncate_extents_start(ip, new_size);
+ flags |= xfs_bmapi_aflag(whichfork);
+
/*
* Since it is possible for space to become allocated beyond
* the end of the file (in a crash where the space is allocated
@@ -1579,12 +1596,9 @@ xfs_itruncate_extents(
unmap_len = last_block - first_unmap_block + 1;
while (!done) {
xfs_defer_init(&dfops, &first_block);
- error = xfs_bunmapi(tp, ip,
- first_unmap_block, unmap_len,
- xfs_bmapi_aflag(whichfork),
- XFS_ITRUNC_MAX_EXTENTS,
- &first_block, &dfops,
- &done);
+ error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags,
+ XFS_ITRUNC_MAX_EXTENTS, &first_block,
+ &dfops, &done);
if (error)
goto out_bmap_cancel;
@@ -1811,6 +1825,7 @@ xfs_inactive_ifree(
xfs_trans_ijoin(tp, ip, 0);
xfs_defer_init(&dfops, &first_block);
+ tp->t_agfl_dfops = &dfops;
error = xfs_ifree(tp, ip, &dfops);
if (error) {
/*
@@ -1911,7 +1926,7 @@ xfs_inactive(
ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
truncate = 1;
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return;
@@ -2574,11 +2589,11 @@ xfs_remove(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- error = xfs_qm_dqattach(dp, 0);
+ error = xfs_qm_dqattach(dp);
if (error)
goto std_return;
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
goto std_return;
@@ -2647,6 +2662,7 @@ xfs_remove(
goto out_trans_cancel;
xfs_defer_init(&dfops, &first_block);
+ tp->t_agfl_dfops = &dfops;
error = xfs_dir_removename(tp, dp, name, ip->i_ino,
&first_block, &dfops, resblks);
if (error) {
@@ -3014,6 +3030,7 @@ xfs_rename(
}
xfs_defer_init(&dfops, &first_block);
+ tp->t_agfl_dfops = &dfops;
/* RENAME_EXCHANGE is unique from here on. */
if (flags & RENAME_EXCHANGE)
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1eebc53df7d7..a91d9fb1effc 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -379,6 +379,20 @@ static inline void xfs_ifunlock(struct xfs_inode *ip)
>> XFS_ILOCK_SHIFT)
/*
+ * Layouts are broken in the BREAK_WRITE case to ensure that
+ * layout-holders do not collide with local writes. Additionally,
+ * layouts are broken in the BREAK_UNMAP case to make sure the
+ * layout-holder has a consistent view of the file's extent map. While
+ * BREAK_WRITE breaks can be satisfied by recalling FL_LAYOUT leases,
+ * BREAK_UNMAP breaks additionally require waiting for busy dax-pages to
+ * go idle.
+ */
+enum layout_break_reason {
+ BREAK_WRITE, /* can be satisfied by recalling FL_LAYOUT leases */
+ BREAK_UNMAP, /* additionally waits for busy dax-pages to go idle */
+};
+
+/*
* For multiple groups support: if S_ISGID bit is set in the parent
* directory, group of new file is set to that of the parent, and
* new subdirectory gets S_ISGID bit from parent.
@@ -415,8 +429,8 @@ uint xfs_ilock_attr_map_shared(struct xfs_inode *);
uint xfs_ip2xflags(struct xfs_inode *);
int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
struct xfs_defer_ops *);
-int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
- int, xfs_fsize_t);
+int xfs_itruncate_extents_flags(struct xfs_trans **,
+ struct xfs_inode *, int, xfs_fsize_t, int);
void xfs_iext_realloc(xfs_inode_t *, int, int);
void xfs_iunpin_wait(xfs_inode_t *);
@@ -433,6 +447,16 @@ int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
xfs_nlink_t, dev_t, prid_t,
struct xfs_inode **);
+/*
+ * Convenience wrapper for callers that need no extra flags: forwards all
+ * arguments to xfs_itruncate_extents_flags() with flags == 0 and returns its
+ * result.
+ */
+static inline int
+xfs_itruncate_extents(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fsize_t new_size)
+{
+ return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0);
+}
+
/* from xfs_file.c */
enum xfs_prealloc_flags {
XFS_PREALLOC_SET = (1 << 1),
@@ -443,6 +467,8 @@ enum xfs_prealloc_flags {
int xfs_update_prealloc_flags(struct xfs_inode *ip,
enum xfs_prealloc_flags flags);
+int xfs_break_layouts(struct inode *inode, uint *iolock,
+ enum layout_break_reason reason);
/* from xfs_iops.c */
extern void xfs_setup_inode(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 34b91b789702..3e5b8574818e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -518,7 +518,7 @@ xfs_inode_item_push(
* The buffer containing this item failed to be written back
* previously. Resubmit the buffer for IO.
*/
- if (lip->li_flags & XFS_LI_FAILED) {
+ if (test_bit(XFS_LI_FAILED, &lip->li_flags)) {
if (!xfs_buf_trylock(bp))
return XFS_ITEM_LOCKED;
@@ -729,14 +729,14 @@ xfs_iflush_done(
*/
iip = INODE_ITEM(blip);
if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
- (blip->li_flags & XFS_LI_FAILED))
+ test_bit(XFS_LI_FAILED, &blip->li_flags))
need_ail++;
}
/* make sure we capture the state of the initial inode. */
iip = INODE_ITEM(lip);
if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
- lip->li_flags & XFS_LI_FAILED)
+ test_bit(XFS_LI_FAILED, &lip->li_flags))
need_ail++;
/*
@@ -803,7 +803,7 @@ xfs_iflush_abort(
xfs_inode_log_item_t *iip = ip->i_itemp;
if (iip) {
- if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
+ if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) {
xfs_trans_ail_remove(&iip->ili_item,
stale ? SHUTDOWN_LOG_IO_ERROR :
SHUTDOWN_CORRUPT_INCORE);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 89fb1eb80aae..32b680522abd 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -39,7 +39,6 @@
#include "xfs_icache.h"
#include "xfs_symlink.h"
#include "xfs_trans.h"
-#include "xfs_pnfs.h"
#include "xfs_acl.h"
#include "xfs_btree.h"
#include <linux/fsmap.h>
@@ -614,7 +613,7 @@ xfs_ioc_space(
struct xfs_inode *ip = XFS_I(inode);
struct iattr iattr;
enum xfs_prealloc_flags flags = 0;
- uint iolock = XFS_IOLOCK_EXCL;
+ uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
int error;
/*
@@ -644,13 +643,10 @@ xfs_ioc_space(
return error;
xfs_ilock(ip, iolock);
- error = xfs_break_layouts(inode, &iolock);
+ error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
goto out_unlock;
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
- iolock |= XFS_MMAPLOCK_EXCL;
-
switch (bf->l_whence) {
case 0: /*SEEK_SET*/
break;
@@ -1103,7 +1099,8 @@ xfs_ioctl_setattr_dax_invalidate(
if (fa->fsx_xflags & FS_XFLAG_DAX) {
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
return -EINVAL;
- if (bdev_dax_supported(sb, sb->s_blocksize) < 0)
+ if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)),
+ sb->s_blocksize))
return -EINVAL;
}
@@ -1811,6 +1808,88 @@ xfs_ioc_swapext(
return error;
}
+/*
+ * FS_IOC_GETFSLABEL handler: copy the filesystem label (sb_fname) out to
+ * @user_label as a NUL-terminated string.
+ *
+ * Returns 0 on success or -EFAULT if the copy to userspace fails.
+ */
+static int
+xfs_ioc_getlabel(
+	struct xfs_mount	*mp,
+	char			__user *user_label)
+{
+	struct xfs_sb		*sbp = &mp->m_sb;
+	char			label[XFSLABEL_MAX + 1];
+
+	/* Paranoia */
+	BUILD_BUG_ON(sizeof(sbp->sb_fname) > FSLABEL_MAX);
+
+	/* Snapshot the label under m_sb_lock so we never see a partial update. */
+	spin_lock(&mp->m_sb_lock);
+	strncpy(label, sbp->sb_fname, sizeof(sbp->sb_fname));
+	spin_unlock(&mp->m_sb_lock);
+
+	/*
+	 * The on-disk label need not be NUL-terminated, so terminate our copy
+	 * and send the whole buffer: copying only sizeof(sbp->sb_fname) bytes
+	 * would drop the terminator and hand userspace an unterminated string
+	 * whenever the label fills the on-disk field.
+	 */
+	label[XFSLABEL_MAX] = '\0';
+	if (copy_to_user(user_label, label, sizeof(label)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * FS_IOC_SETFSLABEL handler: set the filesystem label (sb_fname) from the
+ * NUL-terminated string at @newlabel and push it out to disk.
+ *
+ * Returns -EPERM without CAP_SYS_ADMIN, -EFAULT if the label cannot be read
+ * from userspace, -EINVAL if it is longer than sb_fname, or the error from
+ * mnt_want_write_file(), xfs_sync_sb_buf() or xfs_update_secondary_sbs().
+ */
+static int
+xfs_ioc_setlabel(
+ struct file *filp,
+ struct xfs_mount *mp,
+ char __user *newlabel)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ char label[XFSLABEL_MAX + 1];
+ size_t len;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ /*
+ * The generic ioctl allows up to FSLABEL_MAX chars, but XFS is much
+ * smaller, at 12 bytes. We copy one more to be sure we find the
+ * (required) NULL character to test the incoming label length.
+ * NB: The on disk label doesn't need to be null terminated.
+ */
+ if (copy_from_user(label, newlabel, XFSLABEL_MAX + 1))
+ return -EFAULT;
+ len = strnlen(label, XFSLABEL_MAX + 1);
+ if (len > sizeof(sbp->sb_fname))
+ return -EINVAL;
+
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+
+ /* Replace the in-core label under m_sb_lock; unused bytes are zeroed. */
+ spin_lock(&mp->m_sb_lock);
+ memset(sbp->sb_fname, 0, sizeof(sbp->sb_fname));
+ strncpy(sbp->sb_fname, label, sizeof(sbp->sb_fname));
+ spin_unlock(&mp->m_sb_lock);
+
+ /*
+ * Now we do several things to satisfy userspace.
+ * In addition to normal logging of the primary superblock, we also
+ * immediately write these changes to sector zero for the primary, then
+ * update all backup supers (as xfs_db does for a label change), then
+ * invalidate the block device page cache. This is so that any prior
+ * buffered reads from userspace (i.e. from blkid) are invalidated,
+ * and userspace will see the newly-written label.
+ */
+ error = xfs_sync_sb_buf(mp);
+ if (error)
+ goto out;
+ /*
+ * growfs also updates backup supers so lock against that.
+ */
+ mutex_lock(&mp->m_growlock);
+ error = xfs_update_secondary_sbs(mp);
+ mutex_unlock(&mp->m_growlock);
+
+ /*
+ * Invalidate even if the secondary update failed; that error is still
+ * returned to the caller below.
+ */
+ invalidate_bdev(mp->m_ddev_targp->bt_bdev);
+
+out:
+ mnt_drop_write_file(filp);
+ return error;
+}
+
/*
* Note: some of the ioctl's return positive numbers as a
* byte count indicating success, such as readlink_by_handle.
@@ -1834,6 +1913,10 @@ xfs_file_ioctl(
switch (cmd) {
case FITRIM:
return xfs_ioc_trim(mp, arg);
+ case FS_IOC_GETFSLABEL:
+ return xfs_ioc_getlabel(mp, arg);
+ case FS_IOC_SETFSLABEL:
+ return xfs_ioc_setlabel(filp, mp, arg);
case XFS_IOC_ALLOCSP:
case XFS_IOC_FREESP:
case XFS_IOC_RESVSP:
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 046469fcc1b8..c6ce6f9335b6 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -224,7 +224,7 @@ xfs_iomap_write_direct(
* necessary and move on to transaction setup.
*/
xfs_iunlock(ip, lockmode);
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -576,7 +576,7 @@ xfs_file_iomap_begin_delay(
goto done;
}
- error = xfs_qm_dqattach_locked(ip, 0);
+ error = xfs_qm_dqattach_locked(ip, false);
if (error)
goto out_unlock;
@@ -692,7 +692,7 @@ xfs_iomap_write_allocate(
/*
* Make sure that the dquots are there.
*/
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -946,8 +946,11 @@ error_on_bmapi_transaction:
return error;
}
-static inline bool imap_needs_alloc(struct inode *inode,
- struct xfs_bmbt_irec *imap, int nimaps)
+static inline bool
+imap_needs_alloc(
+ struct inode *inode,
+ struct xfs_bmbt_irec *imap,
+ int nimaps)
{
return !nimaps ||
imap->br_startblock == HOLESTARTBLOCK ||
@@ -955,31 +958,58 @@ static inline bool imap_needs_alloc(struct inode *inode,
(IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN);
}
-static inline bool needs_cow_for_zeroing(struct xfs_bmbt_irec *imap, int nimaps)
+static inline bool
+needs_cow_for_zeroing(
+ struct xfs_bmbt_irec *imap,
+ int nimaps)
{
return nimaps &&
imap->br_startblock != HOLESTARTBLOCK &&
imap->br_state != XFS_EXT_UNWRITTEN;
}
-static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags)
+static int
+xfs_ilock_for_iomap(
+ struct xfs_inode *ip,
+ unsigned flags,
+ unsigned *lockmode)
{
+ unsigned mode = XFS_ILOCK_SHARED;
+
/*
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
*/
- if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO)))
- return true;
+ if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) {
+ /*
+ * FIXME: It could still overwrite on unshared extents and not
+ * need allocation.
+ */
+ if (flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+ mode = XFS_ILOCK_EXCL;
+ }
/*
- * Extents not yet cached requires exclusive access, don't block.
- * This is an opencoded xfs_ilock_data_map_shared() to cater for the
+ * Extents not yet cached requires exclusive access, don't block. This
+ * is an opencoded xfs_ilock_data_map_shared() call but with
* non-blocking behaviour.
*/
- if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
- !(ip->i_df.if_flags & XFS_IFEXTENTS))
- return true;
- return false;
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+ if (flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+ mode = XFS_ILOCK_EXCL;
+ }
+
+ if (flags & IOMAP_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, mode))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, mode);
+ }
+
+ *lockmode = mode;
+ return 0;
}
static int
@@ -1007,19 +1037,15 @@ xfs_file_iomap_begin(
return xfs_file_iomap_begin_delay(inode, offset, length, iomap);
}
- if (need_excl_ilock(ip, flags))
- lockmode = XFS_ILOCK_EXCL;
- else
- lockmode = XFS_ILOCK_SHARED;
-
- if (flags & IOMAP_NOWAIT) {
- if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
- return -EAGAIN;
- if (!xfs_ilock_nowait(ip, lockmode))
- return -EAGAIN;
- } else {
- xfs_ilock(ip, lockmode);
- }
+ /*
+ * Lock the inode in the manner required for the specified operation and
+ * check for as many conditions that would result in blocking as
+ * possible. This removes most of the non-blocking checks from the
+ * mapping code below.
+ */
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
ASSERT(offset <= mp->m_super->s_maxbytes);
if (offset > mp->m_super->s_maxbytes - length)
@@ -1040,19 +1066,21 @@ xfs_file_iomap_begin(
goto out_unlock;
}
- if (xfs_is_reflink_inode(ip) &&
- ((flags & IOMAP_WRITE) ||
- ((flags & IOMAP_ZERO) && needs_cow_for_zeroing(&imap, nimaps)))) {
+ /* Non-modifying mapping requested, so we are done */
+ if (!(flags & (IOMAP_WRITE | IOMAP_ZERO)))
+ goto out_found;
+
+ /*
+ * Break shared extents if necessary. Checks for non-blocking IO have
+ * been done up front, so we don't need to do them here.
+ */
+ if (xfs_is_reflink_inode(ip)) {
+ /* if zeroing doesn't need COW allocation, then we are done. */
+ if ((flags & IOMAP_ZERO) &&
+ !needs_cow_for_zeroing(&imap, nimaps))
+ goto out_found;
+
if (flags & IOMAP_DIRECT) {
- /*
- * A reflinked inode will result in CoW alloc.
- * FIXME: It could still overwrite on unshared extents
- * and not need allocation.
- */
- if (flags & IOMAP_NOWAIT) {
- error = -EAGAIN;
- goto out_unlock;
- }
/* may drop and re-acquire the ilock */
error = xfs_reflink_allocate_cow(ip, &imap, &shared,
&lockmode);
@@ -1068,46 +1096,45 @@ xfs_file_iomap_begin(
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
}
- if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) {
- /*
- * If nowait is set bail since we are going to make
- * allocations.
- */
- if (flags & IOMAP_NOWAIT) {
- error = -EAGAIN;
- goto out_unlock;
- }
- /*
- * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
- * pages to keep the chunks of work done where somewhat symmetric
- * with the work writeback does. This is a completely arbitrary
- * number pulled out of thin air as a best guess for initial
- * testing.
- *
- * Note that the values needs to be less than 32-bits wide until
- * the lower level functions are updated.
- */
- length = min_t(loff_t, length, 1024 * PAGE_SIZE);
- /*
- * xfs_iomap_write_direct() expects the shared lock. It
- * is unlocked on return.
- */
- if (lockmode == XFS_ILOCK_EXCL)
- xfs_ilock_demote(ip, lockmode);
- error = xfs_iomap_write_direct(ip, offset, length, &imap,
- nimaps);
- if (error)
- return error;
+ /* Don't need to allocate over holes when doing zeroing operations. */
+ if (flags & IOMAP_ZERO)
+ goto out_found;
- iomap->flags = IOMAP_F_NEW;
- trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
- } else {
- ASSERT(nimaps);
+ if (!imap_needs_alloc(inode, &imap, nimaps))
+ goto out_found;
- xfs_iunlock(ip, lockmode);
- trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+ /* If nowait is set bail since we are going to make allocations. */
+ if (flags & IOMAP_NOWAIT) {
+ error = -EAGAIN;
+ goto out_unlock;
}
+ /*
+ * We cap the maximum length we map to a sane size to keep the chunks
+ * of work done where somewhat symmetric with the work writeback does.
+ * This is a completely arbitrary number pulled out of thin air as a
+ * best guess for initial testing.
+ *
+ * Note that the value needs to be less than 32 bits wide until the
+ * lower level functions are updated.
+ */
+ length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+
+ /*
+ * xfs_iomap_write_direct() expects the shared lock. It is unlocked on
+ * return.
+ */
+ if (lockmode == XFS_ILOCK_EXCL)
+ xfs_ilock_demote(ip, lockmode);
+ error = xfs_iomap_write_direct(ip, offset, length, &imap,
+ nimaps);
+ if (error)
+ return error;
+
+ iomap->flags = IOMAP_F_NEW;
+ trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
+
+out_finish:
if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
& ~XFS_ILOG_TIMESTAMP))
iomap->flags |= IOMAP_F_DIRTY;
@@ -1117,6 +1144,13 @@ xfs_file_iomap_begin(
if (shared)
iomap->flags |= IOMAP_F_SHARED;
return 0;
+
+out_found:
+ ASSERT(nimaps);
+ xfs_iunlock(ip, lockmode);
+ trace_xfs_iomap_found(ip, offset, length, 0, &imap);
+ goto out_finish;
+
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index df42e4cb4dc4..3b4be06fdaa5 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -37,7 +37,6 @@
#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_trans_space.h"
-#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include <linux/capability.h>
@@ -855,7 +854,7 @@ xfs_setattr_size(
/*
* Make sure that the dquots are attached to the inode.
*/
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return error;
@@ -1030,14 +1029,19 @@ xfs_vn_setattr(
int error;
if (iattr->ia_valid & ATTR_SIZE) {
- struct xfs_inode *ip = XFS_I(d_inode(dentry));
- uint iolock = XFS_IOLOCK_EXCL;
+ struct inode *inode = d_inode(dentry);
+ struct xfs_inode *ip = XFS_I(inode);
+ uint iolock;
- error = xfs_break_layouts(d_inode(dentry), &iolock);
- if (error)
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+
+ error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
+ if (error) {
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
return error;
+ }
- xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
error = xfs_vn_setattr_size(dentry, iattr);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
} else {
@@ -1195,6 +1199,30 @@ static const struct inode_operations xfs_inline_symlink_inode_operations = {
.update_time = xfs_vn_update_time,
};
+/* Figure out if this file actually supports DAX. */
+static bool
+xfs_inode_supports_dax(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ /* Only supported on non-reflinked files. */
+ if (!S_ISREG(VFS_I(ip)->i_mode) || xfs_is_reflink_inode(ip))
+ return false;
+
+ /* DAX mount option or DAX iflag must be set. */
+ if (!(mp->m_flags & XFS_MOUNT_DAX) &&
+ !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
+ return false;
+
+ /* Block size must match page size */
+ if (mp->m_sb.sb_blocksize != PAGE_SIZE)
+ return false;
+
+ /* Device has to support DAX too. */
+ return xfs_find_daxdev_for_inode(VFS_I(ip)) != NULL;
+}
+
STATIC void
xfs_diflags_to_iflags(
struct inode *inode,
@@ -1213,11 +1241,7 @@ xfs_diflags_to_iflags(
inode->i_flags |= S_SYNC;
if (flags & XFS_DIFLAG_NOATIME)
inode->i_flags |= S_NOATIME;
- if (S_ISREG(inode->i_mode) &&
- ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE &&
- !xfs_is_reflink_inode(ip) &&
- (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
- ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
+ if (xfs_inode_supports_dax(ip))
inode->i_flags |= S_DAX;
}
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2fcd9ed5d075..c21039f27e39 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1047,6 +1047,7 @@ xfs_log_item_init(
INIT_LIST_HEAD(&item->li_ail);
INIT_LIST_HEAD(&item->li_cil);
INIT_LIST_HEAD(&item->li_bio_list);
+ INIT_LIST_HEAD(&item->li_trans);
}
/*
@@ -2110,10 +2111,10 @@ xlog_print_tic_res(
*/
void
xlog_print_trans(
- struct xfs_trans *tp)
+ struct xfs_trans *tp)
{
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_log_item_desc *lidp;
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_log_item *lip;
/* dump core transaction and ticket info */
xfs_warn(mp, "transaction summary:");
@@ -2124,15 +2125,14 @@ xlog_print_trans(
xlog_print_tic_res(mp, tp->t_ticket);
/* dump each log item */
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- struct xfs_log_item *lip = lidp->lid_item;
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
struct xfs_log_vec *lv = lip->li_lv;
struct xfs_log_iovec *vec;
int i;
xfs_warn(mp, "log item: ");
xfs_warn(mp, " type = 0x%x", lip->li_type);
- xfs_warn(mp, " flags = 0x%x", lip->li_flags);
+ xfs_warn(mp, " flags = 0x%lx", lip->li_flags);
if (!lv)
continue;
xfs_warn(mp, " niovecs = %d", lv->lv_niovecs);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4668403b1741..c15687724728 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -141,10 +141,9 @@ xlog_cil_alloc_shadow_bufs(
struct xlog *log,
struct xfs_trans *tp)
{
- struct xfs_log_item_desc *lidp;
+ struct xfs_log_item *lip;
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- struct xfs_log_item *lip = lidp->lid_item;
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
struct xfs_log_vec *lv;
int niovecs = 0;
int nbytes = 0;
@@ -152,7 +151,7 @@ xlog_cil_alloc_shadow_bufs(
bool ordered = false;
/* Skip items which aren't dirty in this transaction. */
- if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
continue;
/* get number of vecs and size of data to be stored */
@@ -317,7 +316,7 @@ xlog_cil_insert_format_items(
int *diff_len,
int *diff_iovecs)
{
- struct xfs_log_item_desc *lidp;
+ struct xfs_log_item *lip;
/* Bail out if we didn't find a log item. */
@@ -326,15 +325,14 @@ xlog_cil_insert_format_items(
return;
}
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- struct xfs_log_item *lip = lidp->lid_item;
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
struct xfs_log_vec *lv;
struct xfs_log_vec *old_lv = NULL;
struct xfs_log_vec *shadow;
bool ordered = false;
/* Skip items which aren't dirty in this transaction. */
- if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
continue;
/*
@@ -406,7 +404,7 @@ xlog_cil_insert_items(
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_cil_ctx *ctx = cil->xc_ctx;
- struct xfs_log_item_desc *lidp;
+ struct xfs_log_item *lip;
int len = 0;
int diff_iovecs = 0;
int iclog_space;
@@ -479,11 +477,10 @@ xlog_cil_insert_items(
* We do this here so we only need to take the CIL lock once during
* the transaction commit.
*/
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- struct xfs_log_item *lip = lidp->lid_item;
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
/* Skip items which aren't dirty in this transaction. */
- if (!(lidp->lid_flags & XFS_LID_DIRTY))
+ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
continue;
/*
@@ -1013,6 +1010,7 @@ xfs_log_commit_cil(
*commit_lsn = xc_commit_lsn;
xfs_log_done(mp, tp->t_ticket, NULL, regrant);
+ tp->t_ticket = NULL;
xfs_trans_unreserve_and_mod_sb(tp);
/*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 2b2383f1895e..06a09cb948b5 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2702,7 +2702,7 @@ xlog_recover_do_reg_buffer(
goto next;
}
fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr,
- -1, 0, 0);
+ -1, 0);
if (fa) {
xfs_alert(mp,
"dquot corrupt at %pS trying to replay into block 0x%llx",
@@ -3348,7 +3348,7 @@ xlog_recover_dquot_pass2(
*/
dq_f = item->ri_buf[0].i_addr;
ASSERT(dq_f);
- fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0, 0);
+ fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0);
if (fa) {
xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
dq_f->qlf_id, fa);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index a901b86772f8..73ed8fec0328 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1072,9 +1072,7 @@ xfs_unmountfs(
uint64_t resblks;
int error;
- cancel_delayed_work_sync(&mp->m_eofblocks_work);
- cancel_delayed_work_sync(&mp->m_cowblocks_work);
-
+ xfs_icache_disable_reclaim(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
xfs_rtunmount_inodes(mp);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index aa6c5c193f45..f44c3599527d 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -31,19 +31,20 @@
* rules in the page fault path we don't bother.
*/
int
-xfs_break_layouts(
+xfs_break_leased_layouts(
struct inode *inode,
- uint *iolock)
+ uint *iolock,
+ bool *did_unlock)
{
struct xfs_inode *ip = XFS_I(inode);
int error;
- ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
-
while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
xfs_iunlock(ip, *iolock);
+ *did_unlock = true;
error = break_layout(inode, true);
- *iolock = XFS_IOLOCK_EXCL;
+ *iolock &= ~XFS_IOLOCK_SHARED;
+ *iolock |= XFS_IOLOCK_EXCL;
xfs_ilock(ip, *iolock);
}
@@ -120,8 +121,8 @@ xfs_fs_map_blocks(
* Lock out any other I/O before we flush and invalidate the pagecache,
* and then hand out a layout to the remote system. This is very
* similar to direct I/O, except that the synchronization is much more
- * complicated. See the comment near xfs_break_layouts for a detailed
- * explanation.
+ * complicated. See the comment near xfs_break_leased_layouts
+ * for a detailed explanation.
*/
xfs_ilock(ip, XFS_IOLOCK_EXCL);
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index bf45951e28fe..940c6c2ad88c 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -9,10 +9,11 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
struct iattr *iattr);
-int xfs_break_layouts(struct inode *inode, uint *iolock);
+int xfs_break_leased_layouts(struct inode *inode, uint *iolock,
+ bool *did_unlock);
#else
static inline int
-xfs_break_layouts(struct inode *inode, uint *iolock)
+xfs_break_leased_layouts(struct inode *inode, uint *iolock, bool *did_unlock)
{
return 0;
}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index ec39ae274c78..c3e014bfc848 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -161,10 +161,7 @@ xfs_qm_dqpurge(
* to purge this dquot anyway, so we go ahead regardless.
*/
error = xfs_qm_dqflush(dqp, &bp);
- if (error) {
- xfs_warn(mp, "%s: dquot "PTR_FMT" flush failed",
- __func__, dqp);
- } else {
+ if (!error) {
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
}
@@ -173,7 +170,7 @@ xfs_qm_dqpurge(
ASSERT(atomic_read(&dqp->q_pincount) == 0);
ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
- !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
+ !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags));
xfs_dqfunlock(dqp);
xfs_dqunlock(dqp);
@@ -265,7 +262,7 @@ xfs_qm_dqattach_one(
xfs_inode_t *ip,
xfs_dqid_t id,
uint type,
- uint doalloc,
+ bool doalloc,
xfs_dquot_t **IO_idqpp)
{
xfs_dquot_t *dqp;
@@ -291,7 +288,7 @@ xfs_qm_dqattach_one(
* exist on disk and we didn't ask it to allocate; ESRCH if quotas got
* turned off suddenly.
*/
- error = xfs_qm_dqget(ip->i_mount, ip, id, type, doalloc, &dqp);
+ error = xfs_qm_dqget_inode(ip, type, doalloc, &dqp);
if (error)
return error;
@@ -326,14 +323,14 @@ xfs_qm_need_dqattach(
/*
* Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
* into account.
- * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
+ * If @doalloc is true, the dquot(s) will be allocated if needed.
* Inode may get unlocked and relocked in here, and the caller must deal with
* the consequences.
*/
int
xfs_qm_dqattach_locked(
xfs_inode_t *ip,
- uint flags)
+ bool doalloc)
{
xfs_mount_t *mp = ip->i_mount;
int error = 0;
@@ -345,8 +342,7 @@ xfs_qm_dqattach_locked(
if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
- flags & XFS_QMOPT_DQALLOC,
- &ip->i_udquot);
+ doalloc, &ip->i_udquot);
if (error)
goto done;
ASSERT(ip->i_udquot);
@@ -354,8 +350,7 @@ xfs_qm_dqattach_locked(
if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
- flags & XFS_QMOPT_DQALLOC,
- &ip->i_gdquot);
+ doalloc, &ip->i_gdquot);
if (error)
goto done;
ASSERT(ip->i_gdquot);
@@ -363,8 +358,7 @@ xfs_qm_dqattach_locked(
if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
- flags & XFS_QMOPT_DQALLOC,
- &ip->i_pdquot);
+ doalloc, &ip->i_pdquot);
if (error)
goto done;
ASSERT(ip->i_pdquot);
@@ -381,8 +375,7 @@ done:
int
xfs_qm_dqattach(
- struct xfs_inode *ip,
- uint flags)
+ struct xfs_inode *ip)
{
int error;
@@ -390,7 +383,7 @@ xfs_qm_dqattach(
return 0;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_qm_dqattach_locked(ip, flags);
+ error = xfs_qm_dqattach_locked(ip, false);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -479,11 +472,8 @@ xfs_qm_dquot_isolate(
spin_unlock(lru_lock);
error = xfs_qm_dqflush(dqp, &bp);
- if (error) {
- xfs_warn(dqp->q_mount, "%s: dquot "PTR_FMT" flush failed",
- __func__, dqp);
+ if (error)
goto out_unlock_dirty;
- }
xfs_buf_delwri_queue(bp, &isol->buffers);
xfs_buf_relse(bp);
@@ -571,27 +561,88 @@ xfs_qm_set_defquota(
{
xfs_dquot_t *dqp;
struct xfs_def_quota *defq;
+ struct xfs_disk_dquot *ddqp;
int error;
- error = xfs_qm_dqread(mp, 0, type, 0, &dqp);
+ error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
+ if (error)
+ return;
- if (!error) {
- xfs_disk_dquot_t *ddqp = &dqp->q_core;
+ ddqp = &dqp->q_core;
+ defq = xfs_get_defquota(dqp, qinf);
- defq = xfs_get_defquota(dqp, qinf);
+ /*
+ * Timers and warnings have been already set, let's just set the
+ * default limits for this quota type
+ */
+ defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
+ defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
+ defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
+ defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
+ defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
+ defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+ xfs_qm_dqdestroy(dqp);
+}
- /*
- * Timers and warnings have been already set, let's just set the
- * default limits for this quota type
- */
- defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
- defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
- defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
- defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
- defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
- defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
- xfs_qm_dqdestroy(dqp);
- }
+/* Initialize quota time limits from the root dquot. */
+static void
+xfs_qm_init_timelimits(
+ struct xfs_mount *mp,
+ struct xfs_quotainfo *qinf)
+{
+ struct xfs_disk_dquot *ddqp;
+ struct xfs_dquot *dqp;
+ uint type;
+ int error;
+
+ qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
+ qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
+ qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
+ qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
+ qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
+ qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
+
+ /*
+ * We try to get the limits from the superuser's limits fields.
+ * This is quite hacky, but it is standard quota practice.
+ *
+ * Since we may not have done a quotacheck by this point, just read
+ * the dquot without attaching it to any hashtables or lists.
+ *
+ * Timers and warnings are globally set by the first timer found in
+ * user/group/proj quota types, otherwise a default value is used.
+ * This should be split into different fields per quota type.
+ */
+ if (XFS_IS_UQUOTA_RUNNING(mp))
+ type = XFS_DQ_USER;
+ else if (XFS_IS_GQUOTA_RUNNING(mp))
+ type = XFS_DQ_GROUP;
+ else
+ type = XFS_DQ_PROJ;
+ error = xfs_qm_dqget_uncached(mp, 0, type, &dqp);
+ if (error)
+ return;
+
+ ddqp = &dqp->q_core;
+ /*
+ * The warnings and timers set the grace period given to
+ * a user or group before he or she can not perform any
+ * more writing. If it is zero, a default is used.
+ */
+ if (ddqp->d_btimer)
+ qinf->qi_btimelimit = be32_to_cpu(ddqp->d_btimer);
+ if (ddqp->d_itimer)
+ qinf->qi_itimelimit = be32_to_cpu(ddqp->d_itimer);
+ if (ddqp->d_rtbtimer)
+ qinf->qi_rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer);
+ if (ddqp->d_bwarns)
+ qinf->qi_bwarnlimit = be16_to_cpu(ddqp->d_bwarns);
+ if (ddqp->d_iwarns)
+ qinf->qi_iwarnlimit = be16_to_cpu(ddqp->d_iwarns);
+ if (ddqp->d_rtbwarns)
+ qinf->qi_rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns);
+
+ xfs_qm_dqdestroy(dqp);
}
/*
@@ -600,11 +651,10 @@ xfs_qm_set_defquota(
*/
STATIC int
xfs_qm_init_quotainfo(
- xfs_mount_t *mp)
+ struct xfs_mount *mp)
{
- xfs_quotainfo_t *qinf;
- int error;
- xfs_dquot_t *dqp;
+ struct xfs_quotainfo *qinf;
+ int error;
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
@@ -636,52 +686,7 @@ xfs_qm_init_quotainfo(
mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
- /*
- * We try to get the limits from the superuser's limits fields.
- * This is quite hacky, but it is standard quota practice.
- *
- * Since we may not have done a quotacheck by this point, just read
- * the dquot without attaching it to any hashtables or lists.
- *
- * Timers and warnings are globally set by the first timer found in
- * user/group/proj quota types, otherwise a default value is used.
- * This should be split into different fields per quota type.
- */
- error = xfs_qm_dqread(mp, 0,
- XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER :
- (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
- XFS_DQ_PROJ),
- 0, &dqp);
-
- if (!error) {
- xfs_disk_dquot_t *ddqp = &dqp->q_core;
-
- /*
- * The warnings and timers set the grace period given to
- * a user or group before he or she can not perform any
- * more writing. If it is zero, a default is used.
- */
- qinf->qi_btimelimit = ddqp->d_btimer ?
- be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT;
- qinf->qi_itimelimit = ddqp->d_itimer ?
- be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT;
- qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ?
- be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT;
- qinf->qi_bwarnlimit = ddqp->d_bwarns ?
- be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT;
- qinf->qi_iwarnlimit = ddqp->d_iwarns ?
- be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT;
- qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ?
- be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT;
- xfs_qm_dqdestroy(dqp);
- } else {
- qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
- qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
- qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
- qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
- qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
- qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
- }
+ xfs_qm_init_timelimits(mp, qinf);
if (XFS_IS_UQUOTA_RUNNING(mp))
xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf);
@@ -865,9 +870,9 @@ xfs_qm_reset_dqcounts(
* find uninitialised dquot blks. See comment in
* xfs_dquot_verify.
*/
- fa = xfs_dquot_verify(mp, ddq, id + j, type, 0);
+ fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type);
if (fa)
- xfs_dquot_repair(mp, ddq, id + j, type);
+ xfs_dqblk_repair(mp, &dqb[j], id + j, type);
/*
* Reset type in case we are reusing group quota file for
@@ -893,7 +898,7 @@ xfs_qm_reset_dqcounts(
}
STATIC int
-xfs_qm_dqiter_bufs(
+xfs_qm_reset_dqcounts_all(
struct xfs_mount *mp,
xfs_dqid_t firstid,
xfs_fsblock_t bno,
@@ -961,11 +966,11 @@ xfs_qm_dqiter_bufs(
}
/*
- * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a
- * caller supplied function for every chunk of dquots that we find.
+ * Iterate over all allocated dquot blocks in this quota inode, zeroing all
+ * counters for every chunk of dquots that we find.
*/
STATIC int
-xfs_qm_dqiterate(
+xfs_qm_reset_dqcounts_buf(
struct xfs_mount *mp,
struct xfs_inode *qip,
uint flags,
@@ -1041,7 +1046,7 @@ xfs_qm_dqiterate(
* Iterate thru all the blks in the extent and
* reset the counters of all the dquots inside them.
*/
- error = xfs_qm_dqiter_bufs(mp, firstid,
+ error = xfs_qm_reset_dqcounts_all(mp, firstid,
map[i].br_startblock,
map[i].br_blockcount,
flags, buffer_list);
@@ -1066,16 +1071,17 @@ out:
STATIC int
xfs_qm_quotacheck_dqadjust(
struct xfs_inode *ip,
- xfs_dqid_t id,
uint type,
xfs_qcnt_t nblks,
xfs_qcnt_t rtblks)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_dquot *dqp;
+ xfs_dqid_t id;
int error;
- error = xfs_qm_dqget(mp, ip, id, type, XFS_QMOPT_DQALLOC, &dqp);
+ id = xfs_qm_id_for_quotatype(ip, type);
+ error = xfs_qm_dqget(mp, id, type, true, &dqp);
if (error) {
/*
* Shouldn't be able to turn off quotas here.
@@ -1148,13 +1154,10 @@ xfs_qm_dqusage_adjust(
}
/*
- * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget
- * interface expects the inode to be exclusively locked because that's
- * the case in all other instances. It's OK that we do this because
- * quotacheck is done only at mount time.
+ * We don't _need_ to take the ilock EXCL here because quotacheck runs
+ * at mount time and therefore nobody will be racing chown/chproj.
*/
- error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, XFS_ILOCK_EXCL,
- &ip);
+ error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip);
if (error) {
*res = BULKSTAT_RV_NOTHING;
return error;
@@ -1189,33 +1192,31 @@ xfs_qm_dqusage_adjust(
* and quotaoffs don't race. (Quotachecks happen at mount time only).
*/
if (XFS_IS_UQUOTA_ON(mp)) {
- error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
- XFS_DQ_USER, nblks, rtblks);
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks,
+ rtblks);
if (error)
goto error0;
}
if (XFS_IS_GQUOTA_ON(mp)) {
- error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
- XFS_DQ_GROUP, nblks, rtblks);
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks,
+ rtblks);
if (error)
goto error0;
}
if (XFS_IS_PQUOTA_ON(mp)) {
- error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
- XFS_DQ_PROJ, nblks, rtblks);
+ error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks,
+ rtblks);
if (error)
goto error0;
}
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
IRELE(ip);
*res = BULKSTAT_RV_DIDONE;
return 0;
error0:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
IRELE(ip);
*res = BULKSTAT_RV_GIVEUP;
return error;
@@ -1247,9 +1248,8 @@ xfs_qm_flush_one(
*/
if (!xfs_dqflock_nowait(dqp)) {
/* buf is pinned in-core by delwri list */
- DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen);
- bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL);
+ bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0);
if (!bp) {
error = -EINVAL;
goto out_unlock;
@@ -1307,7 +1307,7 @@ xfs_qm_quotacheck(
* We don't log our changes till later.
*/
if (uip) {
- error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
+ error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA,
&buffer_list);
if (error)
goto error_return;
@@ -1315,7 +1315,7 @@ xfs_qm_quotacheck(
}
if (gip) {
- error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA,
+ error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA,
&buffer_list);
if (error)
goto error_return;
@@ -1323,7 +1323,7 @@ xfs_qm_quotacheck(
}
if (pip) {
- error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA,
+ error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA,
&buffer_list);
if (error)
goto error_return;
@@ -1675,7 +1675,7 @@ xfs_qm_vop_dqalloc(
* if necessary. The dquot(s) will not be locked.
*/
if (XFS_NOT_DQATTACHED(mp, ip)) {
- error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC);
+ error = xfs_qm_dqattach_locked(ip, true);
if (error) {
xfs_iunlock(ip, lockflags);
return error;
@@ -1694,10 +1694,7 @@ xfs_qm_vop_dqalloc(
* holding ilock.
*/
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, NULL, uid,
- XFS_DQ_USER,
- XFS_QMOPT_DQALLOC,
- &uq);
+ error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq);
if (error) {
ASSERT(error != -ENOENT);
return error;
@@ -1720,10 +1717,7 @@ xfs_qm_vop_dqalloc(
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
if (ip->i_d.di_gid != gid) {
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, NULL, gid,
- XFS_DQ_GROUP,
- XFS_QMOPT_DQALLOC,
- &gq);
+ error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq);
if (error) {
ASSERT(error != -ENOENT);
goto error_rele;
@@ -1739,10 +1733,8 @@ xfs_qm_vop_dqalloc(
if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
if (xfs_get_projid(ip) != prid) {
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
- XFS_DQ_PROJ,
- XFS_QMOPT_DQALLOC,
- &pq);
+ error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ,
+ true, &pq);
if (error) {
ASSERT(error != -ENOENT);
goto error_rele;
@@ -1933,7 +1925,7 @@ xfs_qm_vop_rename_dqattach(
*/
if (i == 0 || ip != i_tab[i-1]) {
if (XFS_NOT_DQATTACHED(mp, ip)) {
- error = xfs_qm_dqattach(ip, 0);
+ error = xfs_qm_dqattach(ip);
if (error)
return error;
}
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 2975a822e9f0..e3129b280423 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -170,8 +170,10 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
/* quota ops */
extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
-extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t *,
- uint, struct qc_dqblk *, uint);
+extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
+ uint, struct qc_dqblk *);
+extern int xfs_qm_scall_getquota_next(struct xfs_mount *,
+ xfs_dqid_t *, uint, struct qc_dqblk *);
extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
struct qc_dqblk *);
extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint);
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 2be6d2735ca9..36b89e2c5eb9 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -72,7 +72,7 @@ xfs_qm_statvfs(
xfs_mount_t *mp = ip->i_mount;
xfs_dquot_t *dqp;
- if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
+ if (!xfs_qm_dqget(mp, xfs_get_projid(ip), XFS_DQ_PROJ, false, &dqp)) {
xfs_fill_statvfs_from_dquot(statp, dqp);
xfs_qm_dqput(dqp);
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 9cb5c381b01c..3e05d300b14e 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -425,7 +425,7 @@ xfs_qm_scall_setqlim(
* a reference to the dquot, so it's safe to do this unlock/lock without
* it being reclaimed in the mean time.
*/
- error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
+ error = xfs_qm_dqget(mp, id, type, true, &dqp);
if (error) {
ASSERT(error != -ENOENT);
goto out_unlock;
@@ -622,39 +622,14 @@ out:
return error;
}
-
-int
-xfs_qm_scall_getquota(
+/* Fill out the quota context. */
+static void
+xfs_qm_scall_getquota_fill_qc(
struct xfs_mount *mp,
- xfs_dqid_t *id,
uint type,
- struct qc_dqblk *dst,
- uint dqget_flags)
+ const struct xfs_dquot *dqp,
+ struct qc_dqblk *dst)
{
- struct xfs_dquot *dqp;
- int error;
-
- /*
- * Try to get the dquot. We don't want it allocated on disk, so
- * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
- * exist, we'll get ENOENT back.
- */
- error = xfs_qm_dqget(mp, NULL, *id, type, dqget_flags, &dqp);
- if (error)
- return error;
-
- /*
- * If everything's NULL, this dquot doesn't quite exist as far as
- * our utility programs are concerned.
- */
- if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
- error = -ENOENT;
- goto out_put;
- }
-
- /* Fill in the ID we actually read from disk */
- *id = be32_to_cpu(dqp->q_core.d_id);
-
memset(dst, 0, sizeof(*dst));
dst->d_spc_hardlimit =
XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
@@ -696,7 +671,7 @@ xfs_qm_scall_getquota(
if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
(XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
(XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
- *id != 0) {
+ dqp->q_core.d_id != 0) {
if ((dst->d_space > dst->d_spc_softlimit) &&
(dst->d_spc_softlimit > 0)) {
ASSERT(dst->d_spc_timer != 0);
@@ -707,11 +682,69 @@ xfs_qm_scall_getquota(
}
}
#endif
+}
+
+/* Return the quota information for the dquot matching id. */
+int
+xfs_qm_scall_getquota(
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ struct qc_dqblk *dst)
+{
+ struct xfs_dquot *dqp;
+ int error;
+
+ /*
+ * Try to get the dquot. We don't want it allocated on disk, so don't
+ * set doalloc. If it doesn't exist, we'll get ENOENT back.
+ */
+ error = xfs_qm_dqget(mp, id, type, false, &dqp);
+ if (error)
+ return error;
+
+ /*
+ * If everything's NULL, this dquot doesn't quite exist as far as
+ * our utility programs are concerned.
+ */
+ if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+ error = -ENOENT;
+ goto out_put;
+ }
+
+ xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst);
+
out_put:
xfs_qm_dqput(dqp);
return error;
}
+/*
+ * Return the quota information for the first initialized dquot whose id
+ * is at least as high as id.
+ */
+int
+xfs_qm_scall_getquota_next(
+ struct xfs_mount *mp,
+ xfs_dqid_t *id,
+ uint type,
+ struct qc_dqblk *dst)
+{
+ struct xfs_dquot *dqp;
+ int error;
+
+ error = xfs_qm_dqget_next(mp, *id, type, &dqp);
+ if (error)
+ return error;
+
+ /* Fill in the ID we actually read from disk */
+ *id = be32_to_cpu(dqp->q_core.d_id);
+
+ xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst);
+
+ xfs_qm_dqput(dqp);
+ return error;
+}
STATIC int
xfs_dqrele_inode(
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index ce6506adab7b..3edf52b14919 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -48,6 +48,22 @@ struct xfs_trans;
(XFS_IS_PQUOTA_ON(mp) && \
(mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0))
+static inline uint
+xfs_quota_chkd_flag(
+ uint dqtype)
+{
+ switch (dqtype) {
+ case XFS_DQ_USER:
+ return XFS_UQUOTA_CHKD;
+ case XFS_DQ_GROUP:
+ return XFS_GQUOTA_CHKD;
+ case XFS_DQ_PROJ:
+ return XFS_PQUOTA_CHKD;
+ default:
+ return 0;
+ }
+}
+
/*
* The structure kept inside the xfs_trans_t keep track of dquot changes
* within a transaction and apply them later.
@@ -90,8 +106,8 @@ extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *,
extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *,
struct xfs_dquot *, struct xfs_dquot *,
struct xfs_dquot *, uint);
-extern int xfs_qm_dqattach(struct xfs_inode *, uint);
-extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint);
+extern int xfs_qm_dqattach(struct xfs_inode *);
+extern int xfs_qm_dqattach_locked(struct xfs_inode *ip, bool doalloc);
extern void xfs_qm_dqdetach(struct xfs_inode *);
extern void xfs_qm_dqrele(struct xfs_dquot *);
extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *);
@@ -132,7 +148,7 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
#define xfs_qm_vop_rename_dqattach(it) (0)
#define xfs_qm_vop_chown(tp, ip, old, new) (NULL)
#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0)
-#define xfs_qm_dqattach(ip, fl) (0)
+#define xfs_qm_dqattach(ip) (0)
#define xfs_qm_dqattach_locked(ip, fl) (0)
#define xfs_qm_dqdetach(ip)
#define xfs_qm_dqrele(d)
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index a65108594a07..c93fc913dffb 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -239,8 +239,7 @@ xfs_fs_get_dqblk(
return -ESRCH;
id = from_kqid(&init_user_ns, qid);
- return xfs_qm_scall_getquota(mp, &id,
- xfs_quota_type(qid.type), qdq, 0);
+ return xfs_qm_scall_getquota(mp, id, xfs_quota_type(qid.type), qdq);
}
/* Return quota info for active quota >= this qid */
@@ -260,9 +259,8 @@ xfs_fs_get_nextdqblk(
return -ESRCH;
id = from_kqid(&init_user_ns, *qid);
- ret = xfs_qm_scall_getquota(mp, &id,
- xfs_quota_type(qid->type), qdq,
- XFS_QMOPT_DQNEXT);
+ ret = xfs_qm_scall_getquota_next(mp, &id, xfs_quota_type(qid->type),
+ qdq);
if (ret)
return ret;
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 15c9393dd7a7..e5866b714d5f 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -159,7 +159,7 @@ STATIC void
xfs_cui_item_unlock(
struct xfs_log_item *lip)
{
- if (lip->li_flags & XFS_LI_ABORTED)
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
xfs_cui_release(CUI_ITEM(lip));
}
@@ -310,7 +310,7 @@ xfs_cud_item_unlock(
{
struct xfs_cud_log_item *cudp = CUD_ITEM(lip);
- if (lip->li_flags & XFS_LI_ABORTED) {
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
xfs_cui_release(cudp->cud_cuip);
kmem_zone_free(xfs_cud_zone, cudp);
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index cdbd342a5249..713e857d9ffa 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -305,7 +305,7 @@ xfs_reflink_reserve_cow(
* Fork all the shared blocks from our write offset until the end of
* the extent.
*/
- error = xfs_qm_dqattach_locked(ip, 0);
+ error = xfs_qm_dqattach_locked(ip, false);
if (error)
return error;
@@ -431,7 +431,7 @@ retry:
if (error)
return error;
- error = xfs_qm_dqattach_locked(ip, 0);
+ error = xfs_qm_dqattach_locked(ip, false);
if (error)
goto out;
goto retry;
@@ -552,6 +552,9 @@ xfs_reflink_trim_irec_to_next_cow(
*
* If cancel_real is true this function cancels all COW fork extents for the
* inode; if cancel_real is false, real extents are not cleared.
+ *
+ * Caller must have already joined the inode to the current transaction. The
+ * inode will be joined to the transaction returned to the caller.
*/
int
xfs_reflink_cancel_cow_blocks(
@@ -592,7 +595,6 @@ xfs_reflink_cancel_cow_blocks(
if (error)
break;
} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
- xfs_trans_ijoin(*tpp, ip, 0);
xfs_defer_init(&dfops, &firstfsb);
/* Free the CoW orphan record. */
@@ -1359,7 +1361,7 @@ xfs_reflink_remap_range(
goto out_unlock;
/* Attach dquots to dest inode before changing block map */
- ret = xfs_qm_dqattach(dest, 0);
+ ret = xfs_qm_dqattach(dest);
if (ret)
goto out_unlock;
@@ -1551,7 +1553,12 @@ next:
return 0;
}
-/* Clear the inode reflink flag if there are no shared extents. */
+/*
+ * Clear the inode reflink flag if there are no shared extents.
+ *
+ * The caller is responsible for joining the inode to the transaction passed in.
+ * The inode will be joined to the transaction that is returned to the caller.
+ */
int
xfs_reflink_clear_inode_flag(
struct xfs_inode *ip,
@@ -1578,7 +1585,6 @@ xfs_reflink_clear_inode_flag(
trace_xfs_reflink_unset_inode_flag(ip);
ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
xfs_inode_clear_cowblocks_tag(ip);
- xfs_trans_ijoin(*tpp, ip, 0);
xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
return error;
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 06a07846c9b3..e5b5b3e7ef82 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -158,7 +158,7 @@ STATIC void
xfs_rui_item_unlock(
struct xfs_log_item *lip)
{
- if (lip->li_flags & XFS_LI_ABORTED)
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags))
xfs_rui_release(RUI_ITEM(lip));
}
@@ -331,7 +331,7 @@ xfs_rud_item_unlock(
{
struct xfs_rud_log_item *rudp = RUD_ITEM(lip);
- if (lip->li_flags & XFS_LI_ABORTED) {
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
xfs_rui_release(rudp->rud_ruip);
kmem_zone_free(xfs_rud_zone, rudp);
}
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index dfee3c991155..52632ab727f7 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -23,9 +23,14 @@
struct xfs_mount;
struct xfs_trans;
+/*
+ * XXX: Most of the realtime allocation functions deal in units of realtime
+ * extents, not realtime blocks. This looks funny when paired with the type
+ * name and screams for a larger cleanup.
+ */
struct xfs_rtalloc_rec {
- xfs_rtblock_t ar_startblock;
- xfs_rtblock_t ar_blockcount;
+ xfs_rtblock_t ar_startext;
+ xfs_rtblock_t ar_extcount;
};
typedef int (*xfs_rtalloc_query_range_fn)(
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f643d76db516..ed67389f4948 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1372,7 +1372,6 @@ xfs_fs_remount(
*/
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
- xfs_queue_eofblocks(mp);
/* Recover any CoW blocks that never got remapped. */
error = xfs_reflink_recover_cow(mp);
@@ -1382,7 +1381,7 @@ xfs_fs_remount(
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
}
- xfs_queue_cowblocks(mp);
+ xfs_icache_enable_reclaim(mp);
/* Create the per-AG metadata reservation pool .*/
error = xfs_fs_reserve_ag_blocks(mp);
@@ -1392,8 +1391,13 @@ xfs_fs_remount(
/* rw -> ro */
if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) {
+ /*
+ * Cancel background eofb scanning so it cannot race with the
+ * final log force+buftarg wait and deadlock the remount.
+ */
+ xfs_icache_disable_reclaim(mp);
+
/* Get rid of any leftover CoW reservations... */
- cancel_delayed_work_sync(&mp->m_cowblocks_work);
error = xfs_icache_free_cowblocks(mp, NULL);
if (error) {
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -1416,12 +1420,6 @@ xfs_fs_remount(
*/
xfs_save_resvblks(mp);
- /*
- * Cancel background eofb scanning so it cannot race with the
- * final log force+buftarg wait and deadlock the remount.
- */
- cancel_delayed_work_sync(&mp->m_eofblocks_work);
-
xfs_quiesce_attr(mp);
mp->m_flags |= XFS_MOUNT_RDONLY;
}
@@ -1441,6 +1439,7 @@ xfs_fs_freeze(
{
struct xfs_mount *mp = XFS_M(sb);
+ xfs_icache_disable_reclaim(mp);
xfs_save_resvblks(mp);
xfs_quiesce_attr(mp);
return xfs_sync_sb(mp, true);
@@ -1454,6 +1453,7 @@ xfs_fs_unfreeze(
xfs_restore_resvblks(mp);
xfs_log_work_queue(mp);
+ xfs_icache_enable_reclaim(mp);
return 0;
}
@@ -1635,6 +1635,17 @@ xfs_fs_fill_super(
#endif
sb->s_op = &xfs_super_operations;
+ /*
+ * Delay mount work if the debug hook is set. This is debug
+ * instrumentation to coordinate simulation of xfs mount failures with
+ * VFS superblock operations
+ */
+ if (xfs_globals.mount_delay) {
+ xfs_notice(mp, "Delaying mount for %d seconds.",
+ xfs_globals.mount_delay);
+ msleep(xfs_globals.mount_delay * 1000);
+ }
+
if (silent)
flags |= XFS_MFSI_QUIET;
@@ -1690,11 +1701,17 @@ xfs_fs_fill_super(
sb->s_flags |= SB_I_VERSION;
if (mp->m_flags & XFS_MOUNT_DAX) {
+ bool rtdev_is_dax = false, datadev_is_dax;
+
xfs_warn(mp,
"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- error = bdev_dax_supported(sb, sb->s_blocksize);
- if (error) {
+ datadev_is_dax = bdev_dax_supported(mp->m_ddev_targp->bt_bdev,
+ sb->s_blocksize);
+ if (mp->m_rtdev_targp)
+ rtdev_is_dax = bdev_dax_supported(
+ mp->m_rtdev_targp->bt_bdev, sb->s_blocksize);
+ if (!rtdev_is_dax && !datadev_is_dax) {
xfs_alert(mp,
"DAX unsupported by block device. Turning off DAX.");
mp->m_flags &= ~XFS_MOUNT_DAX;
@@ -1761,6 +1778,7 @@ xfs_fs_fill_super(
out_close_devices:
xfs_close_devices(mp);
out_free_fsname:
+ sb->s_fs_info = NULL;
xfs_free_fsname(mp);
kfree(mp);
out:
@@ -1778,6 +1796,10 @@ xfs_fs_put_super(
{
struct xfs_mount *mp = XFS_M(sb);
+ /* if ->fill_super failed, we have no mount to tear down */
+ if (!sb->s_fs_info)
+ return;
+
xfs_notice(mp, "Unmounting Filesystem");
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
@@ -1787,6 +1809,8 @@ xfs_fs_put_super(
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
xfs_close_devices(mp);
+
+ sb->s_fs_info = NULL;
xfs_free_fsname(mp);
kfree(mp);
}
@@ -1806,6 +1830,9 @@ xfs_fs_nr_cached_objects(
struct super_block *sb,
struct shrink_control *sc)
{
+ /* Paranoia: catch incorrect calls during mount setup or teardown */
+ if (WARN_ON_ONCE(!sb->s_fs_info))
+ return 0;
return xfs_reclaim_inodes_count(XFS_M(sb));
}
@@ -1879,11 +1906,6 @@ xfs_init_zones(void)
if (!xfs_trans_zone)
goto out_destroy_ifork_zone;
- xfs_log_item_desc_zone =
- kmem_zone_init(sizeof(struct xfs_log_item_desc),
- "xfs_log_item_desc");
- if (!xfs_log_item_desc_zone)
- goto out_destroy_trans_zone;
/*
* The size of the zone allocated buf log item is the maximum
@@ -1893,7 +1915,7 @@ xfs_init_zones(void)
xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item),
"xfs_buf_item");
if (!xfs_buf_item_zone)
- goto out_destroy_log_item_desc_zone;
+ goto out_destroy_trans_zone;
xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
((XFS_EFD_MAX_FAST_EXTENTS - 1) *
@@ -1981,8 +2003,6 @@ xfs_init_zones(void)
kmem_zone_destroy(xfs_efd_zone);
out_destroy_buf_item_zone:
kmem_zone_destroy(xfs_buf_item_zone);
- out_destroy_log_item_desc_zone:
- kmem_zone_destroy(xfs_log_item_desc_zone);
out_destroy_trans_zone:
kmem_zone_destroy(xfs_trans_zone);
out_destroy_ifork_zone:
@@ -2021,7 +2041,6 @@ xfs_destroy_zones(void)
kmem_zone_destroy(xfs_efi_zone);
kmem_zone_destroy(xfs_efd_zone);
kmem_zone_destroy(xfs_buf_item_zone);
- kmem_zone_destroy(xfs_log_item_desc_zone);
kmem_zone_destroy(xfs_trans_zone);
kmem_zone_destroy(xfs_ifork_zone);
kmem_zone_destroy(xfs_da_state_zone);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 5b66ac12913c..aed03da637d4 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -259,6 +259,7 @@ xfs_symlink(
* bmapi or the directory create code.
*/
xfs_defer_init(&dfops, &first_block);
+ tp->t_agfl_dfops = &dfops;
/*
* Allocate an inode for the symlink.
@@ -488,16 +489,11 @@ xfs_inactive_symlink_rmt(
error = xfs_defer_finish(&tp, &dfops);
if (error)
goto error_bmap_cancel;
- /*
- * The first xact was committed, so add the inode to the new one.
- * Mark it dirty so it will be logged and moved forward in the log as
- * part of every commit.
- */
- xfs_trans_ijoin(tp, ip, 0);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
/*
* Commit the transaction containing extent freeing and EFDs.
*/
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_trans_commit(tp);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 82afee005140..b53a33e69932 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -95,6 +95,7 @@ extern xfs_param_t xfs_params;
struct xfs_globals {
int log_recovery_delay; /* log recovery delay (secs) */
+ int mount_delay; /* mount setup delay (secs) */
bool bug_on_assert; /* BUG() the kernel on assert failure */
};
extern struct xfs_globals xfs_globals;
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 8b2ccc234f36..2d5cd2529f8e 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -165,9 +165,40 @@ log_recovery_delay_show(
}
XFS_SYSFS_ATTR_RW(log_recovery_delay);
+STATIC ssize_t
+mount_delay_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < 0 || val > 60)
+ return -EINVAL;
+
+ xfs_globals.mount_delay = val;
+
+ return count;
+}
+
+STATIC ssize_t
+mount_delay_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.mount_delay);
+}
+XFS_SYSFS_ATTR_RW(mount_delay);
+
static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(bug_on_assert),
ATTR_LIST(log_recovery_delay),
+ ATTR_LIST(mount_delay),
NULL,
};
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 8955254b900e..9d4c4ca24fe6 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -441,8 +441,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__field(unsigned, bli_recur)
__field(int, bli_refcount)
__field(unsigned, bli_flags)
- __field(void *, li_desc)
- __field(unsigned, li_flags)
+ __field(unsigned long, li_flags)
),
TP_fast_assign(
__entry->dev = bip->bli_buf->b_target->bt_dev;
@@ -455,12 +454,11 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
__entry->buf_lockval = bip->bli_buf->b_sema.count;
- __entry->li_desc = bip->bli_item.li_desc;
__entry->li_flags = bip->bli_item.li_flags;
),
TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
"lock %d flags %s recur %d refcount %d bliflags %s "
- "lidesc %p liflags %s",
+ "liflags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->buf_bno,
__entry->buf_len,
@@ -471,7 +469,6 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
__entry->bli_recur,
__entry->bli_refcount,
__print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS),
- __entry->li_desc,
__print_flags(__entry->li_flags, "|", XFS_LI_FLAGS))
)
@@ -1018,7 +1015,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class,
__field(dev_t, dev)
__field(void *, lip)
__field(uint, type)
- __field(uint, flags)
+ __field(unsigned long, flags)
__field(xfs_lsn_t, lsn)
),
TP_fast_assign(
@@ -1070,7 +1067,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class,
__field(dev_t, dev)
__field(void *, lip)
__field(uint, type)
- __field(uint, flags)
+ __field(unsigned long, flags)
__field(xfs_lsn_t, old_lsn)
__field(xfs_lsn_t, new_lsn)
),
@@ -1750,6 +1747,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__field(int, namelen)
__field(int, valuelen)
__field(xfs_dahash_t, hashval)
+ __field(int, flags)
__field(int, op_flags)
),
TP_fast_assign(
@@ -1760,10 +1758,11 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__entry->namelen = args->namelen;
__entry->valuelen = args->valuelen;
__entry->hashval = args->hashval;
+ __entry->flags = args->flags;
__entry->op_flags = args->op_flags;
),
TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
- "hashval 0x%x op_flags %s",
+ "hashval 0x%x flags %s op_flags %s",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->namelen,
@@ -1771,6 +1770,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class,
__entry->namelen,
__entry->valuelen,
__entry->hashval,
+ __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
__print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
)
@@ -2243,30 +2243,35 @@ struct xfs_defer_pending;
struct xfs_defer_ops;
DECLARE_EVENT_CLASS(xfs_defer_class,
- TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop),
- TP_ARGS(mp, dop),
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop,
+ unsigned long caller_ip),
+ TP_ARGS(mp, dop, caller_ip),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(void *, dop)
__field(char, committed)
__field(char, low)
+ __field(unsigned long, caller_ip)
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
__entry->dop = dop;
__entry->committed = dop->dop_committed;
__entry->low = dop->dop_low;
+ __entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d ops %p committed %d low %d",
+ TP_printk("dev %d:%d ops %p committed %d low %d, caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->dop,
__entry->committed,
- __entry->low)
+ __entry->low,
+ (char *)__entry->caller_ip)
)
#define DEFINE_DEFER_EVENT(name) \
DEFINE_EVENT(xfs_defer_class, name, \
- TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), \
- TP_ARGS(mp, dop))
+ TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, \
+ unsigned long caller_ip), \
+ TP_ARGS(mp, dop, caller_ip))
DECLARE_EVENT_CLASS(xfs_defer_error_class,
TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, int error),
@@ -2433,6 +2438,8 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred);
+DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer);
+DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred);
/* rmap tracepoints */
DECLARE_EVENT_CLASS(xfs_rmap_class,
@@ -3346,6 +3353,43 @@ TRACE_EVENT(xfs_trans_resv_calc,
__entry->logflags)
);
+DECLARE_EVENT_CLASS(xfs_trans_class,
+ TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip),
+ TP_ARGS(tp, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint32_t, tid)
+ __field(uint32_t, flags)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = tp->t_mountp->m_super->s_dev;
+ __entry->tid = 0;
+ if (tp->t_ticket)
+ __entry->tid = tp->t_ticket->t_tid;
+ __entry->flags = tp->t_flags;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d trans %x flags 0x%x caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->tid,
+ __entry->flags,
+ (char *)__entry->caller_ip)
+)
+
+#define DEFINE_TRANS_EVENT(name) \
+DEFINE_EVENT(xfs_trans_class, name, \
+ TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), \
+ TP_ARGS(tp, caller_ip))
+DEFINE_TRANS_EVENT(xfs_trans_alloc);
+DEFINE_TRANS_EVENT(xfs_trans_cancel);
+DEFINE_TRANS_EVENT(xfs_trans_commit);
+DEFINE_TRANS_EVENT(xfs_trans_dup);
+DEFINE_TRANS_EVENT(xfs_trans_free);
+DEFINE_TRANS_EVENT(xfs_trans_roll);
+DEFINE_TRANS_EVENT(xfs_trans_add_item);
+DEFINE_TRANS_EVENT(xfs_trans_free_items);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d6d8f9d129a7..fc7ba75b8b69 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -31,9 +31,9 @@
#include "xfs_log.h"
#include "xfs_trace.h"
#include "xfs_error.h"
+#include "xfs_defer.h"
kmem_zone_t *xfs_trans_zone;
-kmem_zone_t *xfs_log_item_desc_zone;
#if defined(CONFIG_TRACEPOINTS)
static void
@@ -79,6 +79,7 @@ xfs_trans_free(
xfs_extent_busy_sort(&tp->t_busy);
xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
+ trace_xfs_trans_free(tp, _RET_IP_);
atomic_dec(&tp->t_mountp->m_active_trans);
if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
sb_end_intwrite(tp->t_mountp->m_super);
@@ -94,11 +95,13 @@ xfs_trans_free(
* blocks. Locks and log items, however, are no inherited. They must
* be added to the new transaction explicitly.
*/
-STATIC xfs_trans_t *
+STATIC struct xfs_trans *
xfs_trans_dup(
- xfs_trans_t *tp)
+ struct xfs_trans *tp)
{
- xfs_trans_t *ntp;
+ struct xfs_trans *ntp;
+
+ trace_xfs_trans_dup(tp, _RET_IP_);
ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP);
@@ -127,6 +130,7 @@ xfs_trans_dup(
ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
tp->t_rtx_res = tp->t_rtx_res_used;
ntp->t_pflags = tp->t_pflags;
+ ntp->t_agfl_dfops = tp->t_agfl_dfops;
xfs_trans_dup_dqinfo(tp, ntp);
@@ -283,6 +287,8 @@ xfs_trans_alloc(
return error;
}
+ trace_xfs_trans_alloc(tp, _RET_IP_);
+
*tpp = tp;
return 0;
}
@@ -727,73 +733,52 @@ out:
return;
}
-/*
- * Add the given log item to the transaction's list of log items.
- *
- * The log item will now point to its new descriptor with its li_desc field.
- */
+/* Add the given log item to the transaction's list of log items. */
void
xfs_trans_add_item(
struct xfs_trans *tp,
struct xfs_log_item *lip)
{
- struct xfs_log_item_desc *lidp;
-
ASSERT(lip->li_mountp == tp->t_mountp);
ASSERT(lip->li_ailp == tp->t_mountp->m_ail);
+ ASSERT(list_empty(&lip->li_trans));
+ ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags));
- lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS);
-
- lidp->lid_item = lip;
- lidp->lid_flags = 0;
- list_add_tail(&lidp->lid_trans, &tp->t_items);
-
- lip->li_desc = lidp;
-}
-
-STATIC void
-xfs_trans_free_item_desc(
- struct xfs_log_item_desc *lidp)
-{
- list_del_init(&lidp->lid_trans);
- kmem_zone_free(xfs_log_item_desc_zone, lidp);
+ list_add_tail(&lip->li_trans, &tp->t_items);
+ trace_xfs_trans_add_item(tp, _RET_IP_);
}
/*
- * Unlink and free the given descriptor.
+ * Unlink the log item from the transaction. The log item is no longer
+ * considered dirty in this transaction, as the linked transaction has
+ * finished, either by abort or commit completion.
*/
void
xfs_trans_del_item(
struct xfs_log_item *lip)
{
- xfs_trans_free_item_desc(lip->li_desc);
- lip->li_desc = NULL;
+ clear_bit(XFS_LI_DIRTY, &lip->li_flags);
+ list_del_init(&lip->li_trans);
}
-/*
- * Unlock all of the items of a transaction and free all the descriptors
- * of that transaction.
- */
+/* Detach and unlock all of the items in a transaction */
void
xfs_trans_free_items(
struct xfs_trans *tp,
xfs_lsn_t commit_lsn,
bool abort)
{
- struct xfs_log_item_desc *lidp, *next;
-
- list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) {
- struct xfs_log_item *lip = lidp->lid_item;
+ struct xfs_log_item *lip, *next;
- lip->li_desc = NULL;
+ trace_xfs_trans_free_items(tp, _RET_IP_);
+ list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
+ xfs_trans_del_item(lip);
if (commit_lsn != NULLCOMMITLSN)
lip->li_ops->iop_committing(lip, commit_lsn);
if (abort)
- lip->li_flags |= XFS_LI_ABORTED;
+ set_bit(XFS_LI_ABORTED, &lip->li_flags);
lip->li_ops->iop_unlock(lip);
-
- xfs_trans_free_item_desc(lidp);
}
}
@@ -861,7 +846,7 @@ xfs_trans_committed_bulk(
xfs_lsn_t item_lsn;
if (aborted)
- lip->li_flags |= XFS_LI_ABORTED;
+ set_bit(XFS_LI_ABORTED, &lip->li_flags);
item_lsn = lip->li_ops->iop_committed(lip, commit_lsn);
/* item_lsn of -1 means the item needs no further processing */
@@ -936,6 +921,11 @@ __xfs_trans_commit(
int error = 0;
int sync = tp->t_flags & XFS_TRANS_SYNC;
+ ASSERT(!tp->t_agfl_dfops ||
+ !xfs_defer_has_unfinished_work(tp->t_agfl_dfops) || regrant);
+
+ trace_xfs_trans_commit(tp, _RET_IP_);
+
/*
* If there is nothing to be logged by the transaction,
* then unlock all of the items associated with the
@@ -991,6 +981,7 @@ out_unreserve:
commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
if (commit_lsn == -1 && !error)
error = -EIO;
+ tp->t_ticket = NULL;
}
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
@@ -1022,6 +1013,8 @@ xfs_trans_cancel(
struct xfs_mount *mp = tp->t_mountp;
bool dirty = (tp->t_flags & XFS_TRANS_DIRTY);
+ trace_xfs_trans_cancel(tp, _RET_IP_);
+
/*
* See if the caller is relying on us to shut down the
* filesystem. This happens in paths where we detect
@@ -1033,17 +1026,19 @@ xfs_trans_cancel(
}
#ifdef DEBUG
if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) {
- struct xfs_log_item_desc *lidp;
+ struct xfs_log_item *lip;
- list_for_each_entry(lidp, &tp->t_items, lid_trans)
- ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD));
+ list_for_each_entry(lip, &tp->t_items, li_trans)
+ ASSERT(!(lip->li_type == XFS_LI_EFD));
}
#endif
xfs_trans_unreserve_and_mod_sb(tp);
xfs_trans_unreserve_and_mod_dquots(tp);
- if (tp->t_ticket)
+ if (tp->t_ticket) {
xfs_log_done(mp, tp->t_ticket, NULL, false);
+ tp->t_ticket = NULL;
+ }
/* mark this thread as no longer being in a transaction */
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
@@ -1067,6 +1062,8 @@ xfs_trans_roll(
struct xfs_trans_res tres;
int error;
+ trace_xfs_trans_roll(trans, _RET_IP_);
+
/*
* Copy the critical parameters from one trans to the next.
*/
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9d542dfe0052..29706b8b3bd4 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -27,7 +27,6 @@ struct xfs_efi_log_item;
struct xfs_inode;
struct xfs_item_ops;
struct xfs_log_iovec;
-struct xfs_log_item_desc;
struct xfs_mount;
struct xfs_trans;
struct xfs_trans_res;
@@ -43,12 +42,12 @@ struct xfs_bud_log_item;
typedef struct xfs_log_item {
struct list_head li_ail; /* AIL pointers */
+ struct list_head li_trans; /* transaction list */
xfs_lsn_t li_lsn; /* last on-disk lsn */
- struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
struct xfs_mount *li_mountp; /* ptr to fs mount */
struct xfs_ail *li_ailp; /* ptr to AIL */
uint li_type; /* item type */
- uint li_flags; /* misc flags */
+ unsigned long li_flags; /* misc flags */
struct xfs_buf *li_buf; /* real buffer pointer */
struct list_head li_bio_list; /* buffer item list */
void (*li_cb)(struct xfs_buf *,
@@ -64,14 +63,21 @@ typedef struct xfs_log_item {
xfs_lsn_t li_seq; /* CIL commit seq */
} xfs_log_item_t;
-#define XFS_LI_IN_AIL 0x1
-#define XFS_LI_ABORTED 0x2
-#define XFS_LI_FAILED 0x4
+/*
+ * li_flags use the (set/test/clear)_bit atomic interfaces because updates can
+ * race with each other and we don't want to have to use the AIL lock to
+ * serialise all updates.
+ */
+#define XFS_LI_IN_AIL 0
+#define XFS_LI_ABORTED 1
+#define XFS_LI_FAILED 2
+#define XFS_LI_DIRTY 3 /* log item dirty in transaction */
#define XFS_LI_FLAGS \
- { XFS_LI_IN_AIL, "IN_AIL" }, \
- { XFS_LI_ABORTED, "ABORTED" }, \
- { XFS_LI_FAILED, "FAILED" }
+ { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \
+ { (1 << XFS_LI_ABORTED), "ABORTED" }, \
+ { (1 << XFS_LI_FAILED), "FAILED" }, \
+ { (1 << XFS_LI_DIRTY), "DIRTY" }
struct xfs_item_ops {
void (*iop_size)(xfs_log_item_t *, int *, int *);
@@ -111,6 +117,7 @@ typedef struct xfs_trans {
struct xlog_ticket *t_ticket; /* log mgr ticket */
struct xfs_mount *t_mountp; /* ptr to fs mount struct */
struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
+ struct xfs_defer_ops *t_agfl_dfops; /* optional agfl fixup dfops */
unsigned int t_flags; /* misc flags */
int64_t t_icount_delta; /* superblock icount change */
int64_t t_ifree_delta; /* superblock ifree change */
@@ -228,7 +235,8 @@ struct xfs_efd_log_item *xfs_trans_get_efd(struct xfs_trans *,
uint);
int xfs_trans_free_extent(struct xfs_trans *,
struct xfs_efd_log_item *, xfs_fsblock_t,
- xfs_extlen_t, struct xfs_owner_info *);
+ xfs_extlen_t, struct xfs_owner_info *,
+ bool);
int xfs_trans_commit(struct xfs_trans *);
int xfs_trans_roll(struct xfs_trans **);
int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *);
@@ -242,7 +250,6 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
struct xfs_buf *src_bp);
extern kmem_zone_t *xfs_trans_zone;
-extern kmem_zone_t *xfs_log_item_desc_zone;
/* rmap updates */
enum xfs_rmap_intent_type;
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index d4a2445215e6..41e280ef1483 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -32,30 +32,51 @@
#ifdef DEBUG
/*
* Check that the list is sorted as it should be.
+ *
+ * Called with the AIL lock held, but we don't want to assert fail with it
+ * held otherwise we'll lock everything up and won't be able to debug the
+ * cause. Hence we sample and check the state under the AIL lock and return if
+ * everything is fine, otherwise we drop the lock and run the ASSERT checks.
+ * Asserts may not be fatal, so pick the lock back up and continue onwards.
*/
STATIC void
xfs_ail_check(
- struct xfs_ail *ailp,
- xfs_log_item_t *lip)
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
{
- xfs_log_item_t *prev_lip;
+ struct xfs_log_item *prev_lip;
+ struct xfs_log_item *next_lip;
+ xfs_lsn_t prev_lsn = NULLCOMMITLSN;
+ xfs_lsn_t next_lsn = NULLCOMMITLSN;
+ xfs_lsn_t lsn;
+ bool in_ail;
+
if (list_empty(&ailp->ail_head))
return;
/*
- * Check the next and previous entries are valid.
+ * Sample then check the next and previous entries are valid.
*/
- ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
- prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
+ in_ail = test_bit(XFS_LI_IN_AIL, &lip->li_flags);
+ prev_lip = list_entry(lip->li_ail.prev, struct xfs_log_item, li_ail);
if (&prev_lip->li_ail != &ailp->ail_head)
- ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
-
- prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
- if (&prev_lip->li_ail != &ailp->ail_head)
- ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
+ prev_lsn = prev_lip->li_lsn;
+ next_lip = list_entry(lip->li_ail.next, struct xfs_log_item, li_ail);
+ if (&next_lip->li_ail != &ailp->ail_head)
+ next_lsn = next_lip->li_lsn;
+ lsn = lip->li_lsn;
+ if (in_ail &&
+ (prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0) &&
+ (next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0))
+ return;
+ spin_unlock(&ailp->ail_lock);
+ ASSERT(in_ail);
+ ASSERT(prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0);
+ ASSERT(next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0);
+ spin_lock(&ailp->ail_lock);
}
#else /* !DEBUG */
#define xfs_ail_check(a,l)
@@ -684,7 +705,7 @@ xfs_trans_ail_update_bulk(
for (i = 0; i < nr_items; i++) {
struct xfs_log_item *lip = log_items[i];
- if (lip->li_flags & XFS_LI_IN_AIL) {
+ if (test_and_set_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
/* check if we really need to move the item */
if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
continue;
@@ -694,7 +715,6 @@ xfs_trans_ail_update_bulk(
if (mlip == lip)
mlip_changed = 1;
} else {
- lip->li_flags |= XFS_LI_IN_AIL;
trace_xfs_ail_insert(lip, 0, lsn);
}
lip->li_lsn = lsn;
@@ -725,7 +745,7 @@ xfs_ail_delete_one(
trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
xfs_clear_li_failed(lip);
- lip->li_flags &= ~XFS_LI_IN_AIL;
+ clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
lip->li_lsn = 0;
return mlip == lip;
@@ -761,7 +781,7 @@ xfs_trans_ail_delete(
struct xfs_mount *mp = ailp->ail_mount;
bool mlip_changed;
- if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+ if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
if (!XFS_FORCED_SHUTDOWN(mp)) {
xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
index 14543d93cd4b..230a21df4b12 100644
--- a/fs/xfs/xfs_trans_bmap.c
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -79,7 +79,7 @@ xfs_trans_log_finish_bmap_update(
* 2.) shuts down the filesystem
*/
tp->t_flags |= XFS_TRANS_DIRTY;
- budp->bud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
return error;
}
@@ -158,7 +158,7 @@ xfs_bmap_update_log_item(
bmap = container_of(item, struct xfs_bmap_intent, bi_list);
tp->t_flags |= XFS_TRANS_DIRTY;
- buip->bui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
/*
* atomic_inc_return gives us the value after the increment;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index a5d9dfc45d98..a8ddb4eed279 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -40,7 +40,7 @@ xfs_trans_buf_item_match(
struct xfs_buf_map *map,
int nmaps)
{
- struct xfs_log_item_desc *lidp;
+ struct xfs_log_item *lip;
struct xfs_buf_log_item *blip;
int len = 0;
int i;
@@ -48,8 +48,8 @@ xfs_trans_buf_item_match(
for (i = 0; i < nmaps; i++)
len += map[i].bm_len;
- list_for_each_entry(lidp, &tp->t_items, lid_trans) {
- blip = (struct xfs_buf_log_item *)lidp->lid_item;
+ list_for_each_entry(lip, &tp->t_items, li_trans) {
+ blip = (struct xfs_buf_log_item *)lip;
if (blip->bli_item.li_type == XFS_LI_BUF &&
blip->bli_buf->b_target == target &&
XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn &&
@@ -100,14 +100,10 @@ _xfs_trans_bjoin(
atomic_inc(&bip->bli_refcount);
/*
- * Get a log_item_desc to point at the new item.
+ * Attach the item to the transaction so we can find it in
+ * xfs_trans_get_buf() and friends.
*/
xfs_trans_add_item(tp, &bip->bli_item);
-
- /*
- * Initialize b_fsprivate2 so we can find it with incore_match()
- * in xfs_trans_get_buf() and friends above.
- */
bp->b_transp = tp;
}
@@ -391,7 +387,7 @@ xfs_trans_brelse(
* If the buffer is dirty within this transaction, we can't
* release it until we commit.
*/
- if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY)
+ if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags))
return;
/*
@@ -442,7 +438,7 @@ xfs_trans_brelse(
ASSERT(bp->b_pincount == 0);
***/
ASSERT(atomic_read(&bip->bli_refcount) == 0);
- ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
+ ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
xfs_buf_item_relse(bp);
}
@@ -542,7 +538,7 @@ xfs_trans_dirty_buf(
bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
tp->t_flags |= XFS_TRANS_DIRTY;
- bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
}
/*
@@ -626,7 +622,7 @@ xfs_trans_binval(
ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF));
ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK));
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
- ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY);
+ ASSERT(test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags));
ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
return;
}
@@ -642,7 +638,7 @@ xfs_trans_binval(
memset(bip->bli_formats[i].blf_data_map, 0,
(bip->bli_formats[i].blf_map_size * sizeof(uint)));
}
- bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
tp->t_flags |= XFS_TRANS_DIRTY;
}
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index c3d547211d16..c381c02cca45 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -77,7 +77,7 @@ xfs_trans_log_dquot(
ASSERT(XFS_DQ_IS_LOCKED(dqp));
tp->t_flags |= XFS_TRANS_DIRTY;
- dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags);
}
/*
@@ -879,7 +879,7 @@ xfs_trans_log_quotaoff_item(
xfs_qoff_logitem_t *qlp)
{
tp->t_flags |= XFS_TRANS_DIRTY;
- qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags);
}
STATIC void
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index ab438647592a..2f44a08bdf65 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -68,7 +68,8 @@ xfs_trans_free_extent(
struct xfs_efd_log_item *efdp,
xfs_fsblock_t start_block,
xfs_extlen_t ext_len,
- struct xfs_owner_info *oinfo)
+ struct xfs_owner_info *oinfo,
+ bool skip_discard)
{
struct xfs_mount *mp = tp->t_mountp;
uint next_extent;
@@ -79,9 +80,8 @@ xfs_trans_free_extent(
trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len);
- error = xfs_free_extent(tp, start_block, ext_len, oinfo,
- XFS_AG_RESV_NONE);
-
+ error = __xfs_free_extent(tp, start_block, ext_len,
+ oinfo, XFS_AG_RESV_NONE, skip_discard);
/*
* Mark the transaction dirty, even on error. This ensures the
* transaction is aborted, which:
@@ -90,7 +90,7 @@ xfs_trans_free_extent(
* 2.) shuts down the filesystem
*/
tp->t_flags |= XFS_TRANS_DIRTY;
- efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
next_extent = efdp->efd_next_extent;
ASSERT(next_extent < efdp->efd_format.efd_nextents);
@@ -155,7 +155,7 @@ xfs_extent_free_log_item(
free = container_of(item, struct xfs_extent_free_item, xefi_list);
tp->t_flags |= XFS_TRANS_DIRTY;
- efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
/*
* atomic_inc_return gives us the value after the increment;
@@ -195,7 +195,7 @@ xfs_extent_free_finish_item(
error = xfs_trans_free_extent(tp, done_item,
free->xefi_startblock,
free->xefi_blockcount,
- &free->xefi_oinfo);
+ &free->xefi_oinfo, free->xefi_skip_discard);
kmem_free(free);
return error;
}
@@ -231,9 +231,79 @@ static const struct xfs_defer_op_type xfs_extent_free_defer_type = {
.cancel_item = xfs_extent_free_cancel_item,
};
+/*
+ * AGFL blocks are accounted differently in the reserve pools and are not
+ * inserted into the busy extent list.
+ */
+STATIC int
+xfs_agfl_free_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_defer_ops *dop,
+ struct list_head *item,
+ void *done_item,
+ void **state)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_efd_log_item *efdp = done_item;
+ struct xfs_extent_free_item *free;
+ struct xfs_extent *extp;
+ struct xfs_buf *agbp;
+ int error;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ uint next_extent;
+
+ free = container_of(item, struct xfs_extent_free_item, xefi_list);
+ ASSERT(free->xefi_blockcount == 1);
+ agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock);
+ agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock);
+
+ trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount);
+
+ error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
+ if (!error)
+ error = xfs_free_agfl_block(tp, agno, agbno, agbp,
+ &free->xefi_oinfo);
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the EFI and frees the EFD
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+ next_extent = efdp->efd_next_extent;
+ ASSERT(next_extent < efdp->efd_format.efd_nextents);
+ extp = &(efdp->efd_format.efd_extents[next_extent]);
+ extp->ext_start = free->xefi_startblock;
+ extp->ext_len = free->xefi_blockcount;
+ efdp->efd_next_extent++;
+
+ kmem_free(free);
+ return error;
+}
+
+
+/* sub-type with special handling for AGFL deferred frees */
+static const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
+ .type = XFS_DEFER_OPS_TYPE_AGFL_FREE,
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .diff_items = xfs_extent_free_diff_items,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .log_item = xfs_extent_free_log_item,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_agfl_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+};
+
/* Register the deferred op type. */
void
xfs_extent_free_init_defer_op(void)
{
xfs_defer_init_op_type(&xfs_extent_free_defer_type);
+ xfs_defer_init_op_type(&xfs_agfl_free_defer_type);
}
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 07cea592dc01..f7bd7960a90f 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -133,14 +133,13 @@ xfs_trans_log_inode(
* set however, then go ahead and bump the i_version counter
* unconditionally.
*/
- if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
+ if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) &&
IS_I_VERSION(VFS_I(ip))) {
if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE))
flags |= XFS_ILOG_CORE;
}
tp->t_flags |= XFS_TRANS_DIRTY;
- ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY;
/*
* Always OR in the bits from the ili_last_fields field.
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index be24b0c8a332..9717ae74b36d 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -19,7 +19,6 @@
#define __XFS_TRANS_PRIV_H__
struct xfs_log_item;
-struct xfs_log_item_desc;
struct xfs_mount;
struct xfs_trans;
struct xfs_ail;
@@ -119,7 +118,7 @@ xfs_trans_ail_remove(
spin_lock(&ailp->ail_lock);
/* xfs_trans_ail_delete() drops the AIL lock */
- if (lip->li_flags & XFS_LI_IN_AIL)
+ if (test_bit(XFS_LI_IN_AIL, &lip->li_flags))
xfs_trans_ail_delete(ailp, lip, shutdown_type);
else
spin_unlock(&ailp->ail_lock);
@@ -171,11 +170,10 @@ xfs_clear_li_failed(
{
struct xfs_buf *bp = lip->li_buf;
- ASSERT(lip->li_flags & XFS_LI_IN_AIL);
+ ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
lockdep_assert_held(&lip->li_ailp->ail_lock);
- if (lip->li_flags & XFS_LI_FAILED) {
- lip->li_flags &= ~XFS_LI_FAILED;
+ if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) {
lip->li_buf = NULL;
xfs_buf_rele(bp);
}
@@ -188,9 +186,8 @@ xfs_set_li_failed(
{
lockdep_assert_held(&lip->li_ailp->ail_lock);
- if (!(lip->li_flags & XFS_LI_FAILED)) {
+ if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) {
xfs_buf_hold(bp);
- lip->li_flags |= XFS_LI_FAILED;
lip->li_buf = bp;
}
}
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
index 94c1877af834..c7f8e82f5bda 100644
--- a/fs/xfs/xfs_trans_refcount.c
+++ b/fs/xfs/xfs_trans_refcount.c
@@ -77,7 +77,7 @@ xfs_trans_log_finish_refcount_update(
* 2.) shuts down the filesystem
*/
tp->t_flags |= XFS_TRANS_DIRTY;
- cudp->cud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
return error;
}
@@ -154,7 +154,7 @@ xfs_refcount_update_log_item(
refc = container_of(item, struct xfs_refcount_intent, ri_list);
tp->t_flags |= XFS_TRANS_DIRTY;
- cuip->cui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
/*
* atomic_inc_return gives us the value after the increment;
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
index 9b577beb43d7..5831ca0c270b 100644
--- a/fs/xfs/xfs_trans_rmap.c
+++ b/fs/xfs/xfs_trans_rmap.c
@@ -117,7 +117,7 @@ xfs_trans_log_finish_rmap_update(
* 2.) shuts down the filesystem
*/
tp->t_flags |= XFS_TRANS_DIRTY;
- rudp->rud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
return error;
}
@@ -175,7 +175,7 @@ xfs_rmap_update_log_item(
rmap = container_of(item, struct xfs_rmap_intent, ri_list);
tp->t_flags |= XFS_TRANS_DIRTY;
- ruip->rui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+ set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
/*
* atomic_inc_return gives us the value after the increment;