summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Makefile1
-rw-r--r--fs/affs/amigaffs.c27
-rw-r--r--fs/affs/file.c26
-rw-r--r--fs/afs/fs_probe.c4
-rw-r--r--fs/afs/inode.c47
-rw-r--r--fs/afs/internal.h15
-rw-r--r--fs/afs/proc.c5
-rw-r--r--fs/afs/vl_list.c1
-rw-r--r--fs/afs/vl_probe.c82
-rw-r--r--fs/afs/vl_rotate.c7
-rw-r--r--fs/afs/write.c11
-rw-r--r--fs/aio.c8
-rw-r--r--fs/autofs/waitq.c2
-rw-r--r--fs/btrfs/Kconfig1
-rw-r--r--fs/btrfs/backref.c1
-rw-r--r--fs/btrfs/block-group.c70
-rw-r--r--fs/btrfs/btrfs_inode.h30
-rw-r--r--fs/btrfs/compression.c35
-rw-r--r--fs/btrfs/compression.h35
-rw-r--r--fs/btrfs/ctree.c210
-rw-r--r--fs/btrfs/ctree.h103
-rw-r--r--fs/btrfs/delalloc-space.c123
-rw-r--r--fs/btrfs/delayed-inode.c6
-rw-r--r--fs/btrfs/dev-replace.c118
-rw-r--r--fs/btrfs/disk-io.c170
-rw-r--r--fs/btrfs/disk-io.h9
-rw-r--r--fs/btrfs/extent-io-tree.h3
-rw-r--r--fs/btrfs/extent-tree.c227
-rw-r--r--fs/btrfs/extent_io.c224
-rw-r--r--fs/btrfs/extent_io.h29
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/file.c316
-rw-r--r--fs/btrfs/free-space-cache.c23
-rw-r--r--fs/btrfs/free-space-tree.c4
-rw-r--r--fs/btrfs/inode.c788
-rw-r--r--fs/btrfs/ioctl.c96
-rw-r--r--fs/btrfs/locking.c45
-rw-r--r--fs/btrfs/locking.h78
-rw-r--r--fs/btrfs/ordered-data.c113
-rw-r--r--fs/btrfs/ordered-data.h24
-rw-r--r--fs/btrfs/print-tree.c50
-rw-r--r--fs/btrfs/print-tree.h4
-rw-r--r--fs/btrfs/qgroup.c2
-rw-r--r--fs/btrfs/reada.c30
-rw-r--r--fs/btrfs/reflink.c46
-rw-r--r--fs/btrfs/relocation.c11
-rw-r--r--fs/btrfs/root-tree.c13
-rw-r--r--fs/btrfs/scrub.c130
-rw-r--r--fs/btrfs/send.c365
-rw-r--r--fs/btrfs/send.h1
-rw-r--r--fs/btrfs/space-info.c323
-rw-r--r--fs/btrfs/space-info.h2
-rw-r--r--fs/btrfs/struct-funcs.c10
-rw-r--r--fs/btrfs/super.c6
-rw-r--r--fs/btrfs/sysfs.c249
-rw-r--r--fs/btrfs/sysfs.h11
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c3
-rw-r--r--fs/btrfs/tests/inode-tests.c7
-rw-r--r--fs/btrfs/transaction.c16
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-checker.c19
-rw-r--r--fs/btrfs/tree-log.c284
-rw-r--r--fs/btrfs/tree-log.h32
-rw-r--r--fs/btrfs/volumes.c429
-rw-r--r--fs/btrfs/volumes.h11
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/cifs/smb2ops.c2
-rw-r--r--fs/compat.c132
-rw-r--r--fs/crypto/crypto.c4
-rw-r--r--fs/crypto/fname.c60
-rw-r--r--fs/crypto/fscrypt_private.h10
-rw-r--r--fs/crypto/hooks.c80
-rw-r--r--fs/crypto/inline_crypt.c7
-rw-r--r--fs/crypto/keyring.c9
-rw-r--r--fs/crypto/keysetup.c182
-rw-r--r--fs/crypto/keysetup_v1.c8
-rw-r--r--fs/crypto/policy.c209
-rw-r--r--fs/debugfs/file.c4
-rw-r--r--fs/direct-io.c19
-rw-r--r--fs/dlm/Kconfig1
-rw-r--r--fs/dlm/config.c66
-rw-r--r--fs/dlm/config.h4
-rw-r--r--fs/dlm/lowcomms.c329
-rw-r--r--fs/dlm/midcomms.c136
-rw-r--r--fs/dlm/midcomms.h3
-rw-r--r--fs/efivarfs/super.c3
-rw-r--r--fs/erofs/data.c2
-rw-r--r--fs/erofs/super.c2
-rw-r--r--fs/erofs/xattr.c2
-rw-r--r--fs/erofs/zdata.c48
-rw-r--r--fs/eventpoll.c78
-rw-r--r--fs/exfat/cache.c11
-rw-r--r--fs/exfat/exfat_fs.h3
-rw-r--r--fs/exfat/inode.c2
-rw-r--r--fs/exfat/namei.c13
-rw-r--r--fs/exfat/super.c5
-rw-r--r--fs/ext2/file.c6
-rw-r--r--fs/ext4/dir.c2
-rw-r--r--fs/ext4/ext4.h6
-rw-r--r--fs/ext4/ialloc.c119
-rw-r--r--fs/ext4/namei.c7
-rw-r--r--fs/ext4/super.c16
-rw-r--r--fs/f2fs/data.c3
-rw-r--r--fs/f2fs/dir.c6
-rw-r--r--fs/f2fs/f2fs.h25
-rw-r--r--fs/f2fs/namei.c7
-rw-r--r--fs/f2fs/node.c3
-rw-r--r--fs/f2fs/segment.c8
-rw-r--r--fs/f2fs/super.c15
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/fuse/file.c25
-rw-r--r--fs/internal.h3
-rw-r--r--fs/io_uring.c174
-rw-r--r--fs/namespace.c29
-rw-r--r--fs/nfs/dir.c3
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c43
-rw-r--r--fs/nfs/fs_context.c195
-rw-r--r--fs/nfs/nfs42proc.c10
-rw-r--r--fs/nfs/nfs4proc.c11
-rw-r--r--fs/pipe.c73
-rw-r--r--fs/proc/page.c3
-rw-r--r--fs/proc/task_mmu.c4
-rw-r--r--fs/quota/Kconfig5
-rw-r--r--fs/quota/Makefile1
-rw-r--r--fs/quota/compat.c120
-rw-r--r--fs/quota/compat.h34
-rw-r--r--fs/quota/quota.c73
-rw-r--r--fs/read_write.c370
-rw-r--r--fs/splice.c85
-rw-r--r--fs/ubifs/dir.c40
-rw-r--r--fs/vboxsf/super.c2
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c8
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c4
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h2
-rw-r--r--fs/xfs/xfs_bmap_util.c2
-rw-r--r--fs/xfs/xfs_file.c12
137 files changed, 4463 insertions, 3716 deletions
diff --git a/fs/Makefile b/fs/Makefile
index 1c7b0e3f6daa..d72ee2ce7af0 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -37,7 +37,6 @@ obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FS_ENCRYPTION) += crypto/
obj-$(CONFIG_FS_VERITY) += verity/
obj-$(CONFIG_FILE_LOCKING) += locks.o
-obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index f708c45d5f66..29f11e10a7c7 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -420,24 +420,51 @@ affs_mode_to_prot(struct inode *inode)
u32 prot = AFFS_I(inode)->i_protect;
umode_t mode = inode->i_mode;
+ /*
+ * First, clear all RWED bits for owner, group, other.
+ * Then, recalculate them afresh.
+ *
+ * We'll always clear the delete-inhibit bit for the owner, as that is
+ * the classic single-user mode AmigaOS protection bit and we need to
+ * stay compatible with all scenarios.
+ *
+ * Since multi-user AmigaOS is an extension, we'll only set the
+ * delete-allow bit if any of the other bits in the same user class
+ * (group/other) are used.
+ */
+ prot &= ~(FIBF_NOEXECUTE | FIBF_NOREAD
+ | FIBF_NOWRITE | FIBF_NODELETE
+ | FIBF_GRP_EXECUTE | FIBF_GRP_READ
+ | FIBF_GRP_WRITE | FIBF_GRP_DELETE
+ | FIBF_OTR_EXECUTE | FIBF_OTR_READ
+ | FIBF_OTR_WRITE | FIBF_OTR_DELETE);
+
+ /* Classic single-user AmigaOS flags. These are inverted. */
if (!(mode & 0100))
prot |= FIBF_NOEXECUTE;
if (!(mode & 0400))
prot |= FIBF_NOREAD;
if (!(mode & 0200))
prot |= FIBF_NOWRITE;
+
+ /* Multi-user extended flags. Not inverted. */
if (mode & 0010)
prot |= FIBF_GRP_EXECUTE;
if (mode & 0040)
prot |= FIBF_GRP_READ;
if (mode & 0020)
prot |= FIBF_GRP_WRITE;
+ if (mode & 0070)
+ prot |= FIBF_GRP_DELETE;
+
if (mode & 0001)
prot |= FIBF_OTR_EXECUTE;
if (mode & 0004)
prot |= FIBF_OTR_READ;
if (mode & 0002)
prot |= FIBF_OTR_WRITE;
+ if (mode & 0007)
+ prot |= FIBF_OTR_DELETE;
AFFS_I(inode)->i_protect = prot;
}
diff --git a/fs/affs/file.c b/fs/affs/file.c
index a26a0f96c119..d91b0133d95d 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -429,6 +429,24 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
+static int affs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ int ret;
+
+ ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+ /* Clear Archived bit on file writes, as AmigaOS would do */
+ if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) {
+ AFFS_I(inode)->i_protect &= ~FIBF_ARCHIVED;
+ mark_inode_dirty(inode);
+ }
+
+ return ret;
+}
+
static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
{
return generic_block_bmap(mapping,block,affs_get_block);
@@ -438,7 +456,7 @@ const struct address_space_operations affs_aops = {
.readpage = affs_readpage,
.writepage = affs_writepage,
.write_begin = affs_write_begin,
- .write_end = generic_write_end,
+ .write_end = affs_write_end,
.direct_IO = affs_direct_IO,
.bmap = _affs_bmap
};
@@ -795,6 +813,12 @@ done:
if (tmp > inode->i_size)
inode->i_size = AFFS_I(inode)->mmu_private = tmp;
+ /* Clear Archived bit on file writes, as AmigaOS would do */
+ if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) {
+ AFFS_I(inode)->i_protect &= ~FIBF_ARCHIVED;
+ mark_inode_dirty(inode);
+ }
+
err_first_bh:
unlock_page(page);
put_page(page);
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index 5d9ef517cf81..e7e98ad63a91 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -161,8 +161,8 @@ responded:
}
}
- rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall);
- if (rtt_us < server->probe.rtt) {
+ if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
+ rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
server->rtt = rtt_us;
alist->preferred = index;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 1d13d2e882ad..0fe8844b4bee 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -810,14 +810,32 @@ void afs_evict_inode(struct inode *inode)
static void afs_setattr_success(struct afs_operation *op)
{
- struct inode *inode = &op->file[0].vnode->vfs_inode;
+ struct afs_vnode_param *vp = &op->file[0];
+ struct inode *inode = &vp->vnode->vfs_inode;
+ loff_t old_i_size = i_size_read(inode);
+
+ op->setattr.old_i_size = old_i_size;
+ afs_vnode_commit_status(op, vp);
+ /* inode->i_size has now been changed. */
+
+ if (op->setattr.attr->ia_valid & ATTR_SIZE) {
+ loff_t size = op->setattr.attr->ia_size;
+ if (size > old_i_size)
+ pagecache_isize_extended(inode, old_i_size, size);
+ }
+}
+
+static void afs_setattr_edit_file(struct afs_operation *op)
+{
+ struct afs_vnode_param *vp = &op->file[0];
+ struct inode *inode = &vp->vnode->vfs_inode;
- afs_vnode_commit_status(op, &op->file[0]);
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
- loff_t i_size = inode->i_size, size = op->setattr.attr->ia_size;
- if (size > i_size)
- pagecache_isize_extended(inode, i_size, size);
- truncate_pagecache(inode, size);
+ loff_t size = op->setattr.attr->ia_size;
+ loff_t i_size = op->setattr.old_i_size;
+
+ if (size < i_size)
+ truncate_pagecache(inode, size);
}
}
@@ -825,6 +843,7 @@ static const struct afs_operation_ops afs_setattr_operation = {
.issue_afs_rpc = afs_fs_setattr,
.issue_yfs_rpc = yfs_fs_setattr,
.success = afs_setattr_success,
+ .edit_dir = afs_setattr_edit_file,
};
/*
@@ -863,11 +882,16 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
if (S_ISREG(vnode->vfs_inode.i_mode))
filemap_write_and_wait(vnode->vfs_inode.i_mapping);
+ /* Prevent any new writebacks from starting whilst we do this. */
+ down_write(&vnode->validate_lock);
+
op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ?
afs_file_key(attr->ia_file) : NULL),
vnode->volume);
- if (IS_ERR(op))
- return PTR_ERR(op);
+ if (IS_ERR(op)) {
+ ret = PTR_ERR(op);
+ goto out_unlock;
+ }
afs_op_set_vnode(op, 0, vnode);
op->setattr.attr = attr;
@@ -880,5 +904,10 @@ int afs_setattr(struct dentry *dentry, struct iattr *attr)
op->file[0].update_ctime = 1;
op->ops = &afs_setattr_operation;
- return afs_do_sync_operation(op);
+ ret = afs_do_sync_operation(op);
+
+out_unlock:
+ up_write(&vnode->validate_lock);
+ _leave(" = %d", ret);
+ return ret;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 792ac711985e..e5f0446f27e5 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -401,22 +401,24 @@ struct afs_vlserver {
#define AFS_VLSERVER_FL_PROBED 0 /* The VL server has been probed */
#define AFS_VLSERVER_FL_PROBING 1 /* VL server is being probed */
#define AFS_VLSERVER_FL_IS_YFS 2 /* Server is YFS not AFS */
+#define AFS_VLSERVER_FL_RESPONDING 3 /* VL server is responding */
rwlock_t lock; /* Lock on addresses */
atomic_t usage;
+ unsigned int rtt; /* Server's current RTT in uS */
/* Probe state */
wait_queue_head_t probe_wq;
atomic_t probe_outstanding;
spinlock_t probe_lock;
struct {
- unsigned int rtt; /* RTT as ktime/64 */
+ unsigned int rtt; /* RTT in uS */
u32 abort_code;
short error;
- bool have_result;
- bool responded:1;
- bool is_yfs:1;
- bool not_yfs:1;
- bool local_failure:1;
+ unsigned short flags;
+#define AFS_VLSERVER_PROBE_RESPONDED 0x01 /* At least once response (may be abort) */
+#define AFS_VLSERVER_PROBE_IS_YFS 0x02 /* The peer appears to be YFS */
+#define AFS_VLSERVER_PROBE_NOT_YFS 0x04 /* The peer appears not to be YFS */
+#define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */
} probe;
u16 port;
@@ -810,6 +812,7 @@ struct afs_operation {
} store;
struct {
struct iattr *attr;
+ loff_t old_i_size;
} setattr;
struct afs_acl *acl;
struct yfs_acl *yacl;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index e817fc740ba0..e8babb62ed44 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -310,6 +310,11 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
alist->preferred == i ? '>' : '-',
&alist->addrs[i].transport);
}
+ seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt);
+ seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n",
+ vlserver->probe.flags, vlserver->probe.error,
+ vlserver->probe.abort_code,
+ atomic_read(&vlserver->probe_outstanding));
return 0;
}
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
index 8fea54eba0c2..38b2ba1d9ec0 100644
--- a/fs/afs/vl_list.c
+++ b/fs/afs/vl_list.c
@@ -21,6 +21,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
rwlock_init(&vlserver->lock);
init_waitqueue_head(&vlserver->probe_wq);
spin_lock_init(&vlserver->probe_lock);
+ vlserver->rtt = UINT_MAX;
vlserver->name_len = name_len;
vlserver->port = port;
memcpy(vlserver->name, name, name_len);
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
index e3aa013c2177..d1c7068b4346 100644
--- a/fs/afs/vl_probe.c
+++ b/fs/afs/vl_probe.c
@@ -11,15 +11,33 @@
#include "internal.h"
#include "protocol_yfs.h"
-static bool afs_vl_probe_done(struct afs_vlserver *server)
+
+/*
+ * Handle the completion of a set of probes.
+ */
+static void afs_finished_vl_probe(struct afs_vlserver *server)
{
- if (!atomic_dec_and_test(&server->probe_outstanding))
- return false;
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) {
+ server->rtt = UINT_MAX;
+ clear_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags);
+ }
- wake_up_var(&server->probe_outstanding);
clear_bit_unlock(AFS_VLSERVER_FL_PROBING, &server->flags);
wake_up_bit(&server->flags, AFS_VLSERVER_FL_PROBING);
- return true;
+}
+
+/*
+ * Handle the completion of a probe RPC call.
+ */
+static void afs_done_one_vl_probe(struct afs_vlserver *server, bool wake_up)
+{
+ if (atomic_dec_and_test(&server->probe_outstanding)) {
+ afs_finished_vl_probe(server);
+ wake_up = true;
+ }
+
+ if (wake_up)
+ wake_up_all(&server->probe_wq);
}
/*
@@ -45,15 +63,20 @@ void afs_vlserver_probe_result(struct afs_call *call)
server->probe.error = 0;
goto responded;
case -ECONNABORTED:
- if (!server->probe.responded) {
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)) {
server->probe.abort_code = call->abort_code;
server->probe.error = ret;
}
goto responded;
case -ENOMEM:
case -ENONET:
- server->probe.local_failure = true;
- afs_io_error(call, afs_io_error_vl_probe_fail);
+ case -EKEYEXPIRED:
+ case -EKEYREVOKED:
+ case -EKEYREJECTED:
+ server->probe.flags |= AFS_VLSERVER_PROBE_LOCAL_FAILURE;
+ if (server->probe.error == 0)
+ server->probe.error = ret;
+ trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail);
goto out;
case -ECONNRESET: /* Responded, but call expired. */
case -ERFKILL:
@@ -67,12 +90,12 @@ void afs_vlserver_probe_result(struct afs_call *call)
default:
clear_bit(index, &alist->responded);
set_bit(index, &alist->failed);
- if (!server->probe.responded &&
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) &&
(server->probe.error == 0 ||
server->probe.error == -ETIMEDOUT ||
server->probe.error == -ETIME))
server->probe.error = ret;
- afs_io_error(call, afs_io_error_vl_probe_fail);
+ trace_afs_io_error(call->debug_id, ret, afs_io_error_vl_probe_fail);
goto out;
}
@@ -81,39 +104,36 @@ responded:
clear_bit(index, &alist->failed);
if (call->service_id == YFS_VL_SERVICE) {
- server->probe.is_yfs = true;
+ server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS;
set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
alist->addrs[index].srx_service = call->service_id;
} else {
- server->probe.not_yfs = true;
- if (!server->probe.is_yfs) {
+ server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS;
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) {
clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
alist->addrs[index].srx_service = call->service_id;
}
}
- rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall);
- if (rtt_us < server->probe.rtt) {
+ if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
+ rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
+ server->rtt = rtt_us;
alist->preferred = index;
- have_result = true;
}
smp_wmb(); /* Set rtt before responded. */
- server->probe.responded = true;
+ server->probe.flags |= AFS_VLSERVER_PROBE_RESPONDED;
set_bit(AFS_VLSERVER_FL_PROBED, &server->flags);
+ set_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags);
+ have_result = true;
out:
spin_unlock(&server->probe_lock);
_debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
server_index, index, &alist->addrs[index].transport, rtt_us, ret);
- have_result |= afs_vl_probe_done(server);
- if (have_result) {
- server->probe.have_result = true;
- wake_up_var(&server->probe.have_result);
- wake_up_all(&server->probe_wq);
- }
+ afs_done_one_vl_probe(server, have_result);
}
/*
@@ -151,11 +171,10 @@ static bool afs_do_probe_vlserver(struct afs_net *net,
in_progress = true;
} else {
afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code);
+ afs_done_one_vl_probe(server, false);
}
}
- if (!in_progress)
- afs_vl_probe_done(server);
return in_progress;
}
@@ -193,7 +212,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
{
struct wait_queue_entry *waits;
struct afs_vlserver *server;
- unsigned int rtt = UINT_MAX;
+ unsigned int rtt = UINT_MAX, rtt_s;
bool have_responders = false;
int pref = -1, i;
@@ -205,7 +224,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
server = vllist->servers[i].server;
if (!test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
__clear_bit(i, &untried);
- if (server->probe.responded)
+ if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)
have_responders = true;
}
}
@@ -231,7 +250,7 @@ int afs_wait_for_vl_probes(struct afs_vlserver_list *vllist,
for (i = 0; i < vllist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = vllist->servers[i].server;
- if (server->probe.responded)
+ if (server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED)
goto stop;
if (test_bit(AFS_VLSERVER_FL_PROBING, &server->flags))
still_probing = true;
@@ -249,10 +268,11 @@ stop:
for (i = 0; i < vllist->nr_servers; i++) {
if (test_bit(i, &untried)) {
server = vllist->servers[i].server;
- if (server->probe.responded &&
- server->probe.rtt < rtt) {
+ rtt_s = READ_ONCE(server->rtt);
+ if (test_bit(AFS_VLSERVER_FL_RESPONDING, &server->flags) &&
+ rtt_s < rtt) {
pref = i;
- rtt = server->probe.rtt;
+ rtt = rtt_s;
}
remove_wait_queue(&server->probe_wq, &waits[i]);
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index f405ca8b240a..c0458c903b31 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -192,7 +192,8 @@ pick_server:
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
- if (!test_bit(i, &vc->untried) || !s->probe.responded)
+ if (!test_bit(i, &vc->untried) ||
+ !test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
continue;
if (s->probe.rtt < rtt) {
vc->index = i;
@@ -262,10 +263,14 @@ no_more_servers:
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
+ if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
+ e.responded = true;
afs_prioritise_error(&e, READ_ONCE(s->probe.error),
s->probe.abort_code);
}
+ error = e.error;
+
failed_set_error:
vc->error = error;
failed:
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 4b2265cb1891..da12abd6db21 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -738,11 +738,21 @@ static int afs_writepages_region(struct address_space *mapping,
int afs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct afs_vnode *vnode = AFS_FS_I(mapping->host);
pgoff_t start, end, next;
int ret;
_enter("");
+ /* We have to be careful as we can end up racing with setattr()
+ * truncating the pagecache since the caller doesn't take a lock here
+ * to prevent it.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ down_read(&vnode->validate_lock);
+ else if (!down_read_trylock(&vnode->validate_lock))
+ return 0;
+
if (wbc->range_cyclic) {
start = mapping->writeback_index;
end = -1;
@@ -762,6 +772,7 @@ int afs_writepages(struct address_space *mapping,
ret = afs_writepages_region(mapping, wbc, start, end, &next);
}
+ up_read(&vnode->validate_lock);
_leave(" = %d", ret);
return ret;
}
diff --git a/fs/aio.c b/fs/aio.c
index d5ec30385566..c45c20d87538 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1489,12 +1489,8 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
*iovec = NULL;
return ret;
}
-#ifdef CONFIG_COMPAT
- if (compat)
- return compat_import_iovec(rw, buf, len, UIO_FASTIOV, iovec,
- iter);
-#endif
- return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
+
+ return __import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter, compat);
}
static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index 74c886f7c51c..5ced859dac53 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -53,7 +53,7 @@ static int autofs_write(struct autofs_sb_info *sbi,
mutex_lock(&sbi->pipe_mutex);
while (bytes) {
- wr = kernel_write(file, data, bytes, &file->f_pos);
+ wr = __kernel_write(file, data, bytes, NULL);
if (wr <= 0)
break;
data += wr;
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 575636f6491e..68b95ad82126 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -14,6 +14,7 @@ config BTRFS_FS
select LZO_DECOMPRESS
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
+ select FS_IOMAP
select RAID6_PQ
select XOR_BLOCKS
select SRCU
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ea1c28ccb44f..b3268f4ea5f3 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2997,7 +2997,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
while (!list_empty(&pending_edge)) {
struct btrfs_backref_node *upper;
struct btrfs_backref_node *lower;
- struct rb_node *rb_node;
edge = list_first_entry(&pending_edge,
struct btrfs_backref_edge, list[UPPER]);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 613920c17ac1..c0f1d6818df7 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1766,16 +1766,10 @@ static void link_block_group(struct btrfs_block_group *cache)
{
struct btrfs_space_info *space_info = cache->space_info;
int index = btrfs_bg_flags_to_raid_index(cache->flags);
- bool first = false;
down_write(&space_info->groups_sem);
- if (list_empty(&space_info->block_groups[index]))
- first = true;
list_add_tail(&cache->list, &space_info->block_groups[index]);
up_write(&space_info->groups_sem);
-
- if (first)
- btrfs_sysfs_add_block_group_type(cache);
}
static struct btrfs_block_group *btrfs_create_block_group_cache(
@@ -1798,7 +1792,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
cache->fs_info = fs_info;
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
- set_free_space_tree_thresholds(cache);
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
@@ -1874,7 +1867,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
return ret;
}
-static int read_block_group_item(struct btrfs_block_group *cache,
+static void read_block_group_item(struct btrfs_block_group *cache,
struct btrfs_path *path,
const struct btrfs_key *key)
{
@@ -1888,8 +1881,6 @@ static int read_block_group_item(struct btrfs_block_group *cache,
sizeof(bgi));
cache->used = btrfs_stack_block_group_used(&bgi);
cache->flags = btrfs_stack_block_group_flags(&bgi);
-
- return 0;
}
static int read_one_block_group(struct btrfs_fs_info *info,
@@ -1908,9 +1899,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
if (!cache)
return -ENOMEM;
- ret = read_block_group_item(cache, path, key);
- if (ret < 0)
- goto error;
+ read_block_group_item(cache, path, key);
+
+ set_free_space_tree_thresholds(cache);
if (need_clear) {
/*
@@ -2034,8 +2025,18 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
btrfs_release_path(path);
}
- rcu_read_lock();
- list_for_each_entry_rcu(space_info, &info->space_info, list) {
+ list_for_each_entry(space_info, &info->space_info, list) {
+ int i;
+
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ if (list_empty(&space_info->block_groups[i]))
+ continue;
+ cache = list_first_entry(&space_info->block_groups[i],
+ struct btrfs_block_group,
+ list);
+ btrfs_sysfs_add_block_group_type(cache);
+ }
+
if (!(btrfs_get_alloc_profile(info, space_info->flags) &
(BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID1_MASK |
@@ -2055,7 +2056,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
list)
inc_block_group_ro(cache, 1);
}
- rcu_read_unlock();
btrfs_init_global_block_rsv(info);
ret = check_chunk_block_group_mappings(info);
@@ -2096,12 +2096,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
return;
while (!list_empty(&trans->new_bgs)) {
+ int index;
+
block_group = list_first_entry(&trans->new_bgs,
struct btrfs_block_group,
bg_list);
if (ret)
goto next;
+ index = btrfs_bg_flags_to_raid_index(block_group->flags);
+
ret = insert_block_group_item(trans, block_group);
if (ret)
btrfs_abort_transaction(trans, ret);
@@ -2110,6 +2114,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
if (ret)
btrfs_abort_transaction(trans, ret);
add_block_group_free_space(trans, block_group);
+
+ /*
+ * If we restriped during balance, we may have added a new raid
+ * type, so now add the sysfs entries when it is safe to do so.
+ * We don't have to worry about locking here as it's handled in
+ * btrfs_sysfs_add_block_group_type.
+ */
+ if (block_group->space_info->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(block_group);
+
/* Already aborted the transaction if it failed. */
next:
btrfs_delayed_refs_rsv_release(fs_info, 1);
@@ -2132,6 +2146,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
return -ENOMEM;
cache->length = size;
+ set_free_space_tree_thresholds(cache);
cache->used = bytes_used;
cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
@@ -2783,7 +2798,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
* finished yet (no block group item in the extent tree
* yet, etc). If this is the case, wait for all free
* space endio workers to finish and retry. This is a
- * a very rare case so no need for a more efficient and
+ * very rare case so no need for a more efficient and
* complex approach.
*/
if (ret == -ENOENT) {
@@ -2959,6 +2974,13 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
space_info, -ram_bytes);
if (delalloc)
cache->delalloc_bytes += num_bytes;
+
+ /*
+ * Compression can use less space than we reserved, so wake
+ * tickets if that happens
+ */
+ if (num_bytes < ram_bytes)
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -2992,6 +3014,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
if (delalloc)
cache->delalloc_bytes -= num_bytes;
spin_unlock(&cache->lock);
+
+ btrfs_try_granting_tickets(cache->fs_info, space_info);
spin_unlock(&space_info->lock);
}
@@ -3000,12 +3024,10 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
+ list_for_each_entry(found, head, list) {
if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
found->force_alloc = CHUNK_ALLOC_FORCE;
}
- rcu_read_unlock();
}
static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
@@ -3336,14 +3358,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->block_group_cache_lock);
- /*
- * Now that all the block groups are freed, go through and free all the
- * space_info structs. This is only called during the final stages of
- * unmount, and so we know nobody is using them. We call
- * synchronize_rcu() once before we start, just to be on the safe side.
- */
- synchronize_rcu();
-
btrfs_release_global_block_rsv(info);
while (!list_empty(&info->space_info)) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c47b6c6fea9f..92dd86bceae3 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -21,14 +21,18 @@
* new data the application may have written before commit.
*/
enum {
- BTRFS_INODE_ORDERED_DATA_CLOSE,
+ BTRFS_INODE_FLUSH_ON_CLOSE,
BTRFS_INODE_DUMMY,
BTRFS_INODE_IN_DEFRAG,
BTRFS_INODE_HAS_ASYNC_EXTENT,
+ /*
+ * Always set under the VFS' inode lock, otherwise it can cause races
+ * during fsync (we start as a fast fsync and then end up in a full
+ * fsync racing with ordered extent completion).
+ */
BTRFS_INODE_NEEDS_FULL_SYNC,
BTRFS_INODE_COPY_EVERYTHING,
BTRFS_INODE_IN_DELALLOC_LIST,
- BTRFS_INODE_READDIO_NEED_LOCK,
BTRFS_INODE_HAS_PROPS,
BTRFS_INODE_SNAPSHOT_FLUSH,
};
@@ -212,6 +216,11 @@ struct btrfs_inode {
struct inode vfs_inode;
};
+static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
+{
+ return inode->root->fs_info->sectorsize;
+}
+
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
@@ -324,23 +333,6 @@ struct btrfs_dio_private {
u8 csums[];
};
-/*
- * Disable DIO read nolock optimization, so new dio readers will be forced
- * to grab i_mutex. It is used to avoid the endless truncate due to
- * nonlocked dio read.
- */
-static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode)
-{
- set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
- smp_mb();
-}
-
-static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode)
-{
- smp_mb__before_atomic();
- clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags);
-}
-
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1ab56a734e70..eeface30facd 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -29,41 +29,6 @@
#include "extent_io.h"
#include "extent_map.h"
-int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zlib_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-struct list_head *zlib_alloc_workspace(unsigned int level);
-void zlib_free_workspace(struct list_head *ws);
-struct list_head *zlib_get_workspace(unsigned int level);
-
-int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int lzo_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-struct list_head *lzo_alloc_workspace(unsigned int level);
-void lzo_free_workspace(struct list_head *ws);
-
-int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
- u64 start, struct page **pages, unsigned long *out_pages,
- unsigned long *total_in, unsigned long *total_out);
-int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
-int zstd_decompress(struct list_head *ws, unsigned char *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
- size_t destlen);
-void zstd_init_workspace_manager(void);
-void zstd_cleanup_workspace_manager(void);
-struct list_head *zstd_alloc_workspace(unsigned int level);
-void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_get_workspace(unsigned int level);
-void zstd_put_workspace(struct list_head *ws);
-
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
const char* btrfs_compress_type2str(enum btrfs_compression_type type)
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 9f3dbe372631..8001b700ea3a 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -144,4 +144,39 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len);
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
+int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *zlib_alloc_workspace(unsigned int level);
+void zlib_free_workspace(struct list_head *ws);
+struct list_head *zlib_get_workspace(unsigned int level);
+
+int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+struct list_head *lzo_alloc_workspace(unsigned int level);
+void lzo_free_workspace(struct list_head *ws);
+
+int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
+ u64 start, struct page **pages, unsigned long *out_pages,
+ unsigned long *total_in, unsigned long *total_out);
+int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
+int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+ struct page *dest_page, unsigned long start_byte, size_t srclen,
+ size_t destlen);
+void zstd_init_workspace_manager(void);
+void zstd_cleanup_workspace_manager(void);
+struct list_head *zstd_alloc_workspace(unsigned int level);
+void zstd_free_workspace(struct list_head *ws);
+struct list_head *zstd_get_workspace(unsigned int level);
+void zstd_put_workspace(struct list_head *ws);
+
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index cd1cd673bc0b..113da62dc17f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -198,7 +198,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
btrfs_node_key(buf, &disk_key, 0);
cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
- &disk_key, level, buf->start, 0);
+ &disk_key, level, buf->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -957,7 +958,8 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush(
const struct btrfs_disk_key *disk_key,
int level,
u64 hint,
- u64 empty_size)
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *ret;
@@ -986,7 +988,7 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush(
ret = btrfs_alloc_tree_block(trans, root, parent_start,
root->root_key.objectid, disk_key, level,
- hint, empty_size);
+ hint, empty_size, nest);
trans->can_flush_pending_bgs = true;
return ret;
@@ -1009,7 +1011,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size)
+ u64 search_start, u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_disk_key disk_key;
@@ -1040,7 +1043,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = parent->start;
cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
- level, search_start, empty_size);
+ level, search_start, empty_size, nest);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -1061,6 +1064,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1068,6 +1073,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1100,6 +1107,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (last_ref) {
ret = tree_mod_log_free_eb(buf);
if (ret) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1297,6 +1306,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
+ eb_rewin, btrfs_header_level(eb_rewin));
btrfs_tree_read_lock(eb_rewin);
__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
@@ -1370,7 +1381,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (!eb)
return NULL;
- btrfs_tree_read_lock(eb);
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
@@ -1378,6 +1388,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
btrfs_set_header_level(eb, old_root->level);
btrfs_set_header_generation(eb, old_generation);
}
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
+ btrfs_header_level(eb));
+ btrfs_tree_read_lock(eb);
if (tm)
__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
else
@@ -1442,7 +1455,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret)
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 search_start;
@@ -1481,7 +1495,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
*/
btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0);
+ parent_slot, cow_ret, search_start, 0, nest);
trace_btrfs_cow_block(root, buf, *cow_ret);
@@ -1653,7 +1667,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
- (end_slot - i) * blocksize));
+ (end_slot - i) * blocksize),
+ BTRFS_NESTING_COW);
if (err) {
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
@@ -1851,7 +1866,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(child);
btrfs_set_lock_blocking_write(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(child);
free_extent_buffer(child);
@@ -1887,10 +1903,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
left = NULL;
if (left) {
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
wret = btrfs_cow_block(trans, root, left,
- parent, pslot - 1, &left);
+ parent, pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -1902,10 +1919,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
right = NULL;
if (right) {
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
wret = btrfs_cow_block(trans, root, right,
- parent, pslot + 1, &right);
+ parent, pslot + 1, &right,
+ BTRFS_NESTING_RIGHT_COW);
if (wret) {
ret = wret;
goto enospc;
@@ -2065,7 +2083,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (left) {
u32 left_nr;
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
left_nr = btrfs_header_nritems(left);
@@ -2073,7 +2091,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
- pslot - 1, &left);
+ pslot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret)
wret = 1;
else {
@@ -2119,7 +2138,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
if (right) {
u32 right_nr;
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
right_nr = btrfs_header_nritems(right);
@@ -2128,7 +2147,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_cow_block(trans, root, right,
parent, pslot + 1,
- &right);
+ &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
wret = 1;
else {
@@ -2597,7 +2616,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
* We don't know the level of the root node until we actually
* have it read locked
*/
- b = btrfs_read_lock_root_node(root);
+ b = __btrfs_read_lock_root_node(root, p->recurse);
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
@@ -2736,11 +2755,13 @@ again:
btrfs_set_path_blocking(p);
if (last_level)
err = btrfs_cow_block(trans, root, b, NULL, 0,
- &b);
+ &b,
+ BTRFS_NESTING_COW);
else
err = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
- p->slots[level + 1], &b);
+ p->slots[level + 1], &b,
+ BTRFS_NESTING_COW);
if (err) {
ret = err;
goto done;
@@ -2871,7 +2892,8 @@ cow_done:
} else {
if (!btrfs_tree_read_lock_atomic(b)) {
btrfs_set_path_blocking(p);
- btrfs_tree_read_lock(b);
+ __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
+ p->recurse);
}
p->locks[level] = BTRFS_READ_LOCK;
}
@@ -3160,6 +3182,58 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
}
/*
+ * Check key order of two sibling extent buffers.
+ *
+ * Return true if something is wrong.
+ * Return false if everything is fine.
+ *
+ * Tree-checker only works inside one tree block, thus the following
+ * corruption can not be detected by tree-checker:
+ *
+ * Leaf @left | Leaf @right
+ * --------------------------------------------------------------
+ * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 |
+ *
+ * Key f6 in leaf @left itself is valid, but not valid when the next
+ * key in leaf @right is 7.
+ * This can only be checked at tree block merge time.
+ * And since tree checker has ensured all key order in each tree block
+ * is correct, we only need to bother the last key of @left and the first
+ * key of @right.
+ */
+static bool check_sibling_keys(struct extent_buffer *left,
+ struct extent_buffer *right)
+{
+ struct btrfs_key left_last;
+ struct btrfs_key right_first;
+ int level = btrfs_header_level(left);
+ int nr_left = btrfs_header_nritems(left);
+ int nr_right = btrfs_header_nritems(right);
+
+ /* No key to check in one of the tree blocks */
+ if (!nr_left || !nr_right)
+ return false;
+
+ if (level) {
+ btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_node_key_to_cpu(right, &right_first, 0);
+ } else {
+ btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
+ btrfs_item_key_to_cpu(right, &right_first, 0);
+ }
+
+ if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
+ btrfs_crit(left->fs_info,
+"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
+ left_last.objectid, left_last.type,
+ left_last.offset, right_first.objectid,
+ right_first.type, right_first.offset);
+ return true;
+ }
+ return false;
+}
+
+/*
* try to push data from one node into the next node left in the
* tree.
*
@@ -3203,6 +3277,12 @@ static int push_node_left(struct btrfs_trans_handle *trans,
} else
push_items = min(src_nritems - 8, push_items);
+ /* dst is the left eb, src is the middle eb */
+ if (check_sibling_keys(dst, src)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -3271,6 +3351,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
if (max_push < push_items)
push_items = max_push;
+ /* dst is the right eb, src is the middle eb */
+ if (check_sibling_keys(src, dst)) {
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
@@ -3327,7 +3413,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_node_key(lower, &lower_key, 0);
c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
- root->node->start, 0);
+ root->node->start, 0,
+ BTRFS_NESTING_NEW_ROOT);
if (IS_ERR(c))
return PTR_ERR(c);
@@ -3457,7 +3544,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_node_key(c, &disk_key, mid);
split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
- c->start, 0);
+ c->start, 0, BTRFS_NESTING_SPLIT);
if (IS_ERR(split))
return PTR_ERR(split);
@@ -3726,7 +3813,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(right))
return 1;
- btrfs_tree_lock(right);
+ __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
btrfs_set_lock_blocking_write(right);
free_space = btrfs_leaf_free_space(right);
@@ -3735,7 +3822,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
/* cow and double check */
ret = btrfs_cow_block(trans, root, right, upper,
- slot + 1, &right);
+ slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
if (ret)
goto out_unlock;
@@ -3747,6 +3834,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (left_nritems == 0)
goto out_unlock;
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return ret;
+ }
if (path->slots[0] == left_nritems && !empty) {
/* Key greater than all keys in the leaf, right neighbor has
* enough room for it and we're not emptying our leaf to delete
@@ -3959,7 +4052,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
if (IS_ERR(left))
return 1;
- btrfs_tree_lock(left);
+ __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
btrfs_set_lock_blocking_write(left);
free_space = btrfs_leaf_free_space(left);
@@ -3970,7 +4063,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
/* cow and double check */
ret = btrfs_cow_block(trans, root, left,
- path->nodes[1], slot - 1, &left);
+ path->nodes[1], slot - 1, &left,
+ BTRFS_NESTING_LEFT_COW);
if (ret) {
/* we hit -ENOSPC, but it isn't fatal here */
if (ret == -ENOSPC)
@@ -3984,6 +4078,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
goto out;
}
+ if (check_sibling_keys(left, right)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
return __push_leaf_left(path, min_data_size,
empty, left, free_space, right_nritems,
max_slot);
@@ -4232,8 +4330,18 @@ again:
else
btrfs_item_key(l, &disk_key, mid);
+ /*
+ * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
+ * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
+ * subclasses, which is 8 at the time of this patch, and we've maxed it
+ * out. In the future we could add a
+ * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
+ * use BTRFS_NESTING_NEW_ROOT.
+ */
right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
- l->start, 0);
+ l->start, 0, num_doubles ?
+ BTRFS_NESTING_NEW_ROOT :
+ BTRFS_NESTING_SPLIT);
if (IS_ERR(right))
return PTR_ERR(right);
@@ -4478,9 +4586,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
return ret;
path->slots[0]++;
- setup_items_for_insert(root, path, new_key, &item_size,
- item_size, item_size +
- sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, path, new_key, &item_size, 1);
leaf = path->nodes[0];
memcpy_extent_buffer(leaf,
btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -4653,14 +4759,20 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
}
}
-/*
- * this is a helper for btrfs_insert_empty_items, the main goal here is
- * to save stack depth by doing the bulk of the work in a function
- * that doesn't call btrfs_search_slot
+/**
+ * setup_items_for_insert - Helper called before inserting one or more items
+ * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
+ * in a function that doesn't call btrfs_search_slot
+ *
+ * @root: root we are inserting items to
+ * @path: points to the leaf/slot where we are going to insert new items
+ * @cpu_key: array of keys for items to be inserted
+ * @data_size: size of the body of each item we are going to insert
+ * @nr: size of @cpu_key/@data_size arrays
*/
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr)
+ int nr)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_item *item;
@@ -4671,6 +4783,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
struct extent_buffer *leaf;
int slot;
struct btrfs_map_token token;
+ u32 total_size;
+ u32 total_data = 0;
+
+ for (i = 0; i < nr; i++)
+ total_data += data_size[i];
+ total_size = total_data + (nr * sizeof(struct btrfs_item));
if (path->slots[0] == 0) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
@@ -4697,7 +4815,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
if (old_data < data_end) {
btrfs_print_leaf(leaf);
- btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
+ btrfs_crit(fs_info,
+ "item at slot %d with data offset %u beyond data end of leaf %u",
slot, old_data, data_end);
BUG();
}
@@ -4730,8 +4849,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
item = btrfs_item_nr(slot + i);
- btrfs_set_token_item_offset(&token, item, data_end - data_size[i]);
data_end -= data_size[i];
+ btrfs_set_token_item_offset(&token, item, data_end);
btrfs_set_token_item_size(&token, item, data_size[i]);
}
@@ -4773,8 +4892,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
slot = path->slots[0];
BUG_ON(slot < 0);
- setup_items_for_insert(root, path, cpu_key, data_size,
- total_data, total_size, nr);
+ setup_items_for_insert(root, path, cpu_key, data_size, nr);
return 0;
}
@@ -5111,7 +5229,7 @@ again:
slot--;
/*
* check this node pointer against the min_trans parameters.
- * If it is too old, old, skip to the next one.
+ * If it is too old, skip to the next one.
*/
while (slot < nritems) {
u64 gen;
@@ -5375,7 +5493,9 @@ again:
}
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
+ __btrfs_tree_read_lock(next,
+ BTRFS_NESTING_RIGHT,
+ path->recurse);
}
next_rw_lock = BTRFS_READ_LOCK;
}
@@ -5410,7 +5530,9 @@ again:
ret = btrfs_try_tree_read_lock(next);
if (!ret) {
btrfs_set_path_blocking(path);
- btrfs_tree_read_lock(next);
+ __btrfs_tree_read_lock(next,
+ BTRFS_NESTING_RIGHT,
+ path->recurse);
}
next_rw_lock = BTRFS_READ_LOCK;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9a72896bed2e..aac3d6f4e35b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -374,6 +374,7 @@ struct btrfs_path {
unsigned int search_commit_root:1;
unsigned int need_commit_sem:1;
unsigned int skip_release_on_error:1;
+ unsigned int recurse:1;
};
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
sizeof(struct btrfs_item))
@@ -494,7 +495,7 @@ enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_DONE = 2,
};
-void btrfs_init_async_reclaim_work(struct work_struct *work);
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
/* fs_info */
struct reloc_control;
@@ -541,11 +542,6 @@ enum {
/* Used to record internally whether fs has been frozen */
BTRFS_FS_FROZEN,
/*
- * Indicate that a whole-filesystem exclusive operation is running
- * (device replace, resize, device add/delete, balance)
- */
- BTRFS_FS_EXCL_OP,
- /*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
* Set and cleared while holding fs_info::balance_mutex.
@@ -565,6 +561,19 @@ enum {
BTRFS_FS_DISCARD_RUNNING,
};
+/*
+ * Exclusive operations (device replace, resize, device add/remove, balance)
+ */
+enum btrfs_exclusive_operation {
+ BTRFS_EXCLOP_NONE,
+ BTRFS_EXCLOP_BALANCE,
+ BTRFS_EXCLOP_DEV_ADD,
+ BTRFS_EXCLOP_DEV_REMOVE,
+ BTRFS_EXCLOP_DEV_REPLACE,
+ BTRFS_EXCLOP_RESIZE,
+ BTRFS_EXCLOP_SWAP_ACTIVATE,
+};
+
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
@@ -912,6 +921,7 @@ struct btrfs_fs_info {
/* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work;
+ struct work_struct async_data_reclaim_work;
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
@@ -935,6 +945,9 @@ struct btrfs_fs_info {
*/
int send_in_progress;
+ /* Type of exclusive operation running */
+ unsigned long exclusive_operation;
+
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
@@ -1181,24 +1194,40 @@ struct btrfs_root {
#endif
};
-struct btrfs_clone_extent_info {
+/*
+ * Structure that conveys information about an extent that is going to replace
+ * all the extents in a file range.
+ */
+struct btrfs_replace_extent_info {
u64 disk_offset;
u64 disk_len;
u64 data_offset;
u64 data_len;
u64 file_offset;
+ /* Pointer to a file extent item of type regular or prealloc. */
char *extent_buf;
- u32 item_size;
+ /*
+ * Set to true when attempting to replace a file range with a new extent
+ * described by this structure, set to false when attempting to clone an
+ * existing extent into a file range.
+ */
+ bool is_new_extent;
+ /* Meaningful only if is_new_extent is true. */
+ int qgroup_reserved;
+ /*
+ * Meaningful only if is_new_extent is true.
+ * Used to track how many extent items we have already inserted in a
+ * subvolume tree that refer to the extent described by this structure,
+ * so that we know when to create a new delayed ref or update an existing
+ * one.
+ */
+ int insertions;
};
struct btrfs_file_private {
void *filldir_buf;
};
-static inline u32 btrfs_inode_sectorsize(const struct inode *inode)
-{
- return btrfs_sb(inode->i_sb)->sectorsize;
-}
static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info)
{
@@ -1391,6 +1420,16 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token,
#define cpu_to_le8(v) (v)
#define __le8 u8
+static inline u8 get_unaligned_le8(const void *p)
+{
+ return *(u8 *)p;
+}
+
+static inline void put_unaligned_le8(u8 val, void *p)
+{
+ *(u8 *)p = val;
+}
+
#define read_eb_member(eb, ptr, type, member, result) (\
read_extent_buffer(eb, (char *)(result), \
((unsigned long)(ptr)) + \
@@ -1449,27 +1488,25 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
const type *p = page_address(eb->pages[0]); \
- u##bits res = le##bits##_to_cpu(p->member); \
- return res; \
+ return get_unaligned_le##bits(&p->member); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, \
u##bits val) \
{ \
type *p = page_address(eb->pages[0]); \
- p->member = cpu_to_le##bits(val); \
+ put_unaligned_le##bits(val, &p->member); \
}
#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const type *s) \
{ \
- return le##bits##_to_cpu(s->member); \
+ return get_unaligned_le##bits(&s->member); \
} \
static inline void btrfs_set_##name(type *s, u##bits val) \
{ \
- s->member = cpu_to_le##bits(val); \
+ put_unaligned_le##bits(val, &s->member); \
}
-
static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb,
struct btrfs_dev_item *s)
{
@@ -2524,7 +2561,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
- u64 empty_size);
+ u64 empty_size,
+ enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2592,6 +2630,8 @@ enum btrfs_reserve_flush_enum {
*
* Can be interruped by fatal signal.
*/
+ BTRFS_RESERVE_FLUSH_DATA,
+ BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
BTRFS_RESERVE_FLUSH_ALL,
/*
@@ -2619,7 +2659,7 @@ enum btrfs_flush_state {
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
@@ -2651,8 +2691,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
-struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level,
u64 min_trans);
@@ -2665,7 +2703,8 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret);
+ struct extent_buffer **cow_ret,
+ enum btrfs_lock_nesting nest);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
@@ -2713,7 +2752,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_key *cpu_key, u32 *data_size,
- u32 total_data, u32 total_size, int nr);
+ int nr);
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key, void *data, u32 data_size);
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
@@ -2930,6 +2969,10 @@ void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size);
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end, int mirror);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@@ -2956,7 +2999,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
@@ -3017,6 +3060,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end);
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate);
extern const struct dentry_operations btrfs_dentry_operations;
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3031,6 +3075,9 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type);
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
/* file.c */
int __init btrfs_auto_defrag_init(void);
@@ -3053,9 +3100,9 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode, u64 start,
u64 end, int drop_cache);
-int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
- struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
@@ -3536,9 +3583,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode);
void btrfs_test_destroy_inode(struct inode *inode);
-
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 0e354e9e57d0..bacee09b7bfd 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -115,126 +115,15 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
- u64 used;
- int ret = 0;
- int need_commit = 2;
- int have_pinned_space;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
/* Make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, fs_info->sectorsize);
- if (btrfs_is_free_space_inode(inode)) {
- need_commit = 0;
- ASSERT(current->journal_info);
- }
-
-again:
- /* Make sure we have enough space to handle the data first */
- spin_lock(&data_sinfo->lock);
- used = btrfs_space_info_used(data_sinfo, true);
-
- if (used + bytes > data_sinfo->total_bytes) {
- struct btrfs_trans_handle *trans;
-
- /*
- * If we don't have enough free bytes in this space then we need
- * to alloc a new chunk.
- */
- if (!data_sinfo->full) {
- u64 alloc_target;
-
- data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
- spin_unlock(&data_sinfo->lock);
-
- alloc_target = btrfs_data_alloc_profile(fs_info);
- /*
- * It is ugly that we don't call nolock join
- * transaction for the free space inode case here.
- * But it is safe because we only do the data space
- * reservation for the free space cache in the
- * transaction context, the common join transaction
- * just increase the counter of the current transaction
- * handler, doesn't try to acquire the trans_lock of
- * the fs.
- */
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_chunk_alloc(trans, alloc_target,
- CHUNK_ALLOC_NO_FORCE);
- btrfs_end_transaction(trans);
- if (ret < 0) {
- if (ret != -ENOSPC)
- return ret;
- else {
- have_pinned_space = 1;
- goto commit_trans;
- }
- }
-
- goto again;
- }
-
- /*
- * If we don't have enough pinned space to deal with this
- * allocation, and no removed chunk in current transaction,
- * don't bother committing the transaction.
- */
- have_pinned_space = __percpu_counter_compare(
- &data_sinfo->total_bytes_pinned,
- used + bytes - data_sinfo->total_bytes,
- BTRFS_TOTAL_BYTES_PINNED_BATCH);
- spin_unlock(&data_sinfo->lock);
-
- /* Commit the current transaction and try again */
-commit_trans:
- if (need_commit) {
- need_commit--;
-
- if (need_commit > 0) {
- btrfs_start_delalloc_roots(fs_info, -1);
- btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
- (u64)-1);
- }
-
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- if (have_pinned_space >= 0 ||
- test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
- &trans->transaction->flags) ||
- need_commit > 0) {
- ret = btrfs_commit_transaction(trans);
- if (ret)
- return ret;
- /*
- * The cleaner kthread might still be doing iput
- * operations. Wait for it to finish so that
- * more space is released. We don't need to
- * explicitly run the delayed iputs here because
- * the commit_transaction would have woken up
- * the cleaner.
- */
- ret = btrfs_wait_on_delayed_iputs(fs_info);
- if (ret)
- return ret;
- goto again;
- } else {
- btrfs_end_transaction(trans);
- }
- }
-
- trace_btrfs_space_reservation(fs_info,
- "space_info:enospc",
- data_sinfo->flags, bytes, 1);
- return -ENOSPC;
- }
- btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes);
- spin_unlock(&data_sinfo->lock);
+ if (btrfs_is_free_space_inode(inode))
+ flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
- return 0;
+ return btrfs_reserve_data_bytes(fs_info, bytes, flush);
}
int btrfs_check_data_free_space(struct btrfs_inode *inode,
@@ -277,9 +166,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
data_sinfo = fs_info->data_sinfo;
- spin_lock(&data_sinfo->lock);
- btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len);
- spin_unlock(&data_sinfo->lock);
+ btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
}
/*
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index bf1595a42a98..5aba81e16113 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -627,8 +627,7 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (!src_rsv || (!trans->bytes_reserved &&
src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
- ret = btrfs_qgroup_reserve_meta_prealloc(root,
- fs_info->nodesize, true);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
if (ret < 0)
return ret;
ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
@@ -769,8 +768,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
}
/* insert the keys of the items */
- setup_items_for_insert(root, path, keys, data_size,
- total_data_size, total_size, nitems);
+ setup_items_for_insert(root, path, keys, data_size, nitems);
/* insert the dir index items */
slot = path->slots[0];
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index db93909b25e0..4a0243cb9d97 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -64,10 +64,6 @@
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
-static void btrfs_dev_replace_update_device_in_mapping_tree(
- struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev,
- struct btrfs_device *tgtdev);
static int btrfs_dev_replace_kthread(void *data);
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
@@ -224,13 +220,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
{
struct btrfs_device *device;
struct block_device *bdev;
- struct list_head *devices;
struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
*device_out = NULL;
- if (fs_info->fs_devices->seeding) {
+ if (srcdev->fs_devices->seeding) {
btrfs_err(fs_info, "the filesystem is a seed filesystem!");
return -EINVAL;
}
@@ -244,8 +239,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
sync_blockdev(bdev);
- devices = &fs_info->fs_devices->devices;
- list_for_each_entry(device, devices, dev_list) {
+ list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info,
"target device is in the filesystem!");
@@ -512,7 +506,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
up_write(&dev_replace->rwsem);
- ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device);
+ ret = btrfs_sysfs_add_device(tgt_device);
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
@@ -599,6 +593,63 @@ static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
wake_up(&fs_info->dev_replace.replace_wait);
}
+/*
+ * When finishing the device replace, before swapping the source device with the
+ * target device we must update the chunk allocation state in the target device,
+ * as it is empty because replace works by directly copying the chunks and not
+ * through the normal chunk allocation path.
+ */
+static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_state *cached_state = NULL;
+ u64 start = 0;
+ u64 found_start;
+ u64 found_end;
+ int ret = 0;
+
+ lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
+
+ while (!find_first_extent_bit(&srcdev->alloc_state, start,
+ &found_start, &found_end,
+ CHUNK_ALLOCATED, &cached_state)) {
+ ret = set_extent_bits(&tgtdev->alloc_state, found_start,
+ found_end, CHUNK_ALLOCATED);
+ if (ret)
+ break;
+ start = found_end + 1;
+ }
+
+ free_extent_state(cached_state);
+ return ret;
+}
+
+static void btrfs_dev_replace_update_device_in_mapping_tree(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 start = 0;
+ int i;
+
+ write_lock(&em_tree->lock);
+ do {
+ em = lookup_extent_mapping(em_tree, start, (u64)-1);
+ if (!em)
+ break;
+ map = em->map_lookup;
+ for (i = 0; i < map->num_stripes; i++)
+ if (srcdev == map->stripes[i].dev)
+ map->stripes[i].dev = tgtdev;
+ start = em->start + em->len;
+ free_extent_map(em);
+ } while (start);
+ write_unlock(&em_tree->lock);
+}
+
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
@@ -630,7 +681,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_delalloc_roots(fs_info, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
@@ -673,8 +724,14 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
dev_replace->time_stopped = ktime_get_real_seconds();
dev_replace->item_needs_writeback = 1;
- /* replace old device with new one in mapping tree */
+ /*
+ * Update allocation state in the new device and replace the old device
+ * with the new one in the mapping tree.
+ */
if (!scrub_ret) {
+ scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
+ if (scrub_ret)
+ goto error;
btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
src_device,
tgt_device);
@@ -685,6 +742,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
+error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -743,9 +801,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/* replace the sysfs entry */
- btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device);
+ btrfs_sysfs_remove_device(src_device);
btrfs_sysfs_update_devid(tgt_device);
- btrfs_rm_dev_replace_free_srcdev(src_device);
+ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
+ btrfs_scratch_superblocks(fs_info, src_device->bdev,
+ src_device->name->str);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -754,33 +814,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return 0;
-}
-
-static void btrfs_dev_replace_update_device_in_mapping_tree(
- struct btrfs_fs_info *fs_info,
- struct btrfs_device *srcdev,
- struct btrfs_device *tgtdev)
-{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
- u64 start = 0;
- int i;
+ btrfs_rm_dev_replace_free_srcdev(src_device);
- write_lock(&em_tree->lock);
- do {
- em = lookup_extent_mapping(em_tree, start, (u64)-1);
- if (!em)
- break;
- map = em->map_lookup;
- for (i = 0; i < map->num_stripes; i++)
- if (srcdev == map->stripes[i].dev)
- map->stripes[i].dev = tgtdev;
- start = em->start + em->len;
- free_extent_map(em);
- } while (start);
- write_unlock(&em_tree->lock);
+ return 0;
}
/*
@@ -983,7 +1019,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
* should never allow both to start and pause. We don't want to allow
* dev-replace to start anyway.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
@@ -1020,7 +1056,7 @@ static int btrfs_dev_replace_kthread(void *data)
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret && ret != -ECANCELED);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return 0;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e24927bddd58..8e3438672a82 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -50,7 +50,6 @@
BTRFS_SUPER_FLAG_METADUMP |\
BTRFS_SUPER_FLAG_METADUMP_V2)
-static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -205,53 +204,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
#endif
/*
- * extents on the btree inode are pretty simple, there's one extent
- * that covers the entire device
- */
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len)
-{
- struct extent_map_tree *em_tree = &inode->extent_tree;
- struct extent_map *em;
- int ret;
-
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (em) {
- read_unlock(&em_tree->lock);
- goto out;
- }
- read_unlock(&em_tree->lock);
-
- em = alloc_extent_map();
- if (!em) {
- em = ERR_PTR(-ENOMEM);
- goto out;
- }
- em->start = 0;
- em->len = (u64)-1;
- em->block_len = (u64)-1;
- em->block_start = 0;
-
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
- if (ret == -EEXIST) {
- free_extent_map(em);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em)
- em = ERR_PTR(-EIO);
- } else if (ret) {
- free_extent_map(em);
- em = ERR_PTR(ret);
- }
- write_unlock(&em_tree->lock);
-
-out:
- return em;
-}
-
-/*
* Compute the csum of a btree block and store the result to provided buffer.
*/
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
@@ -545,38 +497,35 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
static int check_tree_block_fsid(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
u8 fsid[BTRFS_FSID_SIZE];
- int ret = 1;
+ u8 *metadata_uuid;
read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
BTRFS_FSID_SIZE);
- while (fs_devices) {
- u8 *metadata_uuid;
+ /*
+ * Checking the incompat flag is only valid for the current fs. For
+ * seed devices it's forbidden to have their uuid changed so reading
+ * ->fsid in this case is fine
+ */
+ if (btrfs_fs_incompat(fs_info, METADATA_UUID))
+ metadata_uuid = fs_devices->metadata_uuid;
+ else
+ metadata_uuid = fs_devices->fsid;
- /*
- * Checking the incompat flag is only valid for the current
- * fs. For seed devices it's forbidden to have their uuid
- * changed so reading ->fsid in this case is fine
- */
- if (fs_devices == fs_info->fs_devices &&
- btrfs_fs_incompat(fs_info, METADATA_UUID))
- metadata_uuid = fs_devices->metadata_uuid;
- else
- metadata_uuid = fs_devices->fsid;
+ if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
+ return 0;
- if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {
- ret = 0;
- break;
- }
- fs_devices = fs_devices->seed;
- }
- return ret;
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+ if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
+ return 0;
+
+ return 1;
}
-static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int mirror)
{
u64 found_start;
int found_level;
@@ -636,16 +585,15 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
csum_tree_block(eb, result);
if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
- u32 val;
- u32 found = 0;
-
- memcpy(&found, result, csum_size);
+ u8 val[BTRFS_CSUM_SIZE] = { 0 };
read_extent_buffer(eb, &val, 0, csum_size);
btrfs_warn_rl(fs_info,
- "%s checksum verify failed on %llu wanted %x found %x level %d",
+ "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
fs_info->sb->s_id, eb->start,
- val, found, btrfs_header_level(eb));
+ CSUM_FMT_VALUE(csum_size, val),
+ CSUM_FMT_VALUE(csum_size, result),
+ btrfs_header_level(eb));
ret = -EUCLEAN;
goto err;
}
@@ -865,9 +813,8 @@ static int check_async_write(struct btrfs_fs_info *fs_info,
return 1;
}
-static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int async = check_async_write(fs_info, BTRFS_I(inode));
@@ -952,11 +899,6 @@ static int btree_writepages(struct address_space *mapping,
return btree_write_cache_pages(mapping, wbc);
}
-static int btree_readpage(struct file *file, struct page *page)
-{
- return extent_read_full_page(page, btree_get_extent, 0);
-}
-
static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
if (PageWriteback(page) || PageDirty(page))
@@ -996,7 +938,6 @@ static int btree_set_page_dirty(struct page *page)
}
static const struct address_space_operations btree_aops = {
- .readpage = btree_readpage,
.writepages = btree_writepages,
.releasepage = btree_releasepage,
.invalidatepage = btree_invalidatepage,
@@ -1209,7 +1150,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = 0;
- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
@@ -1281,7 +1223,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
*/
leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
- NULL, 0, 0, 0);
+ NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
btrfs_put_root(root);
return ERR_CAST(leaf);
@@ -1506,10 +1448,12 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root *root;
while (!list_empty(&fs_info->allocated_roots)) {
+ char buf[BTRFS_ROOT_NAME_BUF_LEN];
+
root = list_first_entry(&fs_info->allocated_roots,
struct btrfs_root, leak_list);
- btrfs_err(fs_info, "leaked root %llu-%llu refcount %d",
- root->root_key.objectid, root->root_key.offset,
+ btrfs_err(fs_info, "leaked root %s refcount %d",
+ btrfs_root_name(root->root_key.objectid, buf),
refcount_read(&root->refs));
while (refcount_read(&root->refs) > 1)
btrfs_put_root(root);
@@ -2116,12 +2060,10 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_INODE_IO, inode);
+ IO_TREE_BTREE_INODE_IO, inode);
BTRFS_I(inode)->io_tree.track_uptodate = false;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
- BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
-
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
@@ -2627,18 +2569,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
level = btrfs_super_root_level(sb);
tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
generation, level, NULL);
- if (IS_ERR(tree_root->node) ||
- !extent_buffer_uptodate(tree_root->node)) {
+ if (IS_ERR(tree_root->node)) {
handle_error = true;
+ ret = PTR_ERR(tree_root->node);
+ tree_root->node = NULL;
+ btrfs_warn(fs_info, "couldn't read tree root");
+ continue;
- if (IS_ERR(tree_root->node)) {
- ret = PTR_ERR(tree_root->node);
- tree_root->node = NULL;
- } else if (!extent_buffer_uptodate(tree_root->node)) {
- ret = -EUCLEAN;
- }
-
- btrfs_warn(fs_info, "failed to read tree root");
+ } else if (!extent_buffer_uptodate(tree_root->node)) {
+ handle_error = true;
+ ret = -EIO;
+ btrfs_warn(fs_info, "error while reading tree root");
continue;
}
@@ -2754,7 +2695,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->check_integrity_print_mask = 0;
#endif
btrfs_init_balance(fs_info);
- btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
+ btrfs_init_async_reclaim_work(fs_info);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
@@ -2929,7 +2870,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
/*
- * Verify the type first, if that or the the checksum value are
+ * Verify the type first, if that or the checksum value are
* corrupted, we'll find out
*/
csum_type = btrfs_super_csum_type(disk_super);
@@ -3416,6 +3357,8 @@ fail_block_groups:
btrfs_put_block_group_cache(fs_info);
fail_tree_roots:
+ if (fs_info->data_reloc_root)
+ btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
free_root_pointers(fs_info, true);
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
@@ -3479,8 +3422,12 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
return ERR_CAST(page);
super = page_address(page);
- if (btrfs_super_bytenr(super) != bytenr ||
- btrfs_super_magic(super) != BTRFS_MAGIC) {
+ if (btrfs_super_magic(super) != BTRFS_MAGIC) {
+ btrfs_release_disk_super(super);
+ return ERR_PTR(-ENODATA);
+ }
+
+ if (btrfs_super_bytenr(super) != bytenr) {
btrfs_release_disk_super(super);
return ERR_PTR(-EINVAL);
}
@@ -4053,6 +4000,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_cleanup_defrag_inodes(fs_info);
cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4684,9 +4632,3 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
return 0;
}
-
-static const struct extent_io_ops btree_extent_io_ops = {
- /* mandatory callbacks */
- .submit_bio_hook = btree_submit_bio_hook,
- .readpage_end_io_hook = btree_readpage_end_io_hook,
-};
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 00dc39d47ed3..fee69ced58b4 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -76,7 +76,11 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-
+int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end,
+ int mirror);
+blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@@ -123,9 +127,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
-struct extent_map *btree_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 len);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int __init btrfs_end_io_wq_init(void);
void __cold btrfs_end_io_wq_exit(void);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 219a09a2b734..9800a8306368 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -40,6 +40,7 @@ struct io_failure_record;
enum {
IO_TREE_FS_PINNED_EXTENTS,
IO_TREE_FS_EXCLUDED_EXTENTS,
+ IO_TREE_BTREE_INODE_IO,
IO_TREE_INODE_IO,
IO_TREE_INODE_IO_FAILURE,
IO_TREE_RELOC_BLOCKS,
@@ -48,6 +49,7 @@ enum {
IO_TREE_INODE_FILE_EXTENT,
IO_TREE_LOG_CSUM_RANGE,
IO_TREE_SELFTEST,
+ IO_TREE_DEVICE_ALLOC_STATE,
};
struct extent_io_tree {
@@ -61,7 +63,6 @@ struct extent_io_tree {
u8 owner;
spinlock_t lock;
- const struct extent_io_ops *ops;
};
struct extent_state {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5871ef78edba..3b21fee13e77 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -400,12 +400,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
ASSERT(eb->fs_info);
/*
- * Every shared one has parent tree
- * block, which must be aligned to
- * nodesize.
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->nodesize))
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else if (is_data == BTRFS_REF_TYPE_DATA) {
@@ -414,12 +413,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_SHARED_DATA_REF_KEY) {
ASSERT(eb->fs_info);
/*
- * Every shared one has parent tree
- * block, which must be aligned to
- * nodesize.
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->nodesize))
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else {
@@ -429,8 +427,9 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
}
btrfs_print_leaf((struct extent_buffer *)eb);
- btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
- eb->start, type);
+ btrfs_err(eb->fs_info,
+ "eb %llu iref 0x%lx invalid extent inline ref type %d",
+ eb->start, (unsigned long)iref, type);
WARN_ON(1);
return BTRFS_REF_TYPE_INVALID;
@@ -1178,7 +1177,22 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
num_bytes, parent, root_objectid,
owner, offset, 1);
if (ret == 0) {
- BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
+ /*
+ * We're adding refs to a tree block we already own, this
+ * should not happen at all.
+ */
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_crit(trans->fs_info,
+"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
+ bytenr, num_bytes, root_objectid);
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ WARN_ON(1);
+ btrfs_crit(trans->fs_info,
+ "path->slots[0]=%d path->nodes[0]:", path->slots[0]);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+ return -EUCLEAN;
+ }
update_inline_extent_backref(path, iref, refs_to_add,
extent_op, NULL);
} else if (ret == -ENOENT) {
@@ -1398,6 +1412,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
/*
* __btrfs_inc_extent_ref - insert backreference for a given extent
*
+ * The counterpart is in __btrfs_free_extent(), with examples and more details
+ * how it works.
+ *
* @trans: Handle of transaction
*
* @node: The delayed ref node used to get the bytenr/length for
@@ -2850,11 +2867,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len -= to_add;
}
spin_unlock(&global_rsv->lock);
- /* Add to any tickets we may have */
- if (len)
- btrfs_try_granting_tickets(fs_info,
- space_info);
}
+ /* Add to any tickets we may have */
+ if (!readonly && return_free_space && len)
+ btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
@@ -2936,6 +2952,65 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
return 0;
}
+/*
+ * Drop one or more refs of @node.
+ *
+ * 1. Locate the extent refs.
+ * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
+ * Locate it, then reduce the refs number or remove the ref line completely.
+ *
+ * 2. Update the refs count in EXTENT/METADATA_ITEM
+ *
+ * Inline backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 2 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ * extent data backref root FS_TREE objectid 257 offset 0 count 1
+ *
+ * This function gets called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 257
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get some like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
+ * refs 1 gen 6 flags DATA
+ * extent data backref root FS_TREE objectid 258 offset 0 count 1
+ *
+ * Keyed backref case:
+ *
+ * in extent tree we have:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 754 gen 6 flags DATA
+ * [...]
+ * item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
+ * extent data backref root FS_TREE objectid 866 offset 0 count 1
+ *
+ * This function get called with:
+ *
+ * node->bytenr = 13631488
+ * node->num_bytes = 1048576
+ * root_objectid = FS_TREE
+ * owner_objectid = 866
+ * owner_offset = 0
+ * refs_to_drop = 1
+ *
+ * Then we should get some like:
+ *
+ * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
+ * refs 753 gen 6 flags DATA
+ *
+ * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
+ */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *node, u64 parent,
u64 root_objectid, u64 owner_objectid,
@@ -2968,7 +3043,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
- BUG_ON(!is_data && refs_to_drop != 1);
+
+ if (!is_data && refs_to_drop != 1) {
+ btrfs_crit(info,
+"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
+ node->bytenr, refs_to_drop);
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
if (is_data)
skinny_metadata = false;
@@ -2977,6 +3060,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
parent, root_objectid, owner_objectid,
owner_offset);
if (ret == 0) {
+ /*
+ * Either the inline backref or the SHARED_DATA_REF/
+ * SHARED_BLOCK_REF is found
+ *
+ * Here is a quick path to locate EXTENT/METADATA_ITEM.
+ * It's possible the EXTENT/METADATA_ITEM is near current slot.
+ */
extent_slot = path->slots[0];
while (extent_slot >= 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key,
@@ -2993,13 +3083,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
found_extent = 1;
break;
}
+
+ /* Quick path didn't find the EXTEMT/METADATA_ITEM */
if (path->slots[0] - extent_slot > 5)
break;
extent_slot--;
}
if (!found_extent) {
- BUG_ON(iref);
+ if (iref) {
+ btrfs_crit(info,
+"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
+ /* Must be SHARED_* item, remove the backref first */
ret = remove_extent_backref(trans, path, NULL,
refs_to_drop,
is_data, &last_ref);
@@ -3010,6 +3108,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
path->leave_spinning = 1;
+ /* Slow path to locate EXTENT/METADATA_ITEM */
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
@@ -3084,19 +3183,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
key.type == BTRFS_EXTENT_ITEM_KEY) {
struct btrfs_tree_block_info *bi;
- BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
+ if (item_size < sizeof(*ei) + sizeof(*bi)) {
+ btrfs_crit(info,
+"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu",
+ key.objectid, key.type, key.offset,
+ owner_objectid, item_size,
+ sizeof(*ei) + sizeof(*bi));
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
bi = (struct btrfs_tree_block_info *)(ei + 1);
WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
}
refs = btrfs_extent_refs(leaf, ei);
if (refs < refs_to_drop) {
- btrfs_err(info,
- "trying to drop %d refs but we only have %Lu for bytenr %Lu",
+ btrfs_crit(info,
+ "trying to drop %d refs but we only have %llu for bytenr %llu",
refs_to_drop, refs, bytenr);
- ret = -EINVAL;
- btrfs_abort_transaction(trans, ret);
- goto out;
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
}
refs -= refs_to_drop;
@@ -3108,7 +3214,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
* be updated by remove_extent_backref
*/
if (iref) {
- BUG_ON(!found_extent);
+ if (!found_extent) {
+ btrfs_crit(info,
+"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
} else {
btrfs_set_extent_refs(leaf, ei, refs);
btrfs_mark_buffer_dirty(leaf);
@@ -3123,13 +3234,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
} else {
+ /* In this branch refs == 1 */
if (found_extent) {
- BUG_ON(is_data && refs_to_drop !=
- extent_data_ref_count(path, iref));
+ if (is_data && refs_to_drop !=
+ extent_data_ref_count(path, iref)) {
+ btrfs_crit(info,
+ "invalid refs_to_drop, current refs %u refs_to_drop %u",
+ extent_data_ref_count(path, iref),
+ refs_to_drop);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
if (iref) {
- BUG_ON(path->slots[0] != extent_slot);
+ if (path->slots[0] != extent_slot) {
+ btrfs_crit(info,
+"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
+ key.objectid, key.type,
+ key.offset);
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
} else {
- BUG_ON(path->slots[0] != extent_slot + 1);
+ /*
+ * No inline ref, we must be at SHARED_* item,
+ * And it's single ref, it must be:
+ * | extent_slot ||extent_slot + 1|
+ * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
+ */
+ if (path->slots[0] != extent_slot + 1) {
+ btrfs_crit(info,
+ "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
+ btrfs_abort_transaction(trans, -EUCLEAN);
+ goto err_dump;
+ }
path->slots[0] = extent_slot;
num_to_del = 2;
}
@@ -3170,6 +3307,19 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
return ret;
+err_dump:
+ /*
+ * Leaf dump can take up a lot of log buffer, so we only do full leaf
+ * dump for debug build.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
+ btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
+ path->slots[0], extent_slot);
+ btrfs_print_leaf(path->nodes[0]);
+ }
+
+ btrfs_free_path(path);
+ return -EUCLEAN;
}
/*
@@ -3919,11 +4069,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
* |- Push harder to find free extents
* |- If not found, re-iterate all block groups
*/
-static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
+static noinline int find_free_extent(struct btrfs_root *root,
u64 ram_bytes, u64 num_bytes, u64 empty_size,
u64 hint_byte_orig, struct btrfs_key *ins,
u64 flags, int delalloc)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
int cache_block_group_error = 0;
struct btrfs_block_group *block_group = NULL;
@@ -3955,7 +4106,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
+ trace_find_free_extent(root, num_bytes, empty_size, flags);
space_info = btrfs_find_space_info(fs_info, flags);
if (!space_info) {
@@ -4204,7 +4355,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
flags = get_alloc_profile_by_root(root, is_data);
again:
WARN_ON(num_bytes < fs_info->sectorsize);
- ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
+ ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
hint_byte, ins, flags, delalloc);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(fs_info, ins->objectid);
@@ -4505,7 +4656,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- u64 bytenr, int level, u64 owner)
+ u64 bytenr, int level, u64 owner,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *buf;
@@ -4527,8 +4679,8 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ERR_PTR(-EUCLEAN);
}
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
- btrfs_tree_lock(buf);
+ btrfs_set_buffer_lockdep_class(owner, buf, level);
+ __btrfs_tree_lock(buf, nest);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
@@ -4574,7 +4726,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
- u64 empty_size)
+ u64 empty_size,
+ enum btrfs_lock_nesting nest)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key ins;
@@ -4590,7 +4743,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (btrfs_is_testing(fs_info)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
- level, root_objectid);
+ level, root_objectid, nest);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
@@ -4607,7 +4760,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
goto out_unuse;
buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
- root_objectid);
+ root_objectid, nest);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_free_reserved;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 6def411b2eba..60f5f68d892d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -160,19 +160,20 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits,
return ret;
}
-static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
- unsigned long bio_flags)
+int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags)
{
blk_status_t ret = 0;
struct extent_io_tree *tree = bio->bi_private;
bio->bi_private = NULL;
- if (tree->ops)
- ret = tree->ops->submit_bio_hook(tree->private_data, bio,
- mirror_num, bio_flags);
+ if (is_data_inode(tree->private_data))
+ ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num,
+ bio_flags);
else
- btrfsic_submit_bio(bio);
+ ret = btrfs_submit_metadata_bio(tree->private_data, bio,
+ mirror_num, bio_flags);
return blk_status_to_errno(ret);
}
@@ -280,7 +281,6 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
{
tree->fs_info = fs_info;
tree->state = RB_ROOT;
- tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
tree->private_data = private_data;
@@ -2819,8 +2819,6 @@ static void end_bio_extent_readpage(struct bio *bio)
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- bool data_inode = btrfs_ino(BTRFS_I(inode))
- != BTRFS_BTREE_INODE_OBJECTID;
btrfs_debug(fs_info,
"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
@@ -2851,9 +2849,12 @@ static void end_bio_extent_readpage(struct bio *bio)
mirror = io_bio->mirror_num;
if (likely(uptodate)) {
- ret = tree->ops->readpage_end_io_hook(io_bio, offset,
- page, start, end,
- mirror);
+ if (is_data_inode(inode))
+ ret = btrfs_verify_data_csum(io_bio, offset, page,
+ start, end, mirror);
+ else
+ ret = btrfs_validate_metadata_buffer(io_bio,
+ offset, page, start, end, mirror);
if (ret)
uptodate = 0;
else
@@ -2866,7 +2867,7 @@ static void end_bio_extent_readpage(struct bio *bio)
if (likely(uptodate))
goto readpage_ok;
- if (data_inode) {
+ if (is_data_inode(inode)) {
/*
* The generic bio_readpage_error handles errors the
@@ -2881,7 +2882,7 @@ static void end_bio_extent_readpage(struct bio *bio)
if (!btrfs_submit_read_repair(inode, bio, offset, page,
start - page_offset(page),
start, end, mirror,
- tree->ops->submit_bio_hook)) {
+ btrfs_submit_data_bio)) {
uptodate = !bio->bi_status;
offset += len;
continue;
@@ -3053,7 +3054,6 @@ static int submit_extent_page(unsigned int opf,
else
contig = bio_end_sector(bio) == sector;
- ASSERT(tree->ops);
if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags))
can_merge = false;
@@ -3110,8 +3110,7 @@ void set_page_extent_mapped(struct page *page)
static struct extent_map *
__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
- u64 start, u64 len, get_extent_t *get_extent,
- struct extent_map **em_cached)
+ u64 start, u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
@@ -3127,7 +3126,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
*em_cached = NULL;
}
- em = get_extent(BTRFS_I(inode), page, pg_offset, start, len);
+ em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
if (em_cached && !IS_ERR_OR_NULL(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
@@ -3142,12 +3141,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* XXX JDM: This needs looking at to ensure proper page locking
* return 0 on success, otherwise return error
*/
-static int __do_readpage(struct page *page,
- get_extent_t *get_extent,
- struct extent_map **em_cached,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags, unsigned int read_flags,
- u64 *prev_em_start)
+int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct bio **bio, unsigned long *bio_flags,
+ unsigned int read_flags, u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
@@ -3209,7 +3205,7 @@ static int __do_readpage(struct page *page,
break;
}
em = __get_extent_map(inode, page, pg_offset, cur,
- end - cur + 1, get_extent, em_cached);
+ end - cur + 1, em_cached);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
unlock_extent(tree, cur, end);
@@ -3241,7 +3237,7 @@ static int __do_readpage(struct page *page,
/*
* If we have a file range that points to a compressed extent
- * and it's followed by a consecutive file range that points to
+ * and it's followed by a consecutive file range that points
* to the same compressed extent (possibly with a different
* offset and/or length, so it either points to the whole extent
* or only part of it), we must make sure we do not submit a
@@ -3325,7 +3321,7 @@ static int __do_readpage(struct page *page,
ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
page, offset, disk_io_size,
pg_offset, bio,
- end_bio_extent_readpage, mirror_num,
+ end_bio_extent_readpage, 0,
*bio_flags,
this_bio_flag,
force_bio_submit);
@@ -3362,44 +3358,12 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages,
btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
for (index = 0; index < nr_pages; index++) {
- __do_readpage(pages[index], btrfs_get_extent, em_cached,
- bio, 0, bio_flags, REQ_RAHEAD, prev_em_start);
+ btrfs_do_readpage(pages[index], em_cached, bio, bio_flags,
+ REQ_RAHEAD, prev_em_start);
put_page(pages[index]);
}
}
-static int __extent_read_full_page(struct page *page,
- get_extent_t *get_extent,
- struct bio **bio, int mirror_num,
- unsigned long *bio_flags,
- unsigned int read_flags)
-{
- struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
- u64 start = page_offset(page);
- u64 end = start + PAGE_SIZE - 1;
- int ret;
-
- btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
-
- ret = __do_readpage(page, get_extent, NULL, bio, mirror_num,
- bio_flags, read_flags, NULL);
- return ret;
-}
-
-int extent_read_full_page(struct page *page, get_extent_t *get_extent,
- int mirror_num)
-{
- struct bio *bio = NULL;
- unsigned long bio_flags = 0;
- int ret;
-
- ret = __extent_read_full_page(page, get_extent, &bio, mirror_num,
- &bio_flags, 0);
- if (bio)
- ret = submit_one_bio(bio, mirror_num, bio_flags);
- return ret;
-}
-
static void update_nr_written(struct writeback_control *wbc,
unsigned long nr_written)
{
@@ -4552,7 +4516,7 @@ next:
* helper function for fiemap, which doesn't want to see any holes.
* This maps until we find something past 'last'
*/
-static struct extent_map *get_extent_skip_holes(struct inode *inode,
+static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
u64 offset, u64 last)
{
u64 sectorsize = btrfs_inode_sectorsize(inode);
@@ -4567,7 +4531,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (len == 0)
break;
len = ALIGN(len, sectorsize);
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len);
+ em = btrfs_get_extent_fiemap(inode, offset, len);
if (IS_ERR_OR_NULL(em))
return em;
@@ -4696,7 +4660,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
return ret;
}
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
int ret = 0;
@@ -4707,12 +4671,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 last;
u64 last_for_get_extent = 0;
u64 disko = 0;
- u64 isize = i_size_read(inode);
+ u64 isize = i_size_read(&inode->vfs_inode);
struct btrfs_key found_key;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct fiemap_cache cache = { 0 };
struct ulist *roots;
struct ulist *tmp_ulist;
@@ -4743,8 +4707,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* lookup the last file extent. We're not using i_size here
* because there might be preallocation past i_size
*/
- ret = btrfs_lookup_file_extent(NULL, root, path,
- btrfs_ino(BTRFS_I(inode)), -1, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
+ 0);
if (ret < 0) {
goto out_free_ulist;
} else {
@@ -4758,7 +4722,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
found_type = found_key.type;
/* No extents, but there might be delalloc bits */
- if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) ||
+ if (found_key.objectid != btrfs_ino(inode) ||
found_type != BTRFS_EXTENT_DATA_KEY) {
/* have to trust i_size as the end */
last = (u64)-1;
@@ -4784,7 +4748,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ lock_extent_bits(&inode->io_tree, start, start + len - 1,
&cached_state);
em = get_extent_skip_holes(inode, start, last_for_get_extent);
@@ -4853,8 +4817,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* then we're just getting a count and we can skip the
* lookup stuff.
*/
- ret = btrfs_check_shared(root,
- btrfs_ino(BTRFS_I(inode)),
+ ret = btrfs_check_shared(root, btrfs_ino(inode),
bytenr, roots, tmp_ulist);
if (ret < 0)
goto out_free;
@@ -4898,7 +4861,7 @@ out_free:
ret = emit_last_fiemap_cache(fieinfo, &cache);
free_extent_map(em);
out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+ unlock_extent_cached(&inode->io_tree, start, start + len - 1,
&cached_state);
out_free_ulist:
@@ -4990,7 +4953,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
rwlock_init(&eb->lock);
atomic_set(&eb->blocking_readers, 0);
eb->blocking_writers = 0;
- eb->lock_nested = false;
+ eb->lock_recursed = false;
init_waitqueue_head(&eb->write_lock_wq);
init_waitqueue_head(&eb->read_lock_wq);
@@ -5574,20 +5537,19 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
}
ClearPageError(page);
- err = __extent_read_full_page(page,
- btree_get_extent, &bio,
- mirror_num, &bio_flags,
- REQ_META);
+ err = submit_extent_page(REQ_OP_READ | REQ_META, NULL,
+ page, page_offset(page), PAGE_SIZE, 0,
+ &bio, end_bio_extent_readpage,
+ mirror_num, 0, 0, false);
if (err) {
- ret = err;
/*
- * We use &bio in above __extent_read_full_page,
- * so we ensure that if it returns error, the
- * current page fails to add itself to bio and
- * it's been unlocked.
- *
- * We must dec io_pages by ourselves.
+ * We failed to submit the bio so it's the
+ * caller's responsibility to perform cleanup
+ * i.e unlock page/set error bit.
*/
+ ret = err;
+ SetPageError(page);
+ unlock_page(page);
atomic_dec(&eb->io_pages);
}
} else {
@@ -5622,6 +5584,36 @@ unlock_exit:
return ret;
}
+static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
+ unsigned long len)
+{
+ btrfs_warn(eb->fs_info,
+ "access to eb bytenr %llu len %lu out of range start %lu len %lu",
+ eb->start, eb->len, start, len);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+
+ return true;
+}
+
+/*
+ * Check if the [start, start + len) range is valid before reading/writing
+ * the eb.
+ * NOTE: @start and @len are offset inside the eb, not logical address.
+ *
+ * Caller should not touch the dst/src memory if this function returns error.
+ */
+static inline int check_eb_range(const struct extent_buffer *eb,
+ unsigned long start, unsigned long len)
+{
+ unsigned long offset;
+
+ /* start, start + len should not go beyond eb->len nor overflow */
+ if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
+ return report_eb_range(eb, start, len);
+
+ return false;
+}
+
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
@@ -5632,12 +5624,8 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
char *dst = (char *)dstv;
unsigned long i = start >> PAGE_SHIFT;
- if (start + len > eb->len) {
- WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
- eb->start, eb->len, start, len);
- memset(dst, 0, len);
+ if (check_eb_range(eb, start, len))
return;
- }
offset = offset_in_page(start);
@@ -5655,9 +5643,9 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
}
}
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dstv,
- unsigned long start, unsigned long len)
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dstv,
+ unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
@@ -5677,7 +5665,7 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb,
cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
- if (copy_to_user(dst, kaddr + offset, cur)) {
+ if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
}
@@ -5702,8 +5690,8 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long i = start >> PAGE_SHIFT;
int ret = 0;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return -EINVAL;
offset = offset_in_page(start);
@@ -5756,8 +5744,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
char *src = (char *)srcv;
unsigned long i = start >> PAGE_SHIFT;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
offset = offset_in_page(start);
@@ -5785,8 +5773,8 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
char *kaddr;
unsigned long i = start >> PAGE_SHIFT;
- WARN_ON(start > eb->len);
- WARN_ON(start + len > eb->start + eb->len);
+ if (check_eb_range(eb, start, len))
+ return;
offset = offset_in_page(start);
@@ -5830,6 +5818,10 @@ void copy_extent_buffer(const struct extent_buffer *dst,
char *kaddr;
unsigned long i = dst_offset >> PAGE_SHIFT;
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(src, src_offset, len))
+ return;
+
WARN_ON(src->len != dst_len);
offset = offset_in_page(dst_offset);
@@ -6019,25 +6011,15 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu dst len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu dst len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
while (len > 0) {
dst_off_in_page = offset_in_page(dst_offset);
@@ -6064,7 +6046,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
- struct btrfs_fs_info *fs_info = dst->fs_info;
size_t cur;
size_t dst_off_in_page;
size_t src_off_in_page;
@@ -6073,18 +6054,9 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_i;
unsigned long src_i;
- if (src_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus src_offset %lu move len %lu len %lu",
- src_offset, len, dst->len);
- BUG();
- }
- if (dst_offset + len > dst->len) {
- btrfs_err(fs_info,
- "memmove bogus dst_offset %lu move len %lu len %lu",
- dst_offset, len, dst->len);
- BUG();
- }
+ if (check_eb_range(dst, dst_offset, len) ||
+ check_eb_range(dst, src_offset, len))
+ return;
if (dst_offset < src_offset) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 00a88f2eb5ab..f39d02e7f7ef 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -74,18 +74,6 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
typedef blk_status_t (extent_submit_bio_start_t)(void *private_data,
struct bio *bio, u64 bio_offset);
-struct extent_io_ops {
- /*
- * The following callbacks must be always defined, the function
- * pointer will be called unconditionally.
- */
- submit_bio_hook_t *submit_bio_hook;
- int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
- struct page *page, u64 start, u64 end,
- int mirror);
-};
-
-
#define INLINE_EXTENT_BUFFER_PAGES 16
#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
struct extent_buffer {
@@ -102,7 +90,7 @@ struct extent_buffer {
int blocking_writers;
atomic_t blocking_readers;
- bool lock_nested;
+ bool lock_recursed;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
short log_index;
@@ -193,8 +181,11 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
-int extent_read_full_page(struct page *page, get_extent_t *get_extent,
- int mirror_num);
+int __must_check submit_one_bio(struct bio *bio, int mirror_num,
+ unsigned long bio_flags);
+int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
+ struct bio **bio, unsigned long *bio_flags,
+ unsigned int read_flags, u64 *prev_em_start);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end,
int mode);
@@ -203,7 +194,7 @@ int extent_writepages(struct address_space *mapping,
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
void extent_readahead(struct readahead_control *rac);
-int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
void set_page_extent_mapped(struct page *page);
@@ -241,9 +232,9 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dst, unsigned long start,
- unsigned long len);
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dst, unsigned long start,
+ unsigned long len);
void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src);
void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *src);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 7d5ec71615b8..8f4f2bd6d9b9 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -318,8 +318,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
if (page_offsets)
offset = page_offset(bvec.bv_page) + bvec.bv_offset;
- count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
- csum, nblocks);
+ count = btrfs_find_ordered_sum(BTRFS_I(inode), offset,
+ disk_bytenr, csum, nblocks);
if (count)
goto found;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4507c3d09399..0ff659455b1e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1057,11 +1057,7 @@ delete_extent_item:
if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
path->slots[0]++;
}
- setup_items_for_insert(root, path, &key,
- &extent_item_size,
- extent_item_size,
- sizeof(struct btrfs_item) +
- extent_item_size, 1);
+ setup_items_for_insert(root, path, &key, &extent_item_size, 1);
*key_inserted = 1;
}
@@ -1477,9 +1473,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
int ret = 0;
start_pos = round_down(pos, fs_info->sectorsize);
- last_pos = start_pos
- + round_up(pos + write_bytes - start_pos,
- fs_info->sectorsize) - 1;
+ last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
if (start_pos < inode->vfs_inode.i_size) {
struct btrfs_ordered_extent *ordered;
@@ -1497,8 +1491,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_start_ordered_extent(&inode->vfs_inode,
- ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
return -EAGAIN;
}
@@ -1872,7 +1865,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
loff_t endbyte;
int err;
- written = generic_file_direct_write(iocb, from);
+ written = btrfs_direct_IO(iocb, from);
if (written < 0 || !iov_iter_count(from))
return written;
@@ -2025,7 +2018,40 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
atomic_inc(&BTRFS_I(inode)->sync_writers);
if (iocb->ki_flags & IOCB_DIRECT) {
+ /*
+ * 1. We must always clear IOCB_DSYNC in order to not deadlock
+ * in iomap, as it calls generic_write_sync() in this case.
+ * 2. If we are async, we can call iomap_dio_complete() either
+ * in
+ *
+ * 2.1. A worker thread from the last bio completed. In this
+ * case we need to mark the btrfs_dio_data that it is
+ * async in order to call generic_write_sync() properly.
+ * This is handled by setting BTRFS_DIO_SYNC_STUB in the
+ * current->journal_info.
+ * 2.2 The submitter context, because all IO completed
+ * before we exited iomap_dio_rw(). In this case we can
+ * just re-set the IOCB_DSYNC on the iocb and we'll do
+ * the sync below. If our ->end_io() gets called and
+ * current->journal_info is set, then we know we're in
+ * our current context and we will clear
+ * current->journal_info to indicate that we need to
+ * sync below.
+ */
+ if (sync) {
+ ASSERT(current->journal_info == NULL);
+ iocb->ki_flags &= ~IOCB_DSYNC;
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
+ }
num_written = __btrfs_direct_write(iocb, from);
+
+ /*
+ * As stated above, we cleared journal_info, so we need to do
+ * the sync ourselves.
+ */
+ if (sync && current->journal_info == NULL)
+ iocb->ki_flags |= IOCB_DSYNC;
+ current->journal_info = NULL;
} else {
num_written = btrfs_buffered_write(iocb, from);
if (num_written > 0)
@@ -2065,12 +2091,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
filp->private_data = NULL;
/*
- * ordered_data_close is set by setattr when we are about to truncate
- * a file from a non-zero size to a zero size. This tries to
- * flush down new bytes that may have been written if the
- * application were using truncate to replace a file in place.
+ * Set by setattr when we are about to truncate a file from a non-zero
+ * size to a zero size. This tries to flush down new bytes that may
+ * have been written if the application were using truncate to replace
+ * a file in place.
*/
- if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
return 0;
@@ -2116,20 +2142,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct btrfs_trans_handle *trans;
struct btrfs_log_ctx ctx;
int ret = 0, err;
+ u64 len;
+ bool full_sync;
trace_btrfs_sync_file(file, datasync);
btrfs_init_log_ctx(&ctx, inode);
/*
- * Set the range to full if the NO_HOLES feature is not enabled.
- * This is to avoid missing file extent items representing holes after
- * replaying the log.
+ * Always set the range to a full range, otherwise we can get into
+ * several problems, from missing file extent items to represent holes
+ * when not using the NO_HOLES feature, to log tree corruption due to
+ * races between hole detection during logging and completion of ordered
+ * extents outside the range, to missing checksums due to ordered extents
+ * for which we flushed only a subset of their pages.
*/
- if (!btrfs_fs_incompat(fs_info, NO_HOLES)) {
- start = 0;
- end = LLONG_MAX;
- }
+ start = 0;
+ end = LLONG_MAX;
+ len = (u64)LLONG_MAX + 1;
/*
* We write the dirty pages in the range and wait until they complete
@@ -2153,19 +2183,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
- * If the inode needs a full sync, make sure we use a full range to
- * avoid log tree corruption, due to hole detection racing with ordered
- * extent completion for adjacent ranges and races between logging and
- * completion of ordered extents for adjancent ranges - both races
- * could lead to file extent items in the log with overlapping ranges.
- * Do this while holding the inode lock, to avoid races with other
- * tasks.
+ * Always check for the full sync flag while holding the inode's lock,
+ * to avoid races with other tasks. The flag must be either set all the
+ * time during logging or always off all the time while logging.
*/
- if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- start = 0;
- end = LLONG_MAX;
- }
+ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
/*
* Before we acquired the inode's lock, someone may have dirtied more
@@ -2196,20 +2219,42 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* We have to do this here to avoid the priority inversion of waiting on
* IO of a lower priority task while holding a transaction open.
*
- * Also, the range length can be represented by u64, we have to do the
- * typecasts to avoid signed overflow if it's [0, LLONG_MAX].
+ * For a full fsync we wait for the ordered extents to complete while
+ * for a fast fsync we wait just for writeback to complete, and then
+ * attach the ordered extents to the transaction so that a transaction
+ * commit waits for their completion, to avoid data loss if we fsync,
+ * the current transaction commits before the ordered extents complete
+ * and a power failure happens right after that.
*/
- ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1);
- if (ret) {
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ if (full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ } else {
+ /*
+ * Get our ordered extents as soon as possible to avoid doing
+ * checksum lookups in the csum tree, and use instead the
+ * checksums attached to the ordered extents.
+ */
+ btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
+ &ctx.ordered_extents);
+ ret = filemap_fdatawait_range(inode->i_mapping, start, end);
}
+
+ if (ret)
+ goto out_release_extents;
+
atomic_inc(&root->log_batch);
+ /*
+ * If we are doing a fast fsync we can not bail out if the inode's
+ * last_trans is <= then the last committed transaction, because we only
+ * update the last_trans of the inode during ordered extent completion,
+ * and for a fast fsync we don't wait for that, we only wait for the
+ * writeback to complete.
+ */
smp_mb();
if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
- BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) {
+ (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed &&
+ (full_sync || list_empty(&ctx.ordered_extents)))) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -2225,9 +2270,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* checked called fsync.
*/
ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
/*
@@ -2244,12 +2287,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- up_write(&BTRFS_I(inode)->dio_sem);
- inode_unlock(inode);
- goto out;
+ goto out_release_extents;
}
- ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
+ ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
+ btrfs_release_log_ctx_extents(&ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -2276,6 +2318,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
goto out;
}
}
+ if (!full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, len);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ goto out;
+ }
+ }
ret = btrfs_commit_transaction(trans);
} else {
ret = btrfs_end_transaction(trans);
@@ -2286,6 +2335,12 @@ out:
if (!ret)
ret = err;
return ret > 0 ? -EIO : ret;
+
+out_release_extents:
+ btrfs_release_log_ctx_extents(&ctx);
+ up_write(&BTRFS_I(inode)->dio_sem);
+ inode_unlock(inode);
+ goto out;
}
static const struct vm_operations_struct btrfs_file_vm_ops = {
@@ -2481,7 +2536,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
+ lockend);
/*
* We need to make sure we have no ordered extents in this range
@@ -2509,11 +2565,11 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
return 0;
}
-static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
+static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *path,
- struct btrfs_clone_extent_info *clone_info,
- const u64 clone_len)
+ struct btrfs_replace_extent_info *extent_info,
+ const u64 replace_len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2522,51 +2578,69 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
struct btrfs_key key;
int slot;
struct btrfs_ref ref = { 0 };
- u64 ref_offset;
int ret;
- if (clone_len == 0)
+ if (replace_len == 0)
return 0;
- if (clone_info->disk_offset == 0 &&
+ if (extent_info->disk_offset == 0 &&
btrfs_fs_incompat(fs_info, NO_HOLES))
return 0;
key.objectid = btrfs_ino(BTRFS_I(inode));
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = clone_info->file_offset;
+ key.offset = extent_info->file_offset;
ret = btrfs_insert_empty_item(trans, root, path, &key,
- clone_info->item_size);
+ sizeof(struct btrfs_file_extent_item));
if (ret)
return ret;
leaf = path->nodes[0];
slot = path->slots[0];
- write_extent_buffer(leaf, clone_info->extent_buf,
+ write_extent_buffer(leaf, extent_info->extent_buf,
btrfs_item_ptr_offset(leaf, slot),
- clone_info->item_size);
+ sizeof(struct btrfs_file_extent_item));
extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset);
- btrfs_set_file_extent_num_bytes(leaf, extent, clone_len);
+ ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
+ btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
+ btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
+ if (extent_info->is_new_extent)
+ btrfs_set_file_extent_generation(leaf, extent, trans->transid);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
- clone_info->file_offset, clone_len);
+ extent_info->file_offset, replace_len);
if (ret)
return ret;
/* If it's a hole, nothing more needs to be done. */
- if (clone_info->disk_offset == 0)
+ if (extent_info->disk_offset == 0)
return 0;
- inode_add_bytes(inode, clone_len);
- btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
- clone_info->disk_offset,
- clone_info->disk_len, 0);
- ref_offset = clone_info->file_offset - clone_info->data_offset;
- btrfs_init_data_ref(&ref, root->root_key.objectid,
- btrfs_ino(BTRFS_I(inode)), ref_offset);
- ret = btrfs_inc_extent_ref(trans, &ref);
+ inode_add_bytes(inode, replace_len);
+
+ if (extent_info->is_new_extent && extent_info->insertions == 0) {
+ key.objectid = extent_info->disk_offset;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = extent_info->disk_len;
+ ret = btrfs_alloc_reserved_file_extent(trans, root,
+ btrfs_ino(BTRFS_I(inode)),
+ extent_info->file_offset,
+ extent_info->qgroup_reserved,
+ &key);
+ } else {
+ u64 ref_offset;
+
+ btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
+ extent_info->disk_offset,
+ extent_info->disk_len, 0);
+ ref_offset = extent_info->file_offset - extent_info->data_offset;
+ btrfs_init_data_ref(&ref, root->root_key.objectid,
+ btrfs_ino(BTRFS_I(inode)), ref_offset);
+ ret = btrfs_inc_extent_ref(trans, &ref);
+ }
+
+ extent_info->insertions++;
return ret;
}
@@ -2574,15 +2648,15 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans,
/*
* The respective range must have been previously locked, as well as the inode.
* The end offset is inclusive (last byte of the range).
- * @clone_info is NULL for fallocate's hole punching and non-NULL for extent
- * cloning.
- * When cloning, we don't want to end up in a state where we dropped extents
- * without inserting a new one, so we must abort the transaction to avoid a
- * corruption.
+ * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
+ * the file range with an extent.
+ * When not punching a hole, we don't want to end up in a state where we dropped
+ * extents without inserting a new one, so we must abort the transaction to avoid
+ * a corruption.
*/
-int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
+int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
const u64 start, const u64 end,
- struct btrfs_clone_extent_info *clone_info,
+ struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2611,10 +2685,10 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
/*
* 1 - update the inode
* 1 - removing the extents in the range
- * 1 - adding the hole extent if no_holes isn't set or if we are cloning
- * an extent
+ * 1 - adding the hole extent if no_holes isn't set or if we are
+ * replacing the range with a new extent
*/
- if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info)
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
rsv_count = 3;
else
rsv_count = 2;
@@ -2644,14 +2718,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* returned by __btrfs_drop_extents() without having
* changed anything in the file.
*/
- if (clone_info && ret && ret != -EOPNOTSUPP)
+ if (extent_info && !extent_info->is_new_extent &&
+ ret && ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans, ret);
break;
}
trans->block_rsv = &fs_info->trans_block_rsv;
- if (!clone_info && cur_offset < drop_end &&
+ if (!extent_info && cur_offset < drop_end &&
cur_offset < ino_size) {
ret = fill_holes(trans, BTRFS_I(inode), path,
cur_offset, drop_end);
@@ -2665,7 +2740,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
break;
}
- } else if (!clone_info && cur_offset < drop_end) {
+ } else if (!extent_info && cur_offset < drop_end) {
/*
* We are past the i_size here, but since we didn't
* insert holes we need to clear the mapped area so we
@@ -2685,18 +2760,18 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
}
}
- if (clone_info && drop_end > clone_info->file_offset) {
- u64 clone_len = drop_end - clone_info->file_offset;
+ if (extent_info && drop_end > extent_info->file_offset) {
+ u64 replace_len = drop_end - extent_info->file_offset;
- ret = btrfs_insert_clone_extent(trans, inode, path,
- clone_info, clone_len);
+ ret = btrfs_insert_replace_extent(trans, inode, path,
+ extent_info, replace_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
- clone_info->data_len -= clone_len;
- clone_info->data_offset += clone_len;
- clone_info->file_offset += clone_len;
+ extent_info->data_len -= replace_len;
+ extent_info->data_offset += replace_len;
+ extent_info->file_offset += replace_len;
}
cur_offset = drop_end;
@@ -2720,7 +2795,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
- if (!clone_info) {
+ if (!extent_info) {
ret = find_first_non_hole(inode, &cur_offset, &len);
if (unlikely(ret < 0))
break;
@@ -2739,7 +2814,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* than 16Mb would force the full fsync any way (when
* try_release_extent_mapping() is invoked during page cache truncation.
*/
- if (clone_info)
+ if (extent_info && !extent_info->is_new_extent)
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
@@ -2765,7 +2840,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
* (because it's useless) or if it represents a 0 bytes range (when
* cur_offset == drop_end).
*/
- if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) {
+ if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) {
ret = fill_holes(trans, BTRFS_I(inode), path,
cur_offset, drop_end);
if (ret) {
@@ -2773,7 +2848,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
btrfs_abort_transaction(trans, ret);
goto out_trans;
}
- } else if (!clone_info && cur_offset < drop_end) {
+ } else if (!extent_info && cur_offset < drop_end) {
/* See the comment in the loop above for the reasoning here. */
ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
cur_offset, drop_end - cur_offset);
@@ -2783,9 +2858,9 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path,
}
}
- if (clone_info) {
- ret = btrfs_insert_clone_extent(trans, inode, path, clone_info,
- clone_info->data_len);
+ if (extent_info) {
+ ret = btrfs_insert_replace_extent(trans, inode, path, extent_info,
+ extent_info->data_len);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2840,9 +2915,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out_only_mutex;
}
- lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
+ lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
lockend = round_down(offset + len,
- btrfs_inode_sectorsize(inode)) - 1;
+ btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
@@ -2927,7 +3002,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out;
}
- ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL,
+ ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL,
&trans);
btrfs_free_path(path);
if (ret)
@@ -3044,7 +3119,7 @@ enum {
RANGE_BOUNDARY_HOLE,
};
-static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
u64 offset)
{
const u64 sectorsize = btrfs_inode_sectorsize(inode);
@@ -3052,7 +3127,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode,
int ret;
offset = round_down(offset, sectorsize);
- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
+ em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
if (IS_ERR(em))
return PTR_ERR(em);
@@ -3077,7 +3152,7 @@ static int btrfs_zero_range(struct inode *inode,
struct extent_changeset *data_reserved = NULL;
int ret;
u64 alloc_hint = 0;
- const u64 sectorsize = btrfs_inode_sectorsize(inode);
+ const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
@@ -3167,7 +3242,8 @@ static int btrfs_zero_range(struct inode *inode,
* to cover them.
*/
if (!IS_ALIGNED(offset, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode, offset);
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
+ offset);
if (ret < 0)
goto out;
if (ret == RANGE_BOUNDARY_HOLE) {
@@ -3183,7 +3259,7 @@ static int btrfs_zero_range(struct inode *inode,
}
if (!IS_ALIGNED(offset + len, sectorsize)) {
- ret = btrfs_zero_range_check_range_boundary(inode,
+ ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
offset + len);
if (ret < 0)
goto out;
@@ -3258,7 +3334,7 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 locked_end;
u64 actual_end = 0;
struct extent_map *em;
- int blocksize = btrfs_inode_sectorsize(inode);
+ int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
int ret;
alloc_start = round_down(offset, blocksize);
@@ -3340,7 +3416,8 @@ static long btrfs_fallocate(struct file *file, int mode,
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
locked_end, &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
+ locked_end);
if (ordered &&
ordered->file_offset + ordered->num_bytes > alloc_start &&
@@ -3541,9 +3618,26 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
return generic_file_open(inode, filp);
}
+static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ ssize_t ret = 0;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ inode_lock_shared(inode);
+ ret = btrfs_direct_IO(iocb, to);
+ inode_unlock_shared(inode);
+ if (ret < 0)
+ return ret;
+ }
+
+ return generic_file_buffered_read(iocb, to, ret);
+}
+
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = btrfs_file_read_iter,
.splice_read = generic_file_splice_read,
.write_iter = btrfs_file_write_iter,
.splice_write = iter_file_splice_write,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index dc82fd0c80cb..af0013d3df63 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -413,8 +413,6 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
- __le64 *val;
-
io_ctl_map_page(io_ctl, 1);
/*
@@ -429,14 +427,13 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
io_ctl->size -= sizeof(u64) * 2;
}
- val = io_ctl->cur;
- *val = cpu_to_le64(generation);
+ put_unaligned_le64(generation, io_ctl->cur);
io_ctl->cur += sizeof(u64);
}
static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
{
- __le64 *gen;
+ u64 cache_gen;
/*
* Skip the crc area. If we don't check crcs then we just have a 64bit
@@ -451,11 +448,11 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
io_ctl->size -= sizeof(u64) * 2;
}
- gen = io_ctl->cur;
- if (le64_to_cpu(*gen) != generation) {
+ cache_gen = get_unaligned_le64(io_ctl->cur);
+ if (cache_gen != generation) {
btrfs_err_rl(io_ctl->fs_info,
"space cache generation (%llu) does not match inode (%llu)",
- *gen, generation);
+ cache_gen, generation);
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -525,8 +522,8 @@ static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
return -ENOSPC;
entry = io_ctl->cur;
- entry->offset = cpu_to_le64(offset);
- entry->bytes = cpu_to_le64(bytes);
+ put_unaligned_le64(offset, &entry->offset);
+ put_unaligned_le64(bytes, &entry->bytes);
entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
BTRFS_FREE_SPACE_EXTENT;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
@@ -599,8 +596,8 @@ static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
}
e = io_ctl->cur;
- entry->offset = le64_to_cpu(e->offset);
- entry->bytes = le64_to_cpu(e->bytes);
+ entry->offset = get_unaligned_le64(&e->offset);
+ entry->bytes = get_unaligned_le64(&e->bytes);
*type = e->type;
io_ctl->cur += sizeof(struct btrfs_free_space_entry);
io_ctl->size -= sizeof(struct btrfs_free_space_entry);
@@ -1353,7 +1350,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
/*
* at this point the pages are under IO and we're happy,
- * The caller is responsible for waiting on them and updating the
+ * The caller is responsible for waiting on them and updating
* the cache and the inode
*/
io_ctl->entries = entries;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 8b1f5c8897b7..6b9faf3b0e96 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -22,6 +22,10 @@ void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
size_t bitmap_size;
u64 num_bitmaps, total_bitmap_size;
+ if (WARN_ON(cache->length == 0))
+ btrfs_warn(cache->fs_info, "block group %llu length is zero",
+ cache->start);
+
/*
* We convert to bitmaps when the disk space required for using extents
* exceeds that required for using bitmaps.
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9570458aa847..936c3137c646 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6,7 +6,6 @@
#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/bio.h>
-#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -31,6 +30,7 @@
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
+#include <linux/iomap.h>
#include <asm/unaligned.h>
#include "misc.h"
#include "ctree.h"
@@ -59,9 +59,10 @@ struct btrfs_iget_args {
struct btrfs_dio_data {
u64 reserve;
- u64 unsubmitted_oe_range_start;
- u64 unsubmitted_oe_range_end;
- int overwrite;
+ loff_t length;
+ ssize_t submitted;
+ struct extent_changeset *data_reserved;
+ bool sync;
};
static const struct inode_operations btrfs_dir_inode_operations;
@@ -70,7 +71,6 @@ static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
-static const struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
@@ -140,13 +140,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
static int btrfs_dirty_inode(struct inode *inode);
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode)
-{
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-}
-#endif
-
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
const struct qstr *qstr)
@@ -2183,9 +2176,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
*
* c-3) otherwise: async submit
*/
-static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
- int mirror_num,
- unsigned long bio_flags)
+blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
+ int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2245,16 +2237,15 @@ out:
* given a list of ordered sums record them in the inode. This happens
* at IO completion time based on sums calculated at bio submission time.
*/
-static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
- struct inode *inode, struct list_head *list)
+static int add_pending_csums(struct btrfs_trans_handle *trans,
+ struct list_head *list)
{
struct btrfs_ordered_sum *sum;
int ret;
list_for_each_entry(sum, list, list) {
trans->adding_csums = true;
- ret = btrfs_csum_file_blocks(trans,
- BTRFS_I(inode)->root->fs_info->csum_root, sum);
+ ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum);
trans->adding_csums = false;
if (ret)
return ret;
@@ -2357,7 +2348,7 @@ again:
unlock_extent_cached(&inode->io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -2548,7 +2539,6 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
}
static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
- struct inode *inode,
struct btrfs_ordered_extent *oe)
{
struct btrfs_file_extent_item stack_fi;
@@ -2568,8 +2558,9 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
/* Encryption and other encoding is reserved and all 0 */
- return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset,
- &stack_fi, oe->qgroup_rsv);
+ return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
+ oe->file_offset, &stack_fi,
+ oe->qgroup_rsv);
}
/*
@@ -2666,8 +2657,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len);
} else {
BUG_ON(root == fs_info->tree_root);
- ret = insert_ordered_extent_file_extent(trans, inode,
- ordered_extent);
+ ret = insert_ordered_extent_file_extent(trans, ordered_extent);
if (!ret) {
clear_reserved_extent = false;
btrfs_release_delalloc_bytes(fs_info,
@@ -2683,7 +2673,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- ret = add_pending_csums(trans, inode, &ordered_extent->list);
+ ret = add_pending_csums(trans, &ordered_extent->list);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -2752,7 +2742,7 @@ out:
* This needs to be done to make sure anybody waiting knows we are done
* updating everything for this ordered extent.
*/
- btrfs_remove_ordered_extent(inode, ordered_extent);
+ btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent);
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
@@ -2772,8 +2762,8 @@ static void finish_ordered_fn(struct btrfs_work *work)
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
u64 end, int uptodate)
{
- struct inode *inode = page->mapping->host;
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_extent *ordered_extent = NULL;
struct btrfs_workqueue *wq;
@@ -2784,7 +2774,7 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
end - start + 1, uptodate))
return;
- if (btrfs_is_free_space_inode(BTRFS_I(inode)))
+ if (btrfs_is_free_space_inode(inode))
wq = fs_info->endio_freespace_worker;
else
wq = fs_info->endio_write_workers;
@@ -2833,9 +2823,8 @@ zeroit:
* if there's a match, we allow the bio to finish. If not, the code in
* extent_io.c will try to find good copies for us.
*/
-static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
- u64 phy_offset, struct page *page,
- u64 start, u64 end, int mirror)
+int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
+ struct page *page, u64 start, u64 end, int mirror)
{
size_t offset = start - page_offset(page);
struct inode *inode = page->mapping->host;
@@ -3055,7 +3044,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
if (ret == -ENOENT && root == fs_info->tree_root) {
struct btrfs_root *dead_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
int is_dead_root = 0;
/*
@@ -3395,7 +3383,6 @@ cache_acl:
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
break;
@@ -4051,7 +4038,7 @@ out_end_trans:
err = ret;
inode->i_flags |= S_DEAD;
out_release:
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
out_up_write:
up_write(&fs_info->subvol_sem);
if (err) {
@@ -4583,7 +4570,7 @@ again:
&cached_state);
unlock_page(page);
put_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -4848,19 +4835,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
/*
* We're truncating a file that used to have good data down to
- * zero. Make sure it gets into the ordered flush list so that
- * any new writes get down to disk quickly.
+ * zero. Make sure any new writes to the file get on disk
+ * on close.
*/
if (newsize == 0)
- set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+ set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
&BTRFS_I(inode)->runtime_flags);
truncate_setsize(inode, newsize);
- /* Disable nonlocked read DIO to avoid the endless truncate */
- btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
inode_dio_wait(inode);
- btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
ret = btrfs_truncate(inode, newsize == oldsize);
if (ret && inode->i_nlink) {
@@ -5305,15 +5289,15 @@ static void inode_tree_add(struct inode *inode)
spin_unlock(&root->inode_lock);
}
-static void inode_tree_del(struct inode *inode)
+static void inode_tree_del(struct btrfs_inode *inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
int empty = 0;
spin_lock(&root->inode_lock);
- if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
- rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
- RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ if (!RB_EMPTY_NODE(&inode->rb_node)) {
+ rb_erase(&inode->rb_node, &root->inode_tree);
+ RB_CLEAR_NODE(&inode->rb_node);
empty = RB_EMPTY_ROOT(&root->inode_tree);
}
spin_unlock(&root->inode_lock);
@@ -6311,7 +6295,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
if (err)
goto out_unlock;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
d_instantiate_new(dentry, inode);
out_unlock:
@@ -6374,7 +6357,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
- int ret;
err = btrfs_update_inode(trans, root, inode);
if (err)
@@ -6389,12 +6371,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
- true, NULL);
- if (ret == BTRFS_NEED_TRANS_COMMIT) {
- err = btrfs_commit_transaction(trans);
- trans = NULL;
- }
+ btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
}
fail:
@@ -6540,8 +6517,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
u64 start, u64 len)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- int ret;
- int err = 0;
+ int ret = 0;
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
@@ -6569,7 +6545,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
}
em = alloc_extent_map();
if (!em) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
em->start = EXTENT_MAP_HOLE;
@@ -6579,7 +6555,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto out;
}
@@ -6592,14 +6568,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
*/
path->leave_spinning = 1;
+ path->recurse = btrfs_is_free_space_inode(inode);
+
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
- err = ret;
goto out;
} else if (ret > 0) {
if (path->slots[0] == 0)
goto not_found;
path->slots[0]--;
+ ret = 0;
}
leaf = path->nodes[0];
@@ -6625,7 +6603,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only regular file could have regular/prealloc extent */
if (!S_ISREG(inode->vfs_inode.i_mode)) {
- err = -EUCLEAN;
+ ret = -EUCLEAN;
btrfs_crit(fs_info,
"regular/prealloc extent found for non-regular inode %llu",
btrfs_ino(inode));
@@ -6643,12 +6621,11 @@ next:
path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- err = ret;
+ if (ret < 0)
goto out;
- } else if (ret > 0) {
+ else if (ret > 0)
goto not_found;
- }
+
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
@@ -6699,10 +6676,8 @@ next:
BTRFS_COMPRESS_NONE) {
ret = uncompress_inline(path, page, pg_offset,
extent_offset, item);
- if (ret) {
- err = ret;
+ if (ret)
goto out;
- }
} else {
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -6726,29 +6701,28 @@ not_found:
em->len = len;
em->block_start = EXTENT_MAP_HOLE;
insert:
+ ret = 0;
btrfs_release_path(path);
if (em->start > start || extent_map_end(em) <= start) {
btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
- err = -EIO;
+ ret = -EIO;
goto out;
}
- err = 0;
write_lock(&em_tree->lock);
- err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
btrfs_free_path(path);
trace_btrfs_get_extent(root, inode, em);
- if (err) {
+ if (ret) {
free_extent_map(em);
- return ERR_PTR(err);
+ return ERR_PTR(ret);
}
- BUG_ON(!em); /* Error is always set */
return em;
}
@@ -7111,7 +7085,7 @@ out:
}
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
- struct extent_state **cached_state, int writing)
+ struct extent_state **cached_state, bool writing)
{
struct btrfs_ordered_extent *ordered;
int ret = 0;
@@ -7160,7 +7134,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
*/
if (writing ||
test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
else
ret = -ENOTBLK;
btrfs_put_ordered_extent(ordered);
@@ -7249,30 +7223,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
}
-static int btrfs_get_blocks_direct_read(struct extent_map *em,
- struct buffer_head *bh_result,
- struct inode *inode,
- u64 start, u64 len)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-
- if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- return -ENOENT;
-
- len = min(len, em->len - (start - em->start));
-
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- return 0;
-}
-
static int btrfs_get_blocks_direct_write(struct extent_map **map,
- struct buffer_head *bh_result,
struct inode *inode,
struct btrfs_dio_data *dio_data,
u64 start, u64 len)
@@ -7333,7 +7284,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
}
/* this will cow the extent */
- len = bh_result->b_size;
free_extent_map(em);
*map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
if (IS_ERR(em)) {
@@ -7344,64 +7294,88 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
len = min(len, em->len - (start - em->start));
skip_cow:
- bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
- inode->i_blkbits;
- bh_result->b_size = len;
- bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
- set_buffer_mapped(bh_result);
-
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
- set_buffer_new(bh_result);
-
/*
* Need to update the i_size under the extent lock so buffered
* readers will get the updated i_size when we unlock.
*/
- if (!dio_data->overwrite && start + len > i_size_read(inode))
+ if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
- dio_data->unsubmitted_oe_range_end = start + len;
- current->journal_info = dio_data;
out:
return ret;
}
-static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+ loff_t length, unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map *em;
struct extent_state *cached_state = NULL;
struct btrfs_dio_data *dio_data = NULL;
- u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
- u64 len = bh_result->b_size;
+ const bool write = !!(flags & IOMAP_WRITE);
int ret = 0;
+ u64 len = length;
+ bool unlock_extents = false;
+ bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB);
+
+ /*
+ * We used current->journal_info here to see if we were sync, but
+ * there's a lot of tests in the enospc machinery to not do flushing if
+ * we have a journal_info set, so we need to clear this out and re-set
+ * it in iomap_end.
+ */
+ ASSERT(current->journal_info == NULL ||
+ current->journal_info == BTRFS_DIO_SYNC_STUB);
+ current->journal_info = NULL;
- if (!create)
+ if (!write)
len = min_t(u64, len, fs_info->sectorsize);
lockstart = start;
lockend = start + len - 1;
- if (current->journal_info) {
- /*
- * Need to pull our outstanding extents and set journal_info to NULL so
- * that anything that needs to check if there's a transaction doesn't get
- * confused.
- */
- dio_data = current->journal_info;
- current->journal_info = NULL;
+ /*
+ * The generic stuff only does filemap_write_and_wait_range, which
+ * isn't enough if we've written compressed pages to this area, so we
+ * need to flush the dirty pages again to make absolutely sure that any
+ * outstanding dirty pages are on disk.
+ */
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags)) {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ start + length - 1);
+ if (ret)
+ return ret;
+ }
+
+ dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
+ if (!dio_data)
+ return -ENOMEM;
+
+ dio_data->sync = sync;
+ dio_data->length = length;
+ if (write) {
+ dio_data->reserve = round_up(length, fs_info->sectorsize);
+ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
+ &dio_data->data_reserved,
+ start, dio_data->reserve);
+ if (ret) {
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ return ret;
+ }
}
+ iomap->private = dio_data;
+
/*
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
- create)) {
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
ret = -ENOTBLK;
goto err;
}
@@ -7433,35 +7407,47 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
goto unlock_err;
}
- if (create) {
- ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
- dio_data, start, len);
+ len = min(len, em->len - (start - em->start));
+ if (write) {
+ ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+ start, len);
if (ret < 0)
goto unlock_err;
-
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ unlock_extents = true;
+ /* Recalc len in case the new em is smaller than requested */
+ len = min(len, em->len - (start - em->start));
} else {
- ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
- start, len);
- /* Can be negative only if we read from a hole */
- if (ret < 0) {
- ret = 0;
- free_extent_map(em);
- goto unlock_err;
- }
/*
* We need to unlock only the end area that we aren't using.
* The rest is going to be unlocked by the endio routine.
*/
- lockstart = start + bh_result->b_size;
- if (lockstart < lockend) {
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, &cached_state);
- } else {
- free_extent_state(cached_state);
- }
+ lockstart = start + len;
+ if (lockstart < lockend)
+ unlock_extents = true;
+ }
+
+ if (unlock_extents)
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, &cached_state);
+ else
+ free_extent_state(cached_state);
+
+ /*
+ * Translate extent map information to iomap.
+ * We trim the extents (and move the addr) even though iomap code does
+ * that, since we have locked only the parts we are performing I/O in.
+ */
+ if ((em->block_start == EXTENT_MAP_HOLE) ||
+ (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ } else {
+ iomap->addr = em->block_start + (start - em->start);
+ iomap->type = IOMAP_MAPPED;
}
+ iomap->offset = start;
+ iomap->bdev = fs_info->fs_devices->latest_bdev;
+ iomap->length = len;
free_extent_map(em);
@@ -7471,8 +7457,63 @@ unlock_err:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state);
err:
- if (dio_data)
- current->journal_info = dio_data;
+ if (dio_data) {
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved, start,
+ dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve);
+ extent_changeset_free(dio_data->data_reserved);
+ kfree(dio_data);
+ }
+ return ret;
+}
+
+static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+ int ret = 0;
+ struct btrfs_dio_data *dio_data = iomap->private;
+ size_t submitted = dio_data->submitted;
+ const bool write = !!(flags & IOMAP_WRITE);
+
+ if (!write && (iomap->type == IOMAP_HOLE)) {
+ /* If reading from a hole, unlock and return */
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
+ goto out;
+ }
+
+ if (submitted < length) {
+ pos += submitted;
+ length -= submitted;
+ if (write)
+ __endio_write_update_ordered(BTRFS_I(inode), pos,
+ length, false);
+ else
+ unlock_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1);
+ ret = -ENOTBLK;
+ }
+
+ if (write) {
+ if (dio_data->reserve)
+ btrfs_delalloc_release_space(BTRFS_I(inode),
+ dio_data->data_reserved, pos,
+ dio_data->reserve, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length);
+ extent_changeset_free(dio_data->data_reserved);
+ }
+out:
+ /*
+ * We're all done, we can re-set the current->journal_info now safely
+ * for our endio.
+ */
+ if (dio_data->sync) {
+ ASSERT(current->journal_info == NULL);
+ current->journal_info = BTRFS_DIO_SYNC_STUB;
+ }
+ kfree(dio_data);
+ iomap->private = NULL;
+
return ret;
}
@@ -7496,7 +7537,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
dip->logical_offset + dip->bytes - 1);
}
- dio_end_io(dip->dio_bio);
+ bio_endio(dip->dio_bio);
kfree(dip);
}
@@ -7730,24 +7771,11 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
dip->dio_bio = dio_bio;
refcount_set(&dip->refs, 1);
-
- if (write) {
- struct btrfs_dio_data *dio_data = current->journal_info;
-
- /*
- * Setting range start and end to the same value means that
- * no cleanup will happen in btrfs_direct_IO
- */
- dio_data->unsubmitted_oe_range_end = dip->logical_offset +
- dip->bytes;
- dio_data->unsubmitted_oe_range_start =
- dio_data->unsubmitted_oe_range_end;
- }
return dip;
}
-static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
- loff_t file_offset)
+static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
+ struct bio *dio_bio, loff_t file_offset)
{
const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
@@ -7764,6 +7792,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
int ret;
blk_status_t status;
struct btrfs_io_geometry geom;
+ struct btrfs_dio_data *dio_data = iomap->private;
dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
if (!dip) {
@@ -7772,8 +7801,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
file_offset + dio_bio->bi_iter.bi_size - 1);
}
dio_bio->bi_status = BLK_STS_RESOURCE;
- dio_end_io(dio_bio);
- return;
+ bio_endio(dio_bio);
+ return BLK_QC_T_NONE;
}
if (!write && csum) {
@@ -7844,15 +7873,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
goto out_err;
}
+ dio_data->submitted += clone_len;
clone_offset += clone_len;
start_sector += clone_len >> 9;
file_offset += clone_len;
} while (submit_len > 0);
- return;
+ return BLK_QC_T_NONE;
out_err:
dip->dio_bio->bi_status = status;
btrfs_dio_private_put(dip);
+ return BLK_QC_T_NONE;
}
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
@@ -7888,37 +7919,59 @@ out:
return retval;
}
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size,
+ int error, unsigned flags)
+{
+ /*
+ * Now if we're still in the context of our submitter we know we can't
+ * safely run generic_write_sync(), so clear our flag here so that the
+ * caller knows to follow up with a sync.
+ */
+ if (current->journal_info == BTRFS_DIO_SYNC_STUB) {
+ current->journal_info = NULL;
+ return error;
+ }
+
+ if (error)
+ return error;
+
+ if (size) {
+ iocb->ki_flags |= IOCB_DSYNC;
+ return generic_write_sync(iocb, size);
+ }
+
+ return 0;
+}
+
+static const struct iomap_ops btrfs_dio_iomap_ops = {
+ .iomap_begin = btrfs_dio_iomap_begin,
+ .iomap_end = btrfs_dio_iomap_end,
+};
+
+static const struct iomap_dio_ops btrfs_dio_ops = {
+ .submit_io = btrfs_submit_direct,
+};
+
+static const struct iomap_dio_ops btrfs_sync_dops = {
+ .submit_io = btrfs_submit_direct,
+ .end_io = btrfs_maybe_fsync_end_io,
+};
+
+ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct btrfs_dio_data dio_data = { 0 };
struct extent_changeset *data_reserved = NULL;
loff_t offset = iocb->ki_pos;
size_t count = 0;
- int flags = 0;
- bool wakeup = true;
bool relock = false;
ssize_t ret;
if (check_direct_IO(fs_info, iter, offset))
return 0;
- inode_dio_begin(inode);
-
- /*
- * The generic stuff only does filemap_write_and_wait_range, which
- * isn't enough if we've written compressed pages to this area, so
- * we need to flush the dirty pages again to make absolutely sure
- * that any outstanding dirty pages are on disk.
- */
count = iov_iter_count(iter);
- if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_fdatawrite_range(inode->i_mapping, offset,
- offset + count - 1);
-
if (iov_iter_rw(iter) == WRITE) {
/*
* If the write DIO is beyond the EOF, we need update
@@ -7926,66 +7979,29 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* not unlock the i_mutex at this case.
*/
if (offset + count <= inode->i_size) {
- dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
}
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- offset, count);
- if (ret)
- goto out;
-
- /*
- * We need to know how many extents we reserved so that we can
- * do the accounting properly if we go over the number we
- * originally calculated. Abuse current->journal_info for this.
- */
- dio_data.reserve = round_up(count,
- fs_info->sectorsize);
- dio_data.unsubmitted_oe_range_start = (u64)offset;
- dio_data.unsubmitted_oe_range_end = (u64)offset;
- current->journal_info = &dio_data;
down_read(&BTRFS_I(inode)->dio_sem);
- } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
- &BTRFS_I(inode)->runtime_flags)) {
- inode_dio_end(inode);
- flags = DIO_LOCKING | DIO_SKIP_HOLES;
- wakeup = false;
}
- ret = __blockdev_direct_IO(iocb, inode,
- fs_info->fs_devices->latest_bdev,
- iter, btrfs_get_blocks_direct, NULL,
- btrfs_submit_direct, flags);
- if (iov_iter_rw(iter) == WRITE) {
+ /*
+ * We have are actually a sync iocb, so we need our fancy endio to know
+ * if we need to sync.
+ */
+ if (current->journal_info)
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
+ &btrfs_sync_dops, is_sync_kiocb(iocb));
+ else
+ ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops,
+ &btrfs_dio_ops, is_sync_kiocb(iocb));
+
+ if (ret == -ENOTBLK)
+ ret = 0;
+
+ if (iov_iter_rw(iter) == WRITE)
up_read(&BTRFS_I(inode)->dio_sem);
- current->journal_info = NULL;
- if (ret < 0 && ret != -EIOCBQUEUED) {
- if (dio_data.reserve)
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, offset, dio_data.reserve,
- true);
- /*
- * On error we might have left some ordered extents
- * without submitting corresponding bios for them, so
- * cleanup them up to avoid other tasks getting them
- * and waiting for them to complete forever.
- */
- if (dio_data.unsubmitted_oe_range_start <
- dio_data.unsubmitted_oe_range_end)
- __endio_write_update_ordered(BTRFS_I(inode),
- dio_data.unsubmitted_oe_range_start,
- dio_data.unsubmitted_oe_range_end -
- dio_data.unsubmitted_oe_range_start,
- false);
- } else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- offset, count - (size_t)ret, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count);
- }
-out:
- if (wakeup)
- inode_dio_end(inode);
+
if (relock)
inode_lock(inode);
@@ -8002,12 +8018,24 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
- return extent_fiemap(inode, fieinfo, start, len);
+ return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
}
int btrfs_readpage(struct file *file, struct page *page)
{
- return extent_read_full_page(page, btrfs_get_extent, 0);
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ u64 start = page_offset(page);
+ u64 end = start + PAGE_SIZE - 1;
+ unsigned long bio_flags = 0;
+ struct bio *bio = NULL;
+ int ret;
+
+ btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
+
+ ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL);
+ if (bio)
+ ret = submit_one_bio(bio, 0, bio_flags);
+ return ret;
}
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -8091,15 +8119,15 @@ static int btrfs_migratepage(struct address_space *mapping,
static void btrfs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length)
{
- struct inode *inode = page->mapping->host;
- struct extent_io_tree *tree;
+ struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
+ struct extent_io_tree *tree = &inode->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_SIZE - 1;
u64 start;
u64 end;
- int inode_evicting = inode->i_state & I_FREEING;
+ int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
/*
* we have the page locked, so new writeback can't start,
@@ -8110,7 +8138,6 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
*/
wait_on_page_writeback(page);
- tree = &BTRFS_I(inode)->io_tree;
if (offset) {
btrfs_releasepage(page, GFP_NOFS);
return;
@@ -8120,8 +8147,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
lock_extent_bits(tree, page_start, page_end, &cached_state);
again:
start = page_start;
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
- page_end - start + 1);
+ ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1);
if (ordered) {
end = min(page_end,
ordered->file_offset + ordered->num_bytes - 1);
@@ -8142,7 +8168,7 @@ again:
struct btrfs_ordered_inode_tree *tree;
u64 new_len;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
@@ -8181,7 +8207,7 @@ again:
* bit of its io_tree, and free the qgroup reserved data space.
* Since the IO will never happen for this page.
*/
- btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
@@ -8283,7 +8309,7 @@ again:
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
@@ -8614,21 +8640,21 @@ void btrfs_free_inode(struct inode *inode)
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
-void btrfs_destroy_inode(struct inode *inode)
+void btrfs_destroy_inode(struct inode *vfs_inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_inode *inode = BTRFS_I(vfs_inode);
+ struct btrfs_root *root = inode->root;
- WARN_ON(!hlist_empty(&inode->i_dentry));
- WARN_ON(inode->i_data.nrpages);
- WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
- WARN_ON(BTRFS_I(inode)->block_rsv.size);
- WARN_ON(BTRFS_I(inode)->outstanding_extents);
- WARN_ON(BTRFS_I(inode)->delalloc_bytes);
- WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
- WARN_ON(BTRFS_I(inode)->csum_bytes);
- WARN_ON(BTRFS_I(inode)->defrag_bytes);
+ WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
+ WARN_ON(vfs_inode->i_data.nrpages);
+ WARN_ON(inode->block_rsv.reserved);
+ WARN_ON(inode->block_rsv.size);
+ WARN_ON(inode->outstanding_extents);
+ WARN_ON(inode->delalloc_bytes);
+ WARN_ON(inode->new_delalloc_bytes);
+ WARN_ON(inode->csum_bytes);
+ WARN_ON(inode->defrag_bytes);
/*
* This can happen where we create an inode, but somebody else also
@@ -8643,7 +8669,7 @@ void btrfs_destroy_inode(struct inode *inode)
if (!ordered)
break;
else {
- btrfs_err(fs_info,
+ btrfs_err(root->fs_info,
"found ordered extent %llu %llu on inode cleanup",
ordered->file_offset, ordered->num_bytes);
btrfs_remove_ordered_extent(inode, ordered);
@@ -8651,11 +8677,11 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
- btrfs_qgroup_check_reserved_leak(BTRFS_I(inode));
+ btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
- btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
- btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1);
- btrfs_put_root(BTRFS_I(inode)->root);
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
+ btrfs_put_root(inode->root);
}
int btrfs_drop_inode(struct inode *inode)
@@ -8780,27 +8806,19 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec64 ctime = current_time(old_inode);
- struct dentry *parent;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
int ret;
+ int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
- struct btrfs_log_ctx ctx_root;
- struct btrfs_log_ctx ctx_dest;
- bool sync_log_root = false;
- bool sync_log_dest = false;
- bool commit_transaction = false;
/* we only allow rename subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
- btrfs_init_log_ctx(&ctx_root, old_inode);
- btrfs_init_log_ctx(&ctx_dest, new_inode);
-
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
new_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -8942,30 +8960,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
if (root_log_pinned) {
- parent = new_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx_root);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_root = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+ new_dentry->d_parent);
btrfs_end_log_trans(root);
root_log_pinned = false;
}
if (dest_log_pinned) {
- if (!commit_transaction) {
- parent = old_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
- BTRFS_I(new_dir), parent,
- false, &ctx_dest);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_dest = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
- }
+ btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
+ old_dentry->d_parent);
btrfs_end_log_trans(dest);
dest_log_pinned = false;
}
@@ -8998,46 +9000,13 @@ out_fail:
dest_log_pinned = false;
}
}
- if (!ret && sync_log_root && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
- &ctx_root);
- if (ret)
- commit_transaction = true;
- }
- if (!ret && sync_log_dest && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
- &ctx_dest);
- if (ret)
- commit_transaction = true;
- }
- if (commit_transaction) {
- /*
- * We may have set commit_transaction when logging the new name
- * in the destination root, in which case we left the source
- * root context in the list of log contextes. So make sure we
- * remove it to avoid invalid memory accesses, since the context
- * was allocated in our stack frame.
- */
- if (sync_log_root) {
- mutex_lock(&root->log_mutex);
- list_del_init(&ctx_root.list);
- mutex_unlock(&root->log_mutex);
- }
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
- ASSERT(list_empty(&ctx_root.list));
- ASSERT(list_empty(&ctx_dest.list));
-
return ret;
}
@@ -9105,11 +9074,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode = d_inode(old_dentry);
u64 index = 0;
int ret;
+ int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
bool log_pinned = false;
- struct btrfs_log_ctx ctx;
- bool sync_log = false;
- bool commit_transaction = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9259,17 +9226,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BTRFS_I(old_inode)->dir_index = index;
if (log_pinned) {
- struct dentry *parent = new_dentry->d_parent;
-
- btrfs_init_log_ctx(&ctx, old_inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+ new_dentry->d_parent);
btrfs_end_log_trans(root);
log_pinned = false;
}
@@ -9306,23 +9264,8 @@ out_fail:
btrfs_end_log_trans(root);
log_pinned = false;
}
- if (!ret && sync_log) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
- if (ret)
- commit_transaction = true;
- } else if (sync_log) {
- mutex_lock(&root->log_mutex);
- list_del(&ctx.list);
- mutex_unlock(&root->log_mutex);
- }
- if (commit_transaction) {
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
@@ -9388,7 +9331,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
+static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -9428,9 +9371,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
list_add_tail(&work->list, &works);
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
- ret++;
- if (nr != -1 && ret >= nr)
- goto out;
+ if (*nr != U64_MAX) {
+ (*nr)--;
+ if (*nr == 0)
+ goto out;
+ }
cond_resched();
spin_lock(&root->delalloc_lock);
}
@@ -9455,18 +9400,15 @@ out:
int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
+ u64 nr = U64_MAX;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
- ret = start_delalloc_inodes(root, -1, true);
- if (ret > 0)
- ret = 0;
- return ret;
+ return start_delalloc_inodes(root, &nr, true);
}
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr)
{
struct btrfs_root *root;
struct list_head splice;
@@ -9489,15 +9431,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = start_delalloc_inodes(root, nr, false);
+ ret = start_delalloc_inodes(root, &nr, false);
btrfs_put_root(root);
if (ret < 0)
goto out;
-
- if (nr != -1) {
- nr -= ret;
- WARN_ON(nr < 0);
- }
spin_lock(&fs_info->delalloc_root_lock);
}
spin_unlock(&fs_info->delalloc_root_lock);
@@ -9568,7 +9505,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
@@ -9633,11 +9569,15 @@ out_unlock:
return err;
}
-static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans,
+static struct btrfs_trans_handle *insert_prealloc_file_extent(
+ struct btrfs_trans_handle *trans_in,
struct inode *inode, struct btrfs_key *ins,
u64 file_offset)
{
struct btrfs_file_extent_item stack_fi;
+ struct btrfs_replace_extent_info extent_info;
+ struct btrfs_trans_handle *trans = trans_in;
+ struct btrfs_path *path;
u64 start = ins->objectid;
u64 len = ins->offset;
int ret;
@@ -9654,10 +9594,40 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len);
if (ret < 0)
- return ret;
- return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset,
- &stack_fi, ret);
+ return ERR_PTR(ret);
+
+ if (trans) {
+ ret = insert_reserved_file_extent(trans, BTRFS_I(inode),
+ file_offset, &stack_fi, ret);
+ if (ret)
+ return ERR_PTR(ret);
+ return trans;
+ }
+
+ extent_info.disk_offset = start;
+ extent_info.disk_len = len;
+ extent_info.data_offset = 0;
+ extent_info.data_len = len;
+ extent_info.file_offset = file_offset;
+ extent_info.extent_buf = (char *)&stack_fi;
+ extent_info.is_new_extent = true;
+ extent_info.qgroup_reserved = ret;
+ extent_info.insertions = 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ ret = btrfs_replace_file_extents(inode, path, file_offset,
+ file_offset + len - 1, &extent_info,
+ &trans);
+ btrfs_free_path(path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return trans;
}
+
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint,
@@ -9680,14 +9650,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
if (trans)
own_trans = false;
while (num_bytes > 0) {
- if (own_trans) {
- trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- break;
- }
- }
-
cur_bytes = min_t(u64, num_bytes, SZ_256M);
cur_bytes = max(cur_bytes, min_size);
/*
@@ -9699,11 +9661,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
min_size, 0, *alloc_hint, &ins, 1, 0);
- if (ret) {
- if (own_trans)
- btrfs_end_transaction(trans);
+ if (ret)
break;
- }
/*
* We've reserved this space, and thus converted it from
@@ -9716,13 +9675,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
- ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
- if (ret) {
+ trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
ins.offset, 0);
- btrfs_abort_transaction(trans, ret);
- if (own_trans)
- btrfs_end_transaction(trans);
break;
}
@@ -9785,8 +9742,10 @@ next:
break;
}
- if (own_trans)
+ if (own_trans) {
btrfs_end_transaction(trans);
+ trans = NULL;
+ }
}
if (clear_offset < end)
btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
@@ -9865,7 +9824,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
ret = btrfs_init_inode_security(trans, inode, dir, NULL);
if (ret)
@@ -10072,14 +10030,14 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
/*
* Balance or device remove/replace/resize can move stuff around from
- * under us. The EXCL_OP flag makes sure they aren't running/won't run
- * concurrently while we are mapping the swap extents, and
- * fs_info->swapfile_pins prevents them from running while the swap file
- * is active and moving the extents. Note that this also prevents a
- * concurrent device add which isn't actually necessary, but it's not
+ * under us. The exclop protection makes sure they aren't running/won't
+ * run concurrently while we are mapping the swap extents, and
+ * fs_info->swapfile_pins prevents them from running while the swap
+ * file is active and moving the extents. Note that this also prevents
+ * a concurrent device add which isn't actually necessary, but it's not
* really worth the trouble to allow it.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
btrfs_warn(fs_info,
"cannot activate swapfile while exclusive operation is running");
return -EBUSY;
@@ -10225,7 +10183,7 @@ out:
if (ret)
btrfs_swap_deactivate(file);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
if (ret)
return ret;
@@ -10283,12 +10241,6 @@ static const struct file_operations btrfs_dir_file_operations = {
.fsync = btrfs_sync_file,
};
-static const struct extent_io_ops btrfs_extent_io_ops = {
- /* mandatory callbacks */
- .submit_bio_hook = btrfs_submit_bio_hook,
- .readpage_end_io_hook = btrfs_readpage_end_io_hook,
-};
-
/*
* btrfs doesn't support the bmap operation because swapfiles
* use bmap to make a mapping of extents in the file. They assume
@@ -10306,7 +10258,7 @@ static const struct address_space_operations btrfs_aops = {
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
.readahead = btrfs_readahead,
- .direct_IO = btrfs_direct_IO,
+ .direct_IO = noop_direct_IO,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
#ifdef CONFIG_MIGRATION
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index bd3511c5ca81..ab408a23ba32 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -378,6 +378,18 @@ static int check_xflags(unsigned int flags)
return 0;
}
+bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
+ enum btrfs_exclusive_operation type)
+{
+ return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type);
+}
+
+void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
+{
+ WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
+ sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
+}
+
/*
* Set the xflags from the internal inode flags. The remaining items of fsxattr
* are zeroed.
@@ -618,7 +630,7 @@ static noinline int create_subvol(struct inode *dir,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
goto fail_free;
}
trans->block_rsv = &block_rsv;
@@ -628,7 +640,8 @@ static noinline int create_subvol(struct inode *dir,
if (ret)
goto fail;
- leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
+ leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
+ BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
@@ -742,7 +755,7 @@ fail:
kfree(root_item);
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
- btrfs_subvolume_release_metadata(fs_info, &block_rsv);
+ btrfs_subvolume_release_metadata(root, &block_rsv);
err = btrfs_commit_transaction(trans);
if (err && !ret)
@@ -856,7 +869,7 @@ fail:
if (ret && pending_snapshot->snap)
pending_snapshot->snap->anon_dev = 0;
btrfs_put_root(pending_snapshot->snap);
- btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
+ btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
free_pending:
if (pending_snapshot->anon_dev)
free_anon_bdev(pending_snapshot->anon_dev);
@@ -1306,7 +1319,7 @@ again:
break;
unlock_page(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
lock_page(page);
/*
@@ -1638,7 +1651,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (ret)
return ret;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) {
mnt_drop_write_file(file);
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}
@@ -1752,7 +1765,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
out_free:
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
mnt_drop_write_file(file);
return ret;
}
@@ -2086,9 +2099,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
sh.len = item_len;
sh.transid = found_transid;
- /* copy search result header */
- if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
- ret = -EFAULT;
+ /*
+ * Copy search result header. If we fault then loop again so we
+ * can fault in the pages and -EFAULT there if there's a
+ * problem. Otherwise we'll fault and then copy the buffer in
+ * properly this next time through
+ */
+ if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ ret = 0;
goto out;
}
@@ -2096,10 +2114,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
if (item_len) {
char __user *up = ubuf + *sk_offset;
- /* copy the item */
- if (read_extent_buffer_to_user(leaf, up,
- item_off, item_len)) {
- ret = -EFAULT;
+ /*
+ * Copy the item, same behavior as above, but reset the
+ * * sk_offset so we copy the full thing again.
+ */
+ if (read_extent_buffer_to_user_nofault(leaf, up,
+ item_off, item_len)) {
+ ret = 0;
+ *sk_offset -= sizeof(sh);
goto out;
}
@@ -2184,6 +2206,11 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
+ ret = fault_in_pages_writeable(ubuf + sk_offset,
+ *buf_size - sk_offset);
+ if (ret)
+ break;
+
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
if (ret != 0) {
if (ret > 0)
@@ -3112,7 +3139,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
vol_args = memdup_user(arg, sizeof(*vol_args));
@@ -3129,7 +3156,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return ret;
}
@@ -3158,7 +3185,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
goto out;
}
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out;
}
@@ -3169,7 +3196,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
}
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
if (!ret) {
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
@@ -3200,7 +3227,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
if (ret)
return ret;
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out_drop_write;
}
@@ -3218,7 +3245,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
kfree(vol_args);
out:
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
out_drop_write:
mnt_drop_write_file(file);
@@ -3448,15 +3475,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *tmp;
info = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp, &fs_info->space_info,
- list) {
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
- rcu_read_unlock();
if (!info)
continue;
@@ -3504,15 +3528,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
break;
info = NULL;
- rcu_read_lock();
- list_for_each_entry_rcu(tmp, &fs_info->space_info,
- list) {
+ list_for_each_entry(tmp, &fs_info->space_info, list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
- rcu_read_unlock();
if (!info)
continue;
@@ -3722,11 +3743,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
ret = -EROFS;
goto out;
}
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
ret = btrfs_dev_replace_by_ioctl(fs_info, p);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
}
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
@@ -3937,7 +3958,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
return ret;
again:
- if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
+ if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
mutex_lock(&fs_info->balance_mutex);
need_unlock = true;
goto locked;
@@ -3983,7 +4004,6 @@ again:
}
locked:
- BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
@@ -4038,10 +4058,10 @@ locked:
do_balance:
/*
- * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to
- * btrfs_balance. bctl is freed in reset_balance_state, or, if
- * restriper was paused all the way until unmount, in free_fs_info.
- * The flag should be cleared after reset_balance_state.
+ * Ownership of bctl and exclusive operation goes to btrfs_balance.
+ * bctl is freed in reset_balance_state, or, if restriper was paused
+ * all the way until unmount, in free_fs_info. The flag should be
+ * cleared after reset_balance_state.
*/
need_unlock = false;
@@ -4060,7 +4080,7 @@ out_bargs:
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
if (need_unlock)
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
out:
mnt_drop_write_file(file);
return ret;
@@ -4883,7 +4903,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SYNC: {
int ret;
- ret = btrfs_start_delalloc_roots(fs_info, -1);
+ ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
if (ret)
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index f75612e18a82..66e02ebdd340 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -57,8 +57,8 @@
* performance reasons.
*
*
- * Lock nesting
- * ------------
+ * Lock recursion
+ * --------------
*
* A write operation on a tree might indirectly start a look up on the same
* tree. This can happen when btrfs_cow_block locks the tree and needs to
@@ -201,7 +201,7 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb)
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner)
+ if (eb->lock_recursed && current->pid == eb->lock_owner)
return;
btrfs_assert_tree_read_locked(eb);
atomic_inc(&eb->blocking_readers);
@@ -225,7 +225,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
* lock, but it won't change to or away from us. If we have the write
* lock, we are the owner and it'll never change.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner)
+ if (eb->lock_recursed && current->pid == eb->lock_owner)
return;
if (eb->blocking_writers == 0) {
btrfs_assert_spinning_writers_put(eb);
@@ -244,7 +244,8 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb)
*
* The rwlock is held upon exit.
*/
-void btrfs_tree_read_lock(struct extent_buffer *eb)
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
+ bool recurse)
{
u64 start_ns = 0;
@@ -263,8 +264,9 @@ again:
* depends on this as it may be called on a partly
* (write-)locked tree.
*/
- BUG_ON(eb->lock_nested);
- eb->lock_nested = true;
+ WARN_ON(!recurse);
+ BUG_ON(eb->lock_recursed);
+ eb->lock_recursed = true;
read_unlock(&eb->lock);
trace_btrfs_tree_read_lock(eb, start_ns);
return;
@@ -279,6 +281,11 @@ again:
trace_btrfs_tree_read_lock(eb, start_ns);
}
+void btrfs_tree_read_lock(struct extent_buffer *eb)
+{
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false);
+}
+
/*
* Lock extent buffer for read, optimistically expecting that there are no
* contending blocking writers. If there are, don't wait.
@@ -362,11 +369,11 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
/*
* if we're nested, we have the write lock. No new locking
* is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_nested
+ * The write unlock will do a barrier for us, and the lock_recursed
* field only matters to the lock owner.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = false;
+ if (eb->lock_recursed && current->pid == eb->lock_owner) {
+ eb->lock_recursed = false;
return;
}
btrfs_assert_tree_read_locked(eb);
@@ -388,11 +395,11 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
/*
* if we're nested, we have the write lock. No new locking
* is needed as long as we are the lock owner.
- * The write unlock will do a barrier for us, and the lock_nested
+ * The write unlock will do a barrier for us, and the lock_recursed
* field only matters to the lock owner.
*/
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = false;
+ if (eb->lock_recursed && current->pid == eb->lock_owner) {
+ eb->lock_recursed = false;
return;
}
btrfs_assert_tree_read_locked(eb);
@@ -409,7 +416,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
*
* The rwlock is held for write upon exit.
*/
-void btrfs_tree_lock(struct extent_buffer *eb)
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
__acquires(&eb->lock)
{
u64 start_ns = 0;
@@ -434,6 +441,11 @@ again:
trace_btrfs_tree_lock(eb, start_ns);
}
+void btrfs_tree_lock(struct extent_buffer *eb)
+{
+ __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
+}
+
/*
* Release the write lock, either blocking or spinning (ie. there's no need
* for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking).
@@ -552,13 +564,14 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
*
* Return: root extent buffer with read lock held
*/
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
+ bool recurse)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
- btrfs_tree_read_lock(eb);
+ __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse);
if (eb == root->node)
break;
btrfs_tree_read_unlock(eb);
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index d715846c10b8..3ea81ed3320b 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -16,11 +16,81 @@
#define BTRFS_WRITE_LOCK_BLOCKING 3
#define BTRFS_READ_LOCK_BLOCKING 4
+/*
+ * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at
+ * the time of this patch is 8, which is how many we use. Keep this in mind if
+ * you decide you want to add another subclass.
+ */
+enum btrfs_lock_nesting {
+ BTRFS_NESTING_NORMAL,
+
+ /*
+ * When we COW a block we are holding the lock on the original block,
+ * and since our lockdep maps are rootid+level, this confuses lockdep
+ * when we lock the newly allocated COW'd block. Handle this by having
+ * a subclass for COW'ed blocks so that lockdep doesn't complain.
+ */
+ BTRFS_NESTING_COW,
+
+ /*
+ * Oftentimes we need to lock adjacent nodes on the same level while
+ * still holding the lock on the original node we searched to, such as
+ * for searching forward or for split/balance.
+ *
+ * Because of this we need to indicate to lockdep that this is
+ * acceptable by having a different subclass for each of these
+ * operations.
+ */
+ BTRFS_NESTING_LEFT,
+ BTRFS_NESTING_RIGHT,
+
+ /*
+ * When splitting we will be holding a lock on the left/right node when
+ * we need to cow that node, thus we need a new set of subclasses for
+ * these two operations.
+ */
+ BTRFS_NESTING_LEFT_COW,
+ BTRFS_NESTING_RIGHT_COW,
+
+ /*
+ * When splitting we may push nodes to the left or right, but still use
+ * the subsequent nodes in our path, keeping our locks on those adjacent
+ * blocks. Thus when we go to allocate a new split block we've already
+ * used up all of our available subclasses, so this subclass exists to
+ * handle this case where we need to allocate a new split block.
+ */
+ BTRFS_NESTING_SPLIT,
+
+ /*
+ * When promoting a new block to a root we need to have a special
+ * subclass so we don't confuse lockdep, as it will appear that we are
+ * locking a higher level node before a lower level one. Copying also
+ * has this problem as it appears we're locking the same block again
+ * when we make a snapshot of an existing root.
+ */
+ BTRFS_NESTING_NEW_ROOT,
+
+ /*
+ * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so
+ * add this in here and add a static_assert to keep us from going over
+ * the limit. As of this writing we're limited to 8, and we're
+ * definitely using 8, hence this check to keep us from messing up in
+ * the future.
+ */
+ BTRFS_NESTING_MAX,
+};
+
+static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
+ "too many lock subclasses defined");
+
struct btrfs_path;
+void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest);
void btrfs_tree_lock(struct extent_buffer *eb);
void btrfs_tree_unlock(struct extent_buffer *eb);
+void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest,
+ bool recurse);
void btrfs_tree_read_lock(struct extent_buffer *eb);
void btrfs_tree_read_unlock(struct extent_buffer *eb);
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
@@ -29,6 +99,14 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
+struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
+struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root,
+ bool recurse);
+
+static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+ return __btrfs_read_lock_root_node(root, false);
+}
#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) {
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ebac13389e7e..87bac9ecdf4c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -212,11 +212,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset
refcount_set(&entry->refs, 1);
init_waitqueue_head(&entry->wait);
INIT_LIST_HEAD(&entry->list);
+ INIT_LIST_HEAD(&entry->log_list);
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
init_completion(&entry->completion);
- trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry);
+ trace_btrfs_ordered_extent_add(inode, entry);
spin_lock_irq(&tree->lock);
node = tree_insert(&tree->tree, file_offset,
@@ -377,17 +378,16 @@ out:
* test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
* to make sure this function only returns 1 once for a given ordered extent.
*/
-int btrfs_dec_test_ordered_pending(struct inode *inode,
+int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate)
{
- struct btrfs_ordered_inode_tree *tree;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
unsigned long flags;
int ret;
- tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irqsave(&tree->lock, flags);
if (cached && *cached) {
entry = *cached;
@@ -408,7 +408,7 @@ have_entry:
}
if (io_size > entry->bytes_left) {
- btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ btrfs_crit(inode->root->fs_info,
"bad ordered accounting left %llu size %llu",
entry->bytes_left, io_size);
}
@@ -441,10 +441,11 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
struct list_head *cur;
struct btrfs_ordered_sum *sum;
- trace_btrfs_ordered_extent_put(entry->inode, entry);
+ trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
if (refcount_dec_and_test(&entry->refs)) {
ASSERT(list_empty(&entry->root_extent_list));
+ ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
btrfs_add_delayed_iput(entry->inode);
@@ -462,14 +463,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
* remove an ordered extent from the tree. No references are dropped
* and waiters are woken up.
*/
-void btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_inode_tree *tree;
- struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
struct btrfs_root *root = btrfs_inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *node;
+ bool pending;
/* This is paired with btrfs_add_ordered_extent. */
spin_lock(&btrfs_inode->lock);
@@ -491,13 +492,41 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
spin_unlock_irq(&tree->lock);
+ /*
+ * The current running transaction is waiting on us, we need to let it
+ * know that we're complete and wake it up.
+ */
+ if (pending) {
+ struct btrfs_transaction *trans;
+
+ /*
+ * The checks for trans are just a formality, it should be set,
+ * but if it isn't we don't want to deref/assert under the spin
+ * lock, so be nice and check if trans is set, but ASSERT() so
+ * if it isn't set a developer will notice.
+ */
+ spin_lock(&fs_info->trans_lock);
+ trans = fs_info->running_transaction;
+ if (trans)
+ refcount_inc(&trans->use_count);
+ spin_unlock(&fs_info->trans_lock);
+
+ ASSERT(trans);
+ if (trans) {
+ if (atomic_dec_and_test(&trans->pending_ordered))
+ wake_up(&trans->pending_wait);
+ btrfs_put_transaction(trans);
+ }
+ }
+
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
- trace_btrfs_ordered_extent_remove(inode, entry);
+ trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
if (!root->nr_ordered_extents) {
spin_lock(&fs_info->ordered_root_lock);
@@ -514,7 +543,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
struct btrfs_ordered_extent *ordered;
ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
- btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
complete(&ordered->completion);
}
@@ -620,12 +649,11 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
* in the extent, and it waits on the io completion code to insert
* metadata into the btree corresponding to the extent
*/
-void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry,
- int wait)
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
+ struct btrfs_inode *inode = BTRFS_I(entry->inode);
trace_btrfs_ordered_extent_start(inode, entry);
@@ -635,7 +663,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
* for the flusher thread to find them
*/
if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- filemap_fdatawrite_range(inode->i_mapping, start, end);
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
if (wait) {
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
@@ -679,7 +707,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
end = orig_end;
while (1) {
- ordered = btrfs_lookup_first_ordered_extent(inode, end);
+ ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
if (!ordered)
break;
if (ordered->file_offset > orig_end) {
@@ -690,7 +718,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
btrfs_put_ordered_extent(ordered);
break;
}
- btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
end = ordered->file_offset;
/*
* If the ordered extent had an error save the error but don't
@@ -775,17 +803,45 @@ out:
}
/*
+ * Adds all ordered extents to the given list. The list ends up sorted by the
+ * file_offset of the ordered extents.
+ */
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list)
+{
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
+ struct rb_node *n;
+
+ ASSERT(inode_is_locked(&inode->vfs_inode));
+
+ spin_lock_irq(&tree->lock);
+ for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+
+ if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ continue;
+
+ ASSERT(list_empty(&ordered->log_list));
+ list_add_tail(&ordered->log_list, list);
+ refcount_inc(&ordered->refs);
+ }
+ spin_unlock_irq(&tree->lock);
+}
+
+/*
* lookup and return any extent before 'file_offset'. NULL is returned
* if none is found
*/
struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
{
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry = NULL;
- tree = &BTRFS_I(inode)->ordered_tree;
+ tree = &inode->ordered_tree;
spin_lock_irq(&tree->lock);
node = tree_search(tree, file_offset);
if (!node)
@@ -803,20 +859,21 @@ out:
* try to find a checksum. This is used because we allow pages to
* be reclaimed before their checksum is actually put into the btree
*/
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u8 *sum, int len)
+int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
+ u64 disk_bytenr, u8 *sum, int len)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_ordered_sum *ordered_sum;
struct btrfs_ordered_extent *ordered;
- struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
+ struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
unsigned long num_sectors;
unsigned long i;
u32 sectorsize = btrfs_inode_sectorsize(inode);
+ const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits;
const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int index = 0;
- ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset);
+ ordered = btrfs_lookup_ordered_extent(inode, offset);
if (!ordered)
return 0;
@@ -824,10 +881,8 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
if (disk_bytenr >= ordered_sum->bytenr &&
disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
- i = (disk_bytenr - ordered_sum->bytenr) >>
- inode->i_sb->s_blocksize_bits;
- num_sectors = ordered_sum->len >>
- inode->i_sb->s_blocksize_bits;
+ i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits;
+ num_sectors = ordered_sum->len >> blocksize_bits;
num_sectors = min_t(int, len - index, num_sectors - i);
memcpy(sum + index, ordered_sum->sums + i * csum_size,
num_sectors * csum_size);
@@ -883,7 +938,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
break;
}
unlock_extent_cached(&inode->io_tree, start, end, cachedp);
- btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+ btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
}
}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d61ea9c880a3..c3a2325e64a4 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -56,6 +56,12 @@ enum {
BTRFS_ORDERED_TRUNCATED,
/* Regular IO for COW */
BTRFS_ORDERED_REGULAR,
+ /* Used during fsync to track already logged extents */
+ BTRFS_ORDERED_LOGGED,
+ /* We have already logged all the csums of the ordered extent */
+ BTRFS_ORDERED_LOGGED_CSUM,
+ /* We wait for this extent to complete in the current transaction */
+ BTRFS_ORDERED_PENDING,
};
struct btrfs_ordered_extent {
@@ -104,6 +110,9 @@ struct btrfs_ordered_extent {
/* list of checksums for insertion when the extent io is done */
struct list_head list;
+ /* used for fast fsyncs */
+ struct list_head log_list;
+
/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
wait_queue_head_t wait;
@@ -142,9 +151,9 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
}
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
-void btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
struct btrfs_ordered_extent *entry);
-int btrfs_dec_test_ordered_pending(struct inode *inode,
+int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
struct btrfs_ordered_extent **cached,
u64 file_offset, u64 io_size, int uptodate);
int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
@@ -165,17 +174,18 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
-void btrfs_start_ordered_extent(struct inode *inode,
- struct btrfs_ordered_extent *entry, int wait);
+void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait);
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
-btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
struct btrfs_inode *inode,
u64 file_offset,
u64 len);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
- u8 *sum, int len);
+void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
+ struct list_head *list);
+int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset,
+ u64 disk_bytenr, u8 *sum, int len);
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
const u64 range_start, const u64 range_len);
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 61f44e78e3c9..7695c4783d33 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -7,6 +7,44 @@
#include "disk-io.h"
#include "print-tree.h"
+struct root_name_map {
+ u64 id;
+ char name[16];
+};
+
+static const struct root_name_map root_map[] = {
+ { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" },
+ { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" },
+ { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" },
+ { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" },
+ { BTRFS_FS_TREE_OBJECTID, "FS_TREE" },
+ { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" },
+ { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" },
+ { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" },
+ { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" },
+ { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" },
+ { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" },
+};
+
+const char *btrfs_root_name(u64 objectid, char *buf)
+{
+ int i;
+
+ if (objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN,
+ "TREE_RELOC offset=%llu", objectid);
+ return buf;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(root_map); i++) {
+ if (root_map[i].id == objectid)
+ return root_map[i].name;
+ }
+
+ snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", objectid);
+ return buf;
+}
+
static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
{
int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
@@ -95,9 +133,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
* offset is supposed to be a tree block which
* must be aligned to nodesize.
*/
- if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
- pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
- offset, (unsigned long long)eb->fs_info->nodesize);
+ if (!IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
break;
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
@@ -112,8 +151,9 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
* must be aligned to nodesize.
*/
if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
- pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
- offset, (unsigned long long)eb->fs_info->nodesize);
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
break;
default:
pr_cont("(extent %llu has INVALID ref type %d)\n",
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index e6bb38fd75ad..78b99385a503 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -6,7 +6,11 @@
#ifndef BTRFS_PRINT_TREE_H
#define BTRFS_PRINT_TREE_H
+/* Buffer size to contain tree name and possibly additional data (offset) */
+#define BTRFS_ROOT_NAME_BUF_LEN 48
+
void btrfs_print_leaf(struct extent_buffer *l);
void btrfs_print_tree(struct extent_buffer *c, bool follow);
+const char *btrfs_root_name(u64 objectid, char *buf);
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c0f350c3a0cf..580899bdb991 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2315,7 +2315,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
* Update qgroup rfer/excl counters.
* Rfer update is easy, codes can explain themselves.
*
- * Excl update is tricky, the update is split into 2 part.
+ * Excl update is tricky, the update is split into 2 parts.
* Part 1: Possible exclusive <-> sharing detect:
* | A | !A |
* -------------------------------------
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 243a2e44526e..9d4f5316a7e8 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -767,31 +767,39 @@ static void reada_start_machine_worker(struct btrfs_work *work)
kfree(rmw);
}
-static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+/* Try to start up to 10k READA requests for a group of devices */
+static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_device *device;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 enqueued;
u64 total = 0;
- int i;
+ struct btrfs_device *device;
-again:
do {
enqueued = 0;
- mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (atomic_read(&device->reada_in_flight) <
MAX_IN_FLIGHT)
enqueued += reada_start_machine_dev(device);
}
- mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
} while (enqueued && total < 10000);
- if (fs_devices->seed) {
- fs_devices = fs_devices->seed;
- goto again;
- }
+ return total;
+}
+
+static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
+ int i;
+ u64 enqueued = 0;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+
+ enqueued += reada_start_for_fsdevs(fs_devices);
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
+ enqueued += reada_start_for_fsdevs(seed_devs);
+
+ mutex_unlock(&fs_devices->device_list_mutex);
if (enqueued == 0)
return;
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 5cd02514cf4d..99aa87c08912 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -45,7 +45,7 @@ out:
return ret;
}
-static int copy_inline_to_page(struct inode *inode,
+static int copy_inline_to_page(struct btrfs_inode *inode,
const u64 file_offset,
char *inline_data,
const u64 size,
@@ -58,6 +58,7 @@ static int copy_inline_to_page(struct inode *inode,
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
struct extent_changeset *data_reserved = NULL;
struct page *page = NULL;
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
int ret;
ASSERT(IS_ALIGNED(file_offset, block_size));
@@ -68,24 +69,23 @@ static int copy_inline_to_page(struct inode *inode,
* reservation here. Also we must not do the reservation while holding
* a transaction open, otherwise we would deadlock.
*/
- ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- file_offset, block_size);
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
+ block_size);
if (ret)
goto out;
- page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
- btrfs_alloc_write_mask(inode->i_mapping));
+ page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
+ btrfs_alloc_write_mask(mapping));
if (!page) {
ret = -ENOMEM;
goto out_unlock;
}
set_page_extent_mapped(page);
- clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
+ clear_extent_bit(&inode->io_tree, file_offset, range_end,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
0, 0, NULL);
- ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end,
- 0, NULL);
+ ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
@@ -134,9 +134,9 @@ out_unlock:
put_page(page);
}
if (ret)
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
- file_offset, block_size, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
+ btrfs_delalloc_release_space(inode, data_reserved, file_offset,
+ block_size, true);
+ btrfs_delalloc_release_extents(inode, block_size);
out:
extent_changeset_free(data_reserved);
@@ -167,8 +167,8 @@ static int clone_copy_inline_extent(struct inode *dst,
struct btrfs_key key;
if (new_key->offset > 0) {
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -194,7 +194,7 @@ static int clone_copy_inline_extent(struct inode *dst,
* inline extent's data to the page.
*/
ASSERT(key.offset > 0);
- ret = copy_inline_to_page(dst, new_key->offset,
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
inline_data, size, datal,
comp_type);
goto out;
@@ -213,8 +213,8 @@ static int clone_copy_inline_extent(struct inode *dst,
BTRFS_FILE_EXTENT_INLINE)
goto copy_inline_extent;
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -231,8 +231,8 @@ copy_inline_extent:
* clone. Deal with all these cases by copying the inline extent
* data into the respective page at the destination inode.
*/
- ret = copy_inline_to_page(dst, new_key->offset, inline_data,
- size, datal, comp_type);
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
goto out;
}
@@ -439,7 +439,7 @@ process_slot:
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
- struct btrfs_clone_extent_info clone_info;
+ struct btrfs_replace_extent_info clone_info;
/*
* a | --- range to clone ---| b
@@ -462,8 +462,8 @@ process_slot:
clone_info.data_len = datal;
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
- clone_info.item_size = size;
- ret = btrfs_punch_hole_range(inode, path, drop_start,
+ clone_info.is_new_extent = false;
+ ret = btrfs_replace_file_extents(inode, path, drop_start,
new_key.offset + datal - 1, &clone_info,
&trans);
if (ret)
@@ -520,6 +520,8 @@ process_slot:
ret = -EINTR;
goto out;
}
+
+ cond_resched();
}
ret = 0;
@@ -533,7 +535,7 @@ process_slot:
btrfs_release_path(path);
path->leave_spinning = 0;
- ret = btrfs_punch_hole_range(inode, path, last_dest_end,
+ ret = btrfs_replace_file_extents(inode, path, last_dest_end,
destoff + len - 1, NULL, &trans);
if (ret)
goto out;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4ba1ab9cc76d..3602806d71bd 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1206,7 +1206,8 @@ again:
}
if (cow) {
- ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+ ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
+ BTRFS_NESTING_COW);
BUG_ON(ret);
}
btrfs_set_lock_blocking_write(eb);
@@ -1274,7 +1275,8 @@ again:
btrfs_tree_lock(eb);
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, parent,
- slot, &eb);
+ slot, &eb,
+ BTRFS_NESTING_COW);
BUG_ON(ret);
}
btrfs_set_lock_blocking_write(eb);
@@ -1781,7 +1783,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
* relocated and the block is tree root.
*/
leaf = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
+ ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
+ BTRFS_NESTING_COW);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
if (ret < 0)
@@ -2308,7 +2311,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
if (!node->eb) {
ret = btrfs_cow_block(trans, root, eb, upper->eb,
- slot, &eb);
+ slot, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
if (ret < 0) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index c89697486366..702dc5441f03 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -512,11 +512,20 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret && qgroup_num_bytes)
btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
+ if (!ret) {
+ spin_lock(&rsv->lock);
+ rsv->qgroup_rsv_reserved += qgroup_num_bytes;
+ spin_unlock(&rsv->lock);
+ }
return ret;
}
-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv)
{
- btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 qgroup_to_release;
+
+ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
+ btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 5a6cb9db512e..cf63f1e27a27 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -835,7 +835,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
int success;
bool full_stripe_locked;
unsigned int nofs_flag;
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
BUG_ON(sblock_to_check->page_count < 1);
@@ -969,14 +969,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("i/o error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
} else if (sblock_bad->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.csum_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("checksum error", sblock_to_check);
btrfs_dev_stat_inc_and_print(dev,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -984,7 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
spin_lock(&sctx->stat_lock);
sctx->stat.verify_errors++;
spin_unlock(&sctx->stat_lock);
- if (__ratelimit(&_rs))
+ if (__ratelimit(&rs))
scrub_print_warning("checksum/header error",
sblock_to_check);
if (sblock_bad->generation_error)
@@ -3716,50 +3716,84 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
return 0;
}
+static void scrub_workers_put(struct btrfs_fs_info *fs_info)
+{
+ if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
+ &fs_info->scrub_lock)) {
+ struct btrfs_workqueue *scrub_workers = NULL;
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
+ struct btrfs_workqueue *scrub_parity = NULL;
+
+ scrub_workers = fs_info->scrub_workers;
+ scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+ scrub_parity = fs_info->scrub_parity_workers;
+
+ fs_info->scrub_workers = NULL;
+ fs_info->scrub_wr_completion_workers = NULL;
+ fs_info->scrub_parity_workers = NULL;
+ mutex_unlock(&fs_info->scrub_lock);
+
+ btrfs_destroy_workqueue(scrub_workers);
+ btrfs_destroy_workqueue(scrub_wr_comp);
+ btrfs_destroy_workqueue(scrub_parity);
+ }
+}
+
/*
* get a reference count on fs_info->scrub_workers. start worker if necessary
*/
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
+ struct btrfs_workqueue *scrub_workers = NULL;
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
+ struct btrfs_workqueue *scrub_parity = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
+ int ret = -ENOMEM;
- lockdep_assert_held(&fs_info->scrub_lock);
+ if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
+ return 0;
- if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
- ASSERT(fs_info->scrub_workers == NULL);
- fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
- flags, is_dev_replace ? 1 : max_active, 4);
- if (!fs_info->scrub_workers)
- goto fail_scrub_workers;
-
- ASSERT(fs_info->scrub_wr_completion_workers == NULL);
- fs_info->scrub_wr_completion_workers =
- btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
- max_active, 2);
- if (!fs_info->scrub_wr_completion_workers)
- goto fail_scrub_wr_completion_workers;
+ scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
+ is_dev_replace ? 1 : max_active, 4);
+ if (!scrub_workers)
+ goto fail_scrub_workers;
- ASSERT(fs_info->scrub_parity_workers == NULL);
- fs_info->scrub_parity_workers =
- btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
+ scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
max_active, 2);
- if (!fs_info->scrub_parity_workers)
- goto fail_scrub_parity_workers;
+ if (!scrub_wr_comp)
+ goto fail_scrub_wr_completion_workers;
+ scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
+ max_active, 2);
+ if (!scrub_parity)
+ goto fail_scrub_parity_workers;
+
+ mutex_lock(&fs_info->scrub_lock);
+ if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+ ASSERT(fs_info->scrub_workers == NULL &&
+ fs_info->scrub_wr_completion_workers == NULL &&
+ fs_info->scrub_parity_workers == NULL);
+ fs_info->scrub_workers = scrub_workers;
+ fs_info->scrub_wr_completion_workers = scrub_wr_comp;
+ fs_info->scrub_parity_workers = scrub_parity;
refcount_set(&fs_info->scrub_workers_refcnt, 1);
- } else {
- refcount_inc(&fs_info->scrub_workers_refcnt);
+ mutex_unlock(&fs_info->scrub_lock);
+ return 0;
}
- return 0;
+ /* Other thread raced in and created the workers for us */
+ refcount_inc(&fs_info->scrub_workers_refcnt);
+ mutex_unlock(&fs_info->scrub_lock);
+ ret = 0;
+ btrfs_destroy_workqueue(scrub_parity);
fail_scrub_parity_workers:
- btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+ btrfs_destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
- btrfs_destroy_workqueue(fs_info->scrub_workers);
+ btrfs_destroy_workqueue(scrub_workers);
fail_scrub_workers:
- return -ENOMEM;
+ return ret;
}
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -3770,9 +3804,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
- struct btrfs_workqueue *scrub_workers = NULL;
- struct btrfs_workqueue *scrub_wr_comp = NULL;
- struct btrfs_workqueue *scrub_parity = NULL;
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
@@ -3819,13 +3850,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (IS_ERR(sctx))
return PTR_ERR(sctx);
+ ret = scrub_workers_get(fs_info, is_dev_replace);
+ if (ret)
+ goto out_free_ctx;
+
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -ENODEV;
- goto out_free_ctx;
+ goto out;
}
if (!is_dev_replace && !readonly &&
@@ -3834,7 +3869,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
rcu_str_deref(dev->name));
ret = -EROFS;
- goto out_free_ctx;
+ goto out;
}
mutex_lock(&fs_info->scrub_lock);
@@ -3843,7 +3878,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EIO;
- goto out_free_ctx;
+ goto out;
}
down_read(&fs_info->dev_replace.rwsem);
@@ -3854,17 +3889,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EINPROGRESS;
- goto out_free_ctx;
+ goto out;
}
up_read(&fs_info->dev_replace.rwsem);
- ret = scrub_workers_get(fs_info, is_dev_replace);
- if (ret) {
- mutex_unlock(&fs_info->scrub_lock);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- goto out_free_ctx;
- }
-
sctx->readonly = readonly;
dev->scrub_ctx = sctx;
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3917,24 +3945,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->scrub_lock);
dev->scrub_ctx = NULL;
- if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
- scrub_workers = fs_info->scrub_workers;
- scrub_wr_comp = fs_info->scrub_wr_completion_workers;
- scrub_parity = fs_info->scrub_parity_workers;
-
- fs_info->scrub_workers = NULL;
- fs_info->scrub_wr_completion_workers = NULL;
- fs_info->scrub_parity_workers = NULL;
- }
mutex_unlock(&fs_info->scrub_lock);
- btrfs_destroy_workqueue(scrub_workers);
- btrfs_destroy_workqueue(scrub_wr_comp);
- btrfs_destroy_workqueue(scrub_parity);
+ scrub_workers_put(fs_info);
scrub_put_ctx(sctx);
return ret;
-
+out:
+ scrub_workers_put(fs_info);
out_free_ctx:
scrub_free_ctx(sctx);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d9813a5b075a..340c76a12ce1 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -122,8 +122,6 @@ struct send_ctx {
struct file_ra_state ra;
- char *read_buf;
-
/*
* We process inodes by their increasing order, so if before an
* incremental send we reverse the parent/child relationship of
@@ -278,11 +276,6 @@ enum btrfs_compare_tree_result {
BTRFS_COMPARE_TREE_CHANGED,
BTRFS_COMPARE_TREE_SAME,
};
-typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path,
- struct btrfs_path *right_path,
- struct btrfs_key *key,
- enum btrfs_compare_tree_result result,
- void *ctx);
__cold
static void inconsistent_snapshot_error(struct send_ctx *sctx,
@@ -584,8 +577,8 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
return -EOVERFLOW;
hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
- hdr->tlv_type = cpu_to_le16(attr);
- hdr->tlv_len = cpu_to_le16(len);
+ put_unaligned_le16(attr, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
memcpy(hdr + 1, data, len);
sctx->send_size += total_len;
@@ -695,7 +688,7 @@ static int begin_cmd(struct send_ctx *sctx, int cmd)
sctx->send_size += sizeof(*hdr);
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->cmd = cpu_to_le16(cmd);
+ put_unaligned_le16(cmd, &hdr->cmd);
return 0;
}
@@ -707,17 +700,17 @@ static int send_cmd(struct send_ctx *sctx)
u32 crc;
hdr = (struct btrfs_cmd_header *)sctx->send_buf;
- hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
- hdr->crc = 0;
+ put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
+ put_unaligned_le32(0, &hdr->crc);
crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
- hdr->crc = cpu_to_le32(crc);
+ put_unaligned_le32(crc, &hdr->crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
&sctx->send_off);
sctx->total_send_size += sctx->send_size;
- sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
+ sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size;
sctx->send_size = 0;
return ret;
@@ -3813,6 +3806,72 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
}
/*
+ * When processing the new references for an inode we may orphanize an existing
+ * directory inode because its old name conflicts with one of the new references
+ * of the current inode. Later, when processing another new reference of our
+ * inode, we might need to orphanize another inode, but the path we have in the
+ * reference reflects the pre-orphanization name of the directory we previously
+ * orphanized. For example:
+ *
+ * parent snapshot looks like:
+ *
+ * . (ino 256)
+ * |----- f1 (ino 257)
+ * |----- f2 (ino 258)
+ * |----- d1/ (ino 259)
+ * |----- d2/ (ino 260)
+ *
+ * send snapshot looks like:
+ *
+ * . (ino 256)
+ * |----- d1 (ino 258)
+ * |----- f2/ (ino 259)
+ * |----- f2_link/ (ino 260)
+ * | |----- f1 (ino 257)
+ * |
+ * |----- d2 (ino 258)
+ *
+ * When processing inode 257 we compute the name for inode 259 as "d1", and we
+ * cache it in the name cache. Later when we start processing inode 258, when
+ * collecting all its new references we set a full path of "d1/d2" for its new
+ * reference with name "d2". When we start processing the new references we
+ * start by processing the new reference with name "d1", and this results in
+ * orphanizing inode 259, since its old reference causes a conflict. Then we
+ * move on the next new reference, with name "d2", and we find out we must
+ * orphanize inode 260, as its old reference conflicts with ours - but for the
+ * orphanization we use a source path corresponding to the path we stored in the
+ * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
+ * receiver fail since the path component "d1/" no longer exists, it was renamed
+ * to "o259-6-0/" when processing the previous new reference. So in this case we
+ * must recompute the path in the new reference and use it for the new
+ * orphanization operation.
+ */
+static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
+{
+ char *name;
+ int ret;
+
+ name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ fs_path_reset(ref->full_path);
+ ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
+ if (ret < 0)
+ goto out;
+
+ ret = fs_path_add(ref->full_path, name, ref->name_len);
+ if (ret < 0)
+ goto out;
+
+ /* Update the reference's base name pointer. */
+ set_ref_path(ref, ref->full_path);
+out:
+ kfree(name);
+ return ret;
+}
+
+/*
* This does all the move/link/unlink/rmdir magic.
*/
static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
@@ -3880,52 +3939,56 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
goto out;
}
+ /*
+ * Before doing any rename and link operations, do a first pass on the
+ * new references to orphanize any unprocessed inodes that may have a
+ * reference that conflicts with one of the new references of the current
+ * inode. This needs to happen first because a new reference may conflict
+ * with the old reference of a parent directory, so we must make sure
+ * that the path used for link and rename commands don't use an
+ * orphanized name when an ancestor was not yet orphanized.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir/ (ino 259)
+ * | |----- a (ino 257)
+ * |
+ * |----- b (ino 258)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |----- testdir_2/ (ino 259)
+ * | |----- a (ino 260)
+ * |
+ * |----- testdir (ino 257)
+ * |----- b (ino 257)
+ * |----- b2 (ino 258)
+ *
+ * Processing the new reference for inode 257 with name "b" may happen
+ * before processing the new reference with name "testdir". If so, we
+ * must make sure that by the time we send a link command to create the
+ * hard link "b", inode 259 was already orphanized, since the generated
+ * path in "valid_path" already contains the orphanized name for 259.
+ * We are processing inode 257, so only later when processing 259 we do
+ * the rename operation to change its temporary (orphanized) name to
+ * "testdir_2".
+ */
list_for_each_entry(cur, &sctx->new_refs, list) {
- /*
- * We may have refs where the parent directory does not exist
- * yet. This happens if the parent directories inum is higher
- * than the current inum. To handle this case, we create the
- * parent directory out of order. But we need to check if this
- * did already happen before due to other refs in the same dir.
- */
ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- if (ret == inode_state_will_create) {
- ret = 0;
- /*
- * First check if any of the current inodes refs did
- * already create the dir.
- */
- list_for_each_entry(cur2, &sctx->new_refs, list) {
- if (cur == cur2)
- break;
- if (cur2->dir == cur->dir) {
- ret = 1;
- break;
- }
- }
-
- /*
- * If that did not happen, check if a previous inode
- * did already create the dir.
- */
- if (!ret)
- ret = did_create_dir(sctx, cur->dir);
- if (ret < 0)
- goto out;
- if (!ret) {
- ret = send_create_inode(sctx, cur->dir);
- if (ret < 0)
- goto out;
- }
- }
+ if (ret == inode_state_will_create)
+ continue;
/*
- * Check if this new ref would overwrite the first ref of
- * another unprocessed inode. If yes, orphanize the
- * overwritten inode. If we find an overwritten ref that is
- * not the first ref, simply unlink it.
+ * Check if this new ref would overwrite the first ref of another
+ * unprocessed inode. If yes, orphanize the overwritten inode.
+ * If we find an overwritten ref that is not the first ref,
+ * simply unlink it.
*/
ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
cur->name, cur->name_len,
@@ -3942,6 +4005,12 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
struct name_cache_entry *nce;
struct waiting_dir_move *wdm;
+ if (orphanized_dir) {
+ ret = refresh_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
+
ret = orphanize_inode(sctx, ow_inode, ow_gen,
cur->full_path);
if (ret < 0)
@@ -4004,6 +4073,49 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
}
}
+ }
+
+ list_for_each_entry(cur, &sctx->new_refs, list) {
+ /*
+ * We may have refs where the parent directory does not exist
+ * yet. This happens if the parent directories inum is higher
+ * than the current inum. To handle this case, we create the
+ * parent directory out of order. But we need to check if this
+ * did already happen before due to other refs in the same dir.
+ */
+ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ if (ret == inode_state_will_create) {
+ ret = 0;
+ /*
+ * First check if any of the current inodes refs did
+ * already create the dir.
+ */
+ list_for_each_entry(cur2, &sctx->new_refs, list) {
+ if (cur == cur2)
+ break;
+ if (cur2->dir == cur->dir) {
+ ret = 1;
+ break;
+ }
+ }
+
+ /*
+ * If that did not happen, check if a previous inode
+ * did already create the dir.
+ */
+ if (!ret)
+ ret = did_create_dir(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ if (!ret) {
+ ret = send_create_inode(sctx, cur->dir);
+ if (ret < 0)
+ goto out;
+ }
+ }
+
if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
if (ret < 0)
@@ -4799,7 +4911,25 @@ out:
return ret;
}
-static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+static inline u64 max_send_read_size(const struct send_ctx *sctx)
+{
+ return sctx->send_max_size - SZ_16K;
+}
+
+static int put_data_header(struct send_ctx *sctx, u32 len)
+{
+ struct btrfs_tlv_header *hdr;
+
+ if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
+ return -EOVERFLOW;
+ hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
+ put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
+ put_unaligned_le16(len, &hdr->tlv_len);
+ sctx->send_size += sizeof(*hdr);
+ return 0;
+}
+
+static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -4809,21 +4939,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
unsigned pg_offset = offset_in_page(offset);
- ssize_t ret = 0;
+ int ret;
+
+ ret = put_data_header(sctx, len);
+ if (ret)
+ return ret;
inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (offset + len > i_size_read(inode)) {
- if (offset > i_size_read(inode))
- len = 0;
- else
- len = offset - i_size_read(inode);
- }
- if (len == 0)
- goto out;
-
last_index = (offset + len - 1) >> PAGE_SHIFT;
/* initial readahead */
@@ -4864,16 +4989,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
}
addr = kmap(page);
- memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
+ memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset,
+ cur_len);
kunmap(page);
unlock_page(page);
put_page(page);
index++;
pg_offset = 0;
len -= cur_len;
- ret += cur_len;
+ sctx->send_size += cur_len;
}
-out:
iput(inode);
return ret;
}
@@ -4887,7 +5012,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- ssize_t num_read = 0;
p = fs_path_alloc();
if (!p)
@@ -4895,13 +5019,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
- num_read = fill_read_buf(sctx, offset, len);
- if (num_read <= 0) {
- if (num_read < 0)
- ret = num_read;
- goto out;
- }
-
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
goto out;
@@ -4912,16 +5029,16 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
+ ret = put_file_data(sctx, offset, len);
+ if (ret < 0)
+ goto out;
ret = send_cmd(sctx);
tlv_put_failure:
out:
fs_path_free(p);
- if (ret < 0)
- return ret;
- return num_read;
+ return ret;
}
/*
@@ -5033,8 +5150,8 @@ out:
static int send_hole(struct send_ctx *sctx, u64 end)
{
struct fs_path *p = NULL;
+ u64 read_size = max_send_read_size(sctx);
u64 offset = sctx->cur_inode_last_extent;
- u64 len;
int ret = 0;
/*
@@ -5061,16 +5178,19 @@ static int send_hole(struct send_ctx *sctx, u64 end)
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
if (ret < 0)
goto tlv_put_failure;
- memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
while (offset < end) {
- len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
+ u64 len = min(end - offset, read_size);
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
break;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
- TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
+ ret = put_data_header(sctx, len);
+ if (ret < 0)
+ break;
+ memset(sctx->send_buf + sctx->send_size, 0, len);
+ sctx->send_size += len;
ret = send_cmd(sctx);
if (ret < 0)
break;
@@ -5086,23 +5206,20 @@ static int send_extent_data(struct send_ctx *sctx,
const u64 offset,
const u64 len)
{
+ u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, len);
while (sent < len) {
- u64 size = len - sent;
+ u64 size = min(len - sent, read_size);
int ret;
- if (size > BTRFS_SEND_READ_SIZE)
- size = BTRFS_SEND_READ_SIZE;
ret = send_write(sctx, offset + sent, size);
if (ret < 0)
return ret;
- if (!ret)
- break;
- sent += ret;
+ sent += size;
}
return 0;
}
@@ -5402,51 +5519,29 @@ static int send_write_or_clone(struct send_ctx *sctx,
struct clone_root *clone_root)
{
int ret = 0;
- struct btrfs_file_extent_item *ei;
u64 offset = key->offset;
- u64 len;
- u8 type;
+ u64 end;
u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
- ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
- type = btrfs_file_extent_type(path->nodes[0], ei);
- if (type == BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
- /*
- * it is possible the inline item won't cover the whole page,
- * but there may be items after this page. Make
- * sure to send the whole thing
- */
- len = PAGE_ALIGN(len);
- } else {
- len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
- }
-
- if (offset >= sctx->cur_inode_size) {
- ret = 0;
- goto out;
- }
- if (offset + len > sctx->cur_inode_size)
- len = sctx->cur_inode_size - offset;
- if (len == 0) {
- ret = 0;
- goto out;
- }
+ end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
+ if (offset >= end)
+ return 0;
- if (clone_root && IS_ALIGNED(offset + len, bs)) {
+ if (clone_root && IS_ALIGNED(end, bs)) {
+ struct btrfs_file_extent_item *ei;
u64 disk_byte;
u64 data_offset;
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
ret = clone_range(sctx, clone_root, disk_byte, data_offset,
- offset, len);
+ offset, end - offset);
} else {
- ret = send_extent_data(sctx, offset, len);
+ ret = send_extent_data(sctx, offset, end - offset);
}
- sctx->cur_inode_next_write_offset = offset + len;
-out:
+ sctx->cur_inode_next_write_offset = end;
return ret;
}
@@ -6692,8 +6787,7 @@ static int tree_compare_item(struct btrfs_path *left_path,
* If it detects a change, it aborts immediately.
*/
static int btrfs_compare_trees(struct btrfs_root *left_root,
- struct btrfs_root *right_root,
- btrfs_changed_cb_t changed_cb, void *ctx)
+ struct btrfs_root *right_root, void *ctx)
{
struct btrfs_fs_info *fs_info = left_root->fs_info;
int ret;
@@ -6960,8 +7054,7 @@ static int send_subvol(struct send_ctx *sctx)
goto out;
if (sctx->parent_root) {
- ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
- changed_cb, sctx);
+ ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
if (ret < 0)
goto out;
ret = finish_inode_if_needed(sctx, 1);
@@ -7087,7 +7180,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
u32 i;
u64 *clone_sources_tmp = NULL;
int clone_sources_to_rollback = 0;
- unsigned alloc_size;
+ size_t alloc_size;
int sort_clone_roots = 0;
if (!capable(CAP_SYS_ADMIN))
@@ -7169,25 +7262,20 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
goto out;
}
- sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL);
- if (!sctx->read_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
sctx->pending_dir_moves = RB_ROOT;
sctx->waiting_dir_moves = RB_ROOT;
sctx->orphan_dirs = RB_ROOT;
- alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
-
- sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL);
+ sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
+ arg->clone_sources_count + 1,
+ GFP_KERNEL);
if (!sctx->clone_roots) {
ret = -ENOMEM;
goto out;
}
- alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
+ alloc_size = array_size(sizeof(*arg->clone_sources),
+ arg->clone_sources_count);
if (arg->clone_sources_count) {
clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
@@ -7378,7 +7466,6 @@ out:
kvfree(sctx->clone_roots);
kvfree(sctx->send_buf);
- kvfree(sctx->read_buf);
name_cache_free(sctx);
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index ead397f7034f..de91488b7cd0 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -13,7 +13,6 @@
#define BTRFS_SEND_STREAM_VERSION 1
#define BTRFS_SEND_BUF_SIZE SZ_64K
-#define BTRFS_SEND_READ_SIZE (48 * SZ_1K)
enum btrfs_tlv_type {
BTRFS_TLV_U8,
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 475968ccbd1d..64099565ab8f 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -175,10 +175,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
struct list_head *head = &info->space_info;
struct btrfs_space_info *found;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list)
+ list_for_each_entry(found, head, list)
found->full = 0;
- rcu_read_unlock();
}
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
@@ -213,7 +211,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
if (ret)
return ret;
- list_add_rcu(&space_info->list, &info->space_info);
+ list_add(&space_info->list, &info->space_info);
if (flags & BTRFS_BLOCK_GROUP_DATA)
info->data_sinfo = space_info;
@@ -290,22 +288,13 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
- rcu_read_lock();
- list_for_each_entry_rcu(found, head, list) {
- if (found->flags & flags) {
- rcu_read_unlock();
+ list_for_each_entry(found, head, list) {
+ if (found->flags & flags)
return found;
- }
}
- rcu_read_unlock();
return NULL;
}
-static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
-{
- return (global->size << 1);
-}
-
static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
enum btrfs_reserve_flush_enum flush)
@@ -476,28 +465,6 @@ again:
up_read(&info->groups_sem);
}
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
- unsigned long nr_pages, int nr_items)
-{
- struct super_block *sb = fs_info->sb;
-
- if (down_read_trylock(&sb->s_umount)) {
- writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
- up_read(&sb->s_umount);
- } else {
- /*
- * We needn't worry the filesystem going from r/w to r/o though
- * we don't acquire ->s_umount mutex, because the filesystem
- * should guarantee the delalloc inodes list be empty after
- * the filesystem is readonly(all dirty pages are written to
- * the disk).
- */
- btrfs_start_delalloc_roots(fs_info, nr_items);
- if (!current->journal_info)
- btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
- }
-}
-
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
u64 to_reclaim)
{
@@ -516,25 +483,33 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
/*
* shrink metadata reservation for delalloc
*/
-static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
- u64 orig, bool wait_ordered)
+static void shrink_delalloc(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 to_reclaim, bool wait_ordered)
{
- struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
u64 delalloc_bytes;
u64 dio_bytes;
- u64 async_pages;
u64 items;
long time_left;
- unsigned long nr_pages;
int loops;
/* Calc the number of the pages we need flush for space reservation */
- items = calc_reclaim_items_nr(fs_info, to_reclaim);
- to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ if (to_reclaim == U64_MAX) {
+ items = U64_MAX;
+ } else {
+ /*
+ * to_reclaim is set to however much metadata we need to
+ * reclaim, but reclaiming that much data doesn't really track
+ * exactly, so increase the amount to reclaim by 2x in order to
+ * make sure we're flushing enough delalloc to hopefully reclaim
+ * some metadata reservations.
+ */
+ items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
+ to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+ }
trans = (struct btrfs_trans_handle *)current->journal_info;
- space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
@@ -557,37 +532,17 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
loops = 0;
while ((delalloc_bytes || dio_bytes) && loops < 3) {
- nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
-
- /*
- * Triggers inode writeback for up to nr_pages. This will invoke
- * ->writepages callback and trigger delalloc filling
- * (btrfs_run_delalloc_range()).
- */
- btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
+ btrfs_start_delalloc_roots(fs_info, items);
- /*
- * We need to wait for the compressed pages to start before
- * we continue.
- */
- async_pages = atomic_read(&fs_info->async_delalloc_pages);
- if (!async_pages)
- goto skip_async;
-
- /*
- * Calculate how many compressed pages we want to be written
- * before we continue. I.e if there are more async pages than we
- * require wait_event will wait until nr_pages are written.
- */
- if (async_pages <= nr_pages)
- async_pages = 0;
- else
- async_pages -= nr_pages;
+ loops++;
+ if (wait_ordered && !trans) {
+ btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
+ } else {
+ time_left = schedule_timeout_killable(1);
+ if (time_left)
+ break;
+ }
- wait_event(fs_info->async_submit_wait,
- atomic_read(&fs_info->async_delalloc_pages) <=
- (int)async_pages);
-skip_async:
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets) &&
list_empty(&space_info->priority_tickets)) {
@@ -596,14 +551,6 @@ skip_async:
}
spin_unlock(&space_info->lock);
- loops++;
- if (wait_ordered && !trans) {
- btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
- } else {
- time_left = schedule_timeout_killable(1);
- if (time_left)
- break;
- }
delalloc_bytes = percpu_counter_sum_positive(
&fs_info->delalloc_bytes);
dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
@@ -628,8 +575,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
struct btrfs_trans_handle *trans;
- u64 bytes_needed;
u64 reclaim_bytes = 0;
+ u64 bytes_needed = 0;
u64 cur_free_bytes = 0;
trans = (struct btrfs_trans_handle *)current->journal_info;
@@ -649,7 +596,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
else if (!list_empty(&space_info->tickets))
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- bytes_needed = (ticket) ? ticket->bytes : 0;
+ if (ticket)
+ bytes_needed = ticket->bytes;
if (bytes_needed > cur_free_bytes)
bytes_needed -= cur_free_bytes;
@@ -676,8 +624,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
goto commit;
/*
- * See if there is some space in the delayed insertion reservation for
- * this reservation.
+ * See if there is some space in the delayed insertion reserve for this
+ * reservation. If the space_info's don't match (like for DATA or
+ * SYSTEM) then just go enospc, reclaiming this space won't recover any
+ * space to satisfy those reservations.
*/
if (space_info != delayed_rsv->space_info)
goto enospc;
@@ -742,7 +692,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
+ shrink_delalloc(fs_info, space_info, num_bytes,
state == FLUSH_DELALLOC_WAIT);
break;
case FLUSH_DELAYED_REFS_NR:
@@ -767,7 +717,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
break;
}
ret = btrfs_chunk_alloc(trans,
- btrfs_metadata_alloc_profile(fs_info),
+ btrfs_get_alloc_profile(fs_info, space_info->flags),
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
@@ -1037,9 +987,132 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
} while (flush_state <= COMMIT_TRANS);
}
-void btrfs_init_async_reclaim_work(struct work_struct *work)
+/*
+ * FLUSH_DELALLOC_WAIT:
+ * Space is freed from flushing delalloc in one of two ways.
+ *
+ * 1) compression is on and we allocate less space than we reserved
+ * 2) we are overwriting existing space
+ *
+ * For #1 that extra space is reclaimed as soon as the delalloc pages are
+ * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
+ * length to ->bytes_reserved, and subtracts the reserved space from
+ * ->bytes_may_use.
+ *
+ * For #2 this is trickier. Once the ordered extent runs we will drop the
+ * extent in the range we are overwriting, which creates a delayed ref for
+ * that freed extent. This however is not reclaimed until the transaction
+ * commits, thus the next stages.
+ *
+ * RUN_DELAYED_IPUTS
+ * If we are freeing inodes, we want to make sure all delayed iputs have
+ * completed, because they could have been on an inode with i_nlink == 0, and
+ * thus have been truncated and freed up space. But again this space is not
+ * immediately re-usable, it comes in the form of a delayed ref, which must be
+ * run and then the transaction must be committed.
+ *
+ * FLUSH_DELAYED_REFS
+ * The above two cases generate delayed refs that will affect
+ * ->total_bytes_pinned. However this counter can be inconsistent with
+ * reality if there are outstanding delayed refs. This is because we adjust
+ * the counter based solely on the current set of delayed refs and disregard
+ * any on-disk state which might include more refs. So for example, if we
+ * have an extent with 2 references, but we only drop 1, we'll see that there
+ * is a negative delayed ref count for the extent and assume that the space
+ * will be freed, and thus increase ->total_bytes_pinned.
+ *
+ * Running the delayed refs gives us the actual real view of what will be
+ * freed at the transaction commit time. This stage will not actually free
+ * space for us, it just makes sure that may_commit_transaction() has all of
+ * the information it needs to make the right decision.
+ *
+ * COMMIT_TRANS
+ * This is where we reclaim all of the pinned space generated by the previous
+ * two stages. We will not commit the transaction if we don't think we're
+ * likely to satisfy our request, which means if our current free space +
+ * total_bytes_pinned < reservation we will not commit. This is why the
+ * previous states are actually important, to make sure we know for sure
+ * whether committing the transaction will allow us to make progress.
+ *
+ * ALLOC_CHUNK_FORCE
+ * For data we start with alloc chunk force, however we could have been full
+ * before, and then the transaction commit could have freed new block groups,
+ * so if we now have space to allocate do the force chunk allocation.
+ */
+static const enum btrfs_flush_state data_flush_states[] = {
+ FLUSH_DELALLOC_WAIT,
+ RUN_DELAYED_IPUTS,
+ FLUSH_DELAYED_REFS,
+ COMMIT_TRANS,
+ ALLOC_CHUNK_FORCE,
+};
+
+static void btrfs_async_reclaim_data_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 last_tickets_id;
+ int flush_state = 0;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
+ space_info = fs_info->data_sinfo;
+
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+
+ while (!space_info->full) {
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ last_tickets_id = space_info->tickets_id;
+ spin_unlock(&space_info->lock);
+ }
+
+ while (flush_state < ARRAY_SIZE(data_flush_states)) {
+ flush_space(fs_info, space_info, U64_MAX,
+ data_flush_states[flush_state]);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+
+ if (last_tickets_id == space_info->tickets_id) {
+ flush_state++;
+ } else {
+ last_tickets_id = space_info->tickets_id;
+ flush_state = 0;
+ }
+
+ if (flush_state >= ARRAY_SIZE(data_flush_states)) {
+ if (space_info->full) {
+ if (maybe_fail_all_tickets(fs_info, space_info))
+ flush_state = 0;
+ else
+ space_info->flush = 0;
+ } else {
+ flush_state = 0;
+ }
+ }
+ spin_unlock(&space_info->lock);
+ }
+}
+
+void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
- INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+ INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
+ INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
}
static const enum btrfs_flush_state priority_flush_states[] = {
@@ -1089,6 +1162,21 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
} while (flush_state < states_nr);
}
+static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ while (!space_info->full) {
+ flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE);
+ spin_lock(&space_info->lock);
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ spin_unlock(&space_info->lock);
+ }
+}
+
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
@@ -1141,6 +1229,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
int ret;
switch (flush) {
+ case BTRFS_RESERVE_FLUSH_DATA:
case BTRFS_RESERVE_FLUSH_ALL:
case BTRFS_RESERVE_FLUSH_ALL_STEAL:
wait_reserve_ticket(fs_info, space_info, ticket);
@@ -1155,6 +1244,9 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
evict_flush_states,
ARRAY_SIZE(evict_flush_states));
break;
+ case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
+ priority_reclaim_data_space(fs_info, space_info, ticket);
+ break;
default:
ASSERT(0);
break;
@@ -1214,11 +1306,11 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
+static int __reserve_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
+ struct work_struct *async_work;
struct reserve_ticket ticket;
u64 used;
int ret = 0;
@@ -1227,6 +1319,11 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
ASSERT(orig_bytes);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+ if (flush == BTRFS_RESERVE_FLUSH_DATA)
+ async_work = &fs_info->async_data_reclaim_work;
+ else
+ async_work = &fs_info->async_reclaim_work;
+
spin_lock(&space_info->lock);
ret = -ENOSPC;
used = btrfs_space_info_used(space_info, true);
@@ -1268,7 +1365,8 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
init_waitqueue_head(&ticket.wait);
ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
if (flush == BTRFS_RESERVE_FLUSH_ALL ||
- flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
+ flush == BTRFS_RESERVE_FLUSH_DATA) {
list_add_tail(&ticket.list, &space_info->tickets);
if (!space_info->flush) {
space_info->flush = 1;
@@ -1276,8 +1374,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
space_info->flags,
orig_bytes, flush,
"enospc");
- queue_work(system_unbound_wq,
- &fs_info->async_reclaim_work);
+ queue_work(system_unbound_wq, async_work);
}
} else {
list_add_tail(&ticket.list,
@@ -1329,8 +1426,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
- ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
- orig_bytes, flush);
+ ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
if (block_rsv != global_rsv &&
@@ -1348,3 +1444,32 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
}
return ret;
}
+
+/**
+ * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation
+ * @fs_info - the filesystem
+ * @bytes - the number of bytes we need
+ * @flush - how we are allowed to flush
+ *
+ * This will reserve bytes from the data space info. If there is not enough
+ * space then we will attempt to flush space as specified by flush.
+ */
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ int ret;
+
+ ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
+ flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
+
+ ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
+ if (ret == -ENOSPC) {
+ trace_btrfs_space_reservation(fs_info, "space_info:enospc",
+ data_sinfo->flags, bytes, 1);
+ if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
+ btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
+ }
+ return ret;
+}
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index c3c64019950a..5646393b928c 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -149,5 +149,7 @@ static inline void btrfs_space_info_free_bytes_may_use(
btrfs_try_granting_tickets(fs_info, space_info);
spin_unlock(&space_info->lock);
}
+int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush);
#endif /* BTRFS_SPACE_INFO_H */
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index 079b059818e9..c46be27be700 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -7,16 +7,6 @@
#include "ctree.h"
-static inline u8 get_unaligned_le8(const void *p)
-{
- return *(u8 *)p;
-}
-
-static inline void put_unaligned_le8(u8 val, void *p)
-{
- *(u8 *)p = val;
-}
-
static bool check_setget_bounds(const struct extent_buffer *eb,
const void *ptr, unsigned off, int size)
{
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 25967ecaaf0a..8840a4fa81eb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1871,6 +1871,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
* the filesystem is busy.
*/
cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
btrfs_discard_cleanup(fs_info);
@@ -2163,8 +2164,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
u64 thresh = 0;
int mixed = 0;
- rcu_read_lock();
- list_for_each_entry_rcu(found, &fs_info->space_info, list) {
+ list_for_each_entry(found, &fs_info->space_info, list) {
if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
int i;
@@ -2193,8 +2193,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
total_used += found->disk_used;
}
- rcu_read_unlock();
-
buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
buf->f_blocks >>= bits;
buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index c8df2edafd85..279d9262b676 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -14,6 +14,7 @@
#include "ctree.h"
#include "discard.h"
#include "disk-io.h"
+#include "send.h"
#include "transaction.h"
#include "sysfs.h"
#include "volumes.h"
@@ -321,9 +322,17 @@ static ssize_t supported_checksums_show(struct kobject *kobj,
}
BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show);
+static ssize_t send_stream_version_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION);
+}
+BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
+
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
+ BTRFS_ATTR_PTR(static_feature, send_stream_version),
NULL
};
@@ -809,6 +818,42 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj,
BTRFS_ATTR(, checksum, btrfs_checksum_show);
+static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ const char *str;
+
+ switch (READ_ONCE(fs_info->exclusive_operation)) {
+ case BTRFS_EXCLOP_NONE:
+ str = "none\n";
+ break;
+ case BTRFS_EXCLOP_BALANCE:
+ str = "balance\n";
+ break;
+ case BTRFS_EXCLOP_DEV_ADD:
+ str = "device add\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REMOVE:
+ str = "device remove\n";
+ break;
+ case BTRFS_EXCLOP_DEV_REPLACE:
+ str = "device replace\n";
+ break;
+ case BTRFS_EXCLOP_RESIZE:
+ str = "resize\n";
+ break;
+ case BTRFS_EXCLOP_SWAP_ACTIVATE:
+ str = "swap activate\n";
+ break;
+ default:
+ str = "UNKNOWN\n";
+ break;
+ }
+ return scnprintf(buf, PAGE_SIZE, "%s", str);
+}
+BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -817,6 +862,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, quota_override),
BTRFS_ATTR_PTR(, metadata_uuid),
BTRFS_ATTR_PTR(, checksum),
+ BTRFS_ATTR_PTR(, exclusive_operation),
NULL,
};
@@ -935,12 +981,24 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
}
}
+static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list)
+ btrfs_sysfs_remove_device(device);
+ }
+}
+
void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
{
struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj;
- btrfs_reset_fs_info_ptr(fs_info);
-
sysfs_remove_link(fsid_kobj, "bdi");
if (fs_info->space_info_kobj) {
@@ -964,7 +1022,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
addrm_unknown_feature_attrs(fs_info, false);
sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
sysfs_remove_files(fsid_kobj, btrfs_attrs);
- btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL);
+ btrfs_sysfs_remove_fs_devices(fs_info->fs_devices);
}
static const char * const btrfs_feature_set_names[FEAT_MAX] = {
@@ -973,7 +1031,7 @@ static const char * const btrfs_feature_set_names[FEAT_MAX] = {
[FEAT_INCOMPAT] = "incompat",
};
-const char * const btrfs_feature_set_name(enum btrfs_feature_set set)
+const char *btrfs_feature_set_name(enum btrfs_feature_set set)
{
return btrfs_feature_set_names[set];
}
@@ -1079,17 +1137,38 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache)
rkobj->flags = cache->flags;
kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+
+ /*
+ * We call this either on mount, or if we've created a block group for a
+ * new index type while running (i.e. when restriping). The running
+ * case is tricky because we could race with other threads, so we need
+ * to have this check to make sure we didn't already init the kobject.
+ *
+ * We don't have to protect on the free side because it only happens on
+ * unmount.
+ */
+ spin_lock(&space_info->lock);
+ if (space_info->block_group_kobjs[index]) {
+ spin_unlock(&space_info->lock);
+ kobject_put(&rkobj->kobj);
+ return;
+ } else {
+ space_info->block_group_kobjs[index] = &rkobj->kobj;
+ }
+ spin_unlock(&space_info->lock);
+
ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s",
btrfs_bg_type_to_raid_name(rkobj->flags));
memalloc_nofs_restore(nofs_flag);
if (ret) {
+ spin_lock(&space_info->lock);
+ space_info->block_group_kobjs[index] = NULL;
+ spin_unlock(&space_info->lock);
kobject_put(&rkobj->kobj);
btrfs_warn(fs_info,
"failed to add kobject for block cache, ignoring");
return;
}
-
- space_info->block_group_kobjs[index] = &rkobj->kobj;
}
/*
@@ -1151,48 +1230,30 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
return 0;
}
-/* when one_device is NULL, it removes all device links */
-
-int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device)
+void btrfs_sysfs_remove_device(struct btrfs_device *device)
{
struct hd_struct *disk;
struct kobject *disk_kobj;
+ struct kobject *devices_kobj;
- if (!fs_devices->devices_kobj)
- return -EINVAL;
-
- if (one_device) {
- if (one_device->bdev) {
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_devices->devices_kobj,
- disk_kobj->name);
- }
-
- kobject_del(&one_device->devid_kobj);
- kobject_put(&one_device->devid_kobj);
-
- wait_for_completion(&one_device->kobj_unregister);
+ /*
+ * Seed fs_devices devices_kobj aren't used, fetch kobject from the
+ * fs_info::fs_devices.
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ ASSERT(devices_kobj);
- return 0;
+ if (device->bdev) {
+ disk = device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+ sysfs_remove_link(devices_kobj, disk_kobj->name);
}
- list_for_each_entry(one_device, &fs_devices->devices, dev_list) {
-
- if (one_device->bdev) {
- disk = one_device->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
- sysfs_remove_link(fs_devices->devices_kobj,
- disk_kobj->name);
- }
- kobject_del(&one_device->devid_kobj);
- kobject_put(&one_device->devid_kobj);
-
- wait_for_completion(&one_device->kobj_unregister);
+ if (device->devid_kobj.state_initialized) {
+ kobject_del(&device->devid_kobj);
+ kobject_put(&device->devid_kobj);
+ wait_for_completion(&device->kobj_unregister);
}
-
- return 0;
}
static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj,
@@ -1273,44 +1334,80 @@ static struct kobj_type devid_ktype = {
.release = btrfs_release_devid_kobj,
};
-int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device)
+int btrfs_sysfs_add_device(struct btrfs_device *device)
{
- int error = 0;
- struct btrfs_device *dev;
+ int ret;
unsigned int nofs_flag;
+ struct kobject *devices_kobj;
+ struct kobject *devinfo_kobj;
- nofs_flag = memalloc_nofs_save();
- list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ /*
+ * Make sure we use the fs_info::fs_devices to fetch the kobjects even
+ * for the seed fs_devices
+ */
+ devices_kobj = device->fs_info->fs_devices->devices_kobj;
+ devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj;
+ ASSERT(devices_kobj);
+ ASSERT(devinfo_kobj);
- if (one_device && one_device != dev)
- continue;
+ nofs_flag = memalloc_nofs_save();
- if (dev->bdev) {
- struct hd_struct *disk;
- struct kobject *disk_kobj;
+ if (device->bdev) {
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
- disk = dev->bdev->bd_part;
- disk_kobj = &part_to_dev(disk)->kobj;
+ disk = device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
- error = sysfs_create_link(fs_devices->devices_kobj,
- disk_kobj, disk_kobj->name);
- if (error)
- break;
+ ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name);
+ if (ret) {
+ btrfs_warn(device->fs_info,
+ "creating sysfs device link for devid %llu failed: %d",
+ device->devid, ret);
+ goto out;
}
+ }
- init_completion(&dev->kobj_unregister);
- error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype,
- fs_devices->devinfo_kobj, "%llu",
- dev->devid);
- if (error) {
- kobject_put(&dev->devid_kobj);
- break;
- }
+ init_completion(&device->kobj_unregister);
+ ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype,
+ devinfo_kobj, "%llu", device->devid);
+ if (ret) {
+ kobject_put(&device->devid_kobj);
+ btrfs_warn(device->fs_info,
+ "devinfo init for devid %llu failed: %d",
+ device->devid, ret);
}
+
+out:
memalloc_nofs_restore(nofs_flag);
+ return ret;
+}
- return error;
+static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+ int ret;
+ struct btrfs_device *device;
+ struct btrfs_fs_devices *seed;
+
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+
+ list_for_each_entry(seed, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed->devices, dev_list) {
+ ret = btrfs_sysfs_add_device(device);
+ if (ret)
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ btrfs_sysfs_remove_fs_devices(fs_devices);
+ return ret;
}
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
@@ -1324,8 +1421,8 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
&disk_to_dev(bdev->bd_disk)->kobj);
}
-void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
- const u8 *fsid)
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices)
+
{
char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
@@ -1333,7 +1430,7 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
* Sprouting changes fsid of the mounted filesystem, rename the fsid
* directory
*/
- snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid);
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid);
if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
btrfs_warn(fs_devices->fs_info,
"sysfs: failed to create fsid for sprout");
@@ -1400,15 +1497,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
- btrfs_set_fs_info_ptr(fs_info);
-
- error = btrfs_sysfs_add_devices_dir(fs_devs, NULL);
+ error = btrfs_sysfs_add_fs_devices(fs_devs);
if (error)
return error;
error = sysfs_create_files(fsid_kobj, btrfs_attrs);
if (error) {
- btrfs_sysfs_remove_devices_dir(fs_devs, NULL);
+ btrfs_sysfs_remove_fs_devices(fs_devs);
return error;
}
@@ -1626,12 +1721,16 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
{
struct btrfs_fs_devices *fs_devs;
struct kobject *fsid_kobj;
- u64 features;
- int ret;
+ u64 __maybe_unused features;
+ int __maybe_unused ret;
if (!fs_info)
return;
+ /*
+ * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not
+ * safe when called from some contexts (eg. balance)
+ */
features = get_features(fs_info, set);
ASSERT(bit & supported_feature_masks[set]);
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index cf839c46a131..bacef43f7267 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -13,15 +13,12 @@ enum btrfs_feature_set {
};
char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
-const char * const btrfs_feature_set_name(enum btrfs_feature_set set);
-int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device);
-int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices,
- struct btrfs_device *one_device);
+const char *btrfs_feature_set_name(enum btrfs_feature_set set);
+int btrfs_sysfs_add_device(struct btrfs_device *device);
+void btrfs_sysfs_remove_device(struct btrfs_device *device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs);
void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
-void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices,
- const u8 *fsid);
+void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices);
void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
u64 bit, enum btrfs_feature_set set);
void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action);
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index a1b9f9b5978e..df54cdfdc250 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -60,8 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = 0;
- setup_items_for_insert(root, path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, path, &key, &value_len, 1);
item = btrfs_item_nr(0);
write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
value_len);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 894a63a92236..e6719f7db386 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -33,8 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = start;
- setup_items_for_insert(root, &path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, &path, &key, &value_len, 1);
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
btrfs_set_file_extent_generation(leaf, fi, 1);
btrfs_set_file_extent_type(leaf, fi, type);
@@ -64,8 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- setup_items_for_insert(root, &path, &key, &value_len, value_len,
- value_len + sizeof(struct btrfs_item), 1);
+ setup_items_for_insert(root, &path, &key, &value_len, 1);
}
/*
@@ -951,7 +949,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
BTRFS_I(inode)->root = root;
- btrfs_test_inode_set_ops(inode);
/* [BTRFS_MAX_EXTENT_SIZE] */
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 20c6ac1a5de7..52ada47aff50 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -292,6 +292,8 @@ loop:
}
cur_trans->fs_info = fs_info;
+ atomic_set(&cur_trans->pending_ordered, 0);
+ init_waitqueue_head(&cur_trans->pending_wait);
atomic_set(&cur_trans->num_writers, 1);
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
@@ -1182,7 +1184,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
- 0, &eb);
+ 0, &eb, BTRFS_NESTING_COW);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
@@ -1587,7 +1589,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_root_otransid(new_root_item, trans->transid);
old = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
+ ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
+ BTRFS_NESTING_COW);
if (ret) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
@@ -1636,6 +1639,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
+ pending->snap = NULL;
btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -2164,6 +2168,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
btrfs_wait_delalloc_flush(trans);
+ /*
+ * Wait for all ordered extents started by a fast fsync that joined this
+ * transaction. Otherwise if this transaction commits before the ordered
+ * extents complete we lose logged data after a power failure.
+ */
+ wait_event(cur_trans->pending_wait,
+ atomic_read(&cur_trans->pending_ordered) == 0);
+
btrfs_scrub_pause(fs_info);
/*
* Ok now we need to make sure to block out any other joins while we
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d60b055b8695..858d9153a1cd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -85,6 +85,13 @@ struct btrfs_transaction {
spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
struct btrfs_fs_info *fs_info;
+
+ /*
+ * Number of ordered extents the transaction must wait for before
+ * committing. These are ordered extents started by a fast fsync.
+ */
+ atomic_t pending_ordered;
+ wait_queue_head_t pending_wait;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -105,6 +112,7 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH)
#define BTRFS_SEND_TRANS_STUB ((void *)1)
+#define BTRFS_DIO_SYNC_STUB ((void *)2)
struct btrfs_trans_handle {
u64 transid;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 517b44300a05..f0ffd5ee77bd 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -984,7 +984,7 @@ static int check_inode_item(struct extent_buffer *leaf,
/* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */
if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) {
inode_item_err(leaf, slot,
- "invalid inode generation: has %llu expect [0, %llu]",
+ "invalid inode transid: has %llu expect [0, %llu]",
btrfs_inode_transid(leaf, iitem), super_gen + 1);
return -EUCLEAN;
}
@@ -1035,7 +1035,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
int slot)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
- struct btrfs_root_item ri;
+ struct btrfs_root_item ri = { 0 };
const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
BTRFS_ROOT_SUBVOL_DEAD;
int ret;
@@ -1044,14 +1044,21 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key,
if (ret < 0)
return ret;
- if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) {
+ if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
+ btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) {
generic_err(leaf, slot,
- "invalid root item size, have %u expect %zu",
- btrfs_item_size_nr(leaf, slot), sizeof(ri));
+ "invalid root item size, have %u expect %zu or %u",
+ btrfs_item_size_nr(leaf, slot), sizeof(ri),
+ btrfs_legacy_root_item_size());
}
+ /*
+ * For legacy root item, the members starting at generation_v2 will be
+ * all filled with 0.
+ * And since we allow geneartion_v2 as 0, it will still pass the check.
+ */
read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
- sizeof(ri));
+ btrfs_item_size_nr(leaf, slot));
/* Generation related */
if (btrfs_root_generation(&ri) >
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 39da9db35278..56cbc1706b6f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -96,8 +96,6 @@ enum {
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -176,7 +174,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
- if (ctx) {
+ if (ctx && !ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
@@ -215,9 +213,7 @@ static int join_running_log_trans(struct btrfs_root *root)
*/
void btrfs_pin_log_trans(struct btrfs_root *root)
{
- mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
- mutex_unlock(&root->log_mutex);
}
/*
@@ -3615,6 +3611,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
* search and this search we'll not find the key again and can just
* bail.
*/
+search:
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
if (ret != 0)
goto done;
@@ -3634,6 +3631,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
if (min_key.objectid != ino || min_key.type != key_type)
goto done;
+
+ if (need_resched()) {
+ btrfs_release_path(path);
+ cond_resched();
+ goto search;
+ }
+
ret = overwrite_item(trans, log, dst_path, src, i,
&min_key);
if (ret) {
@@ -4082,10 +4086,14 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_root *log_root,
- const struct extent_map *em)
+ const struct extent_map *em,
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
u64 csum_offset;
u64 csum_len;
+ u64 mod_start = em->mod_start;
+ u64 mod_len = em->mod_len;
LIST_HEAD(ordered_sums);
int ret = 0;
@@ -4094,13 +4102,71 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
em->block_start == EXTENT_MAP_HOLE)
return 0;
+ list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
+ const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
+ const u64 mod_end = mod_start + mod_len;
+ struct btrfs_ordered_sum *sums;
+
+ if (mod_len == 0)
+ break;
+
+ if (ordered_end <= mod_start)
+ continue;
+ if (mod_end <= ordered->file_offset)
+ break;
+
+ /*
+ * We are going to copy all the csums on this ordered extent, so
+ * go ahead and adjust mod_start and mod_len in case this ordered
+ * extent has already been logged.
+ */
+ if (ordered->file_offset > mod_start) {
+ if (ordered_end >= mod_end)
+ mod_len = ordered->file_offset - mod_start;
+ /*
+ * If we have this case
+ *
+ * |--------- logged extent ---------|
+ * |----- ordered extent ----|
+ *
+ * Just don't mess with mod_start and mod_len, we'll
+ * just end up logging more csums than we need and it
+ * will be ok.
+ */
+ } else {
+ if (ordered_end < mod_end) {
+ mod_len = mod_end - ordered_end;
+ mod_start = ordered_end;
+ } else {
+ mod_len = 0;
+ }
+ }
+
+ /*
+ * To keep us from looping for the above case of an ordered
+ * extent that falls inside of the logged extent.
+ */
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
+ continue;
+
+ list_for_each_entry(sums, &ordered->list, list) {
+ ret = log_csums(trans, inode, log_root, sums);
+ if (ret)
+ return ret;
+ }
+ }
+
+ /* We're done, found all csums in the ordered extents. */
+ if (mod_len == 0)
+ return 0;
+
/* If we're compressed we have to save the entire range of csums. */
if (em->compress_type) {
csum_offset = 0;
csum_len = max(em->block_len, em->orig_block_len);
} else {
- csum_offset = em->mod_start - em->start;
- csum_len = em->mod_len;
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
}
/* block start is already adjusted for the file extent offset. */
@@ -4140,7 +4206,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
int ret;
int extent_inserted = 0;
- ret = log_extent_csums(trans, inode, log, em);
+ ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
return ret;
@@ -4342,10 +4408,10 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
- const u64 start,
- const u64 end)
+ struct btrfs_log_ctx *ctx)
{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
struct extent_map *em, *n;
struct list_head extents;
struct extent_map_tree *tree = &inode->extent_tree;
@@ -4359,23 +4425,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
test_gen = root->fs_info->last_trans_committed;
list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
- /*
- * Skip extents outside our logging range. It's important to do
- * it for correctness because if we don't ignore them, we may
- * log them before their ordered extent completes, and therefore
- * we could log them without logging their respective checksums
- * (the checksum items are added to the csum tree at the very
- * end of btrfs_finish_ordered_io()). Also leave such extents
- * outside of our range in the list, since we may have another
- * ranged fsync in the near future that needs them. If an extent
- * outside our range corresponds to a hole, log it to avoid
- * leaving gaps between extents (fsck will complain when we are
- * not using the NO_HOLES feature).
- */
- if ((em->start > end || em->start + em->len <= start) &&
- em->block_start != EXTENT_MAP_HOLE)
- continue;
-
list_del_init(&em->list);
/*
* Just an arbitrary number, this can be really CPU intensive
@@ -4434,8 +4483,32 @@ process:
btrfs_release_path(path);
if (!ret)
ret = btrfs_log_prealloc_extents(trans, inode, path);
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * We have logged all extents successfully, now make sure the commit of
+ * the current transaction waits for the ordered extents to complete
+ * before it commits and wipes out the log trees, otherwise we would
+ * lose data if an ordered extents completes after the transaction
+ * commits and a power failure happens after the transaction commit.
+ */
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
+
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ spin_lock_irq(&inode->ordered_tree.lock);
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+ atomic_inc(&trans->transaction->pending_ordered);
+ }
+ spin_unlock_irq(&inode->ordered_tree.lock);
+ }
+ btrfs_put_ordered_extent(ordered);
+ }
+
+ return 0;
}
static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
@@ -4841,7 +4914,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
ret = btrfs_log_inode(trans, root,
BTRFS_I(inode),
LOG_OTHER_INODE_ALL,
- 0, LLONG_MAX, ctx);
+ ctx);
btrfs_add_delayed_iput(inode);
}
}
@@ -4883,7 +4956,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* Check the inode's logged_trans only instead of
* btrfs_inode_in_log(). This is because the last_log_commit of
* the inode is not updated when we only log that it exists and
- * and it has the full sync bit set (see btrfs_log_inode()).
+ * it has the full sync bit set (see btrfs_log_inode()).
*/
if (BTRFS_I(inode)->logged_trans == trans->transid) {
spin_unlock(&BTRFS_I(inode)->lock);
@@ -4899,7 +4972,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* log with the new name before we unpin it.
*/
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
+ LOG_OTHER_INODE, ctx);
if (ret) {
btrfs_add_delayed_iput(inode);
continue;
@@ -5112,8 +5185,6 @@ next_key:
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
int inode_only,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct btrfs_path *path;
@@ -5292,7 +5363,7 @@ log_extents:
}
if (fast_search) {
ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
- ctx, start, end);
+ ctx);
if (ret) {
err = ret;
goto out_unlock;
@@ -5301,31 +5372,8 @@ log_extents:
struct extent_map *em, *n;
write_lock(&em_tree->lock);
- /*
- * We can't just remove every em if we're called for a ranged
- * fsync - that is, one that doesn't cover the whole possible
- * file range (0 to LLONG_MAX). This is because we can have
- * em's that fall outside the range we're logging and therefore
- * their ordered operations haven't completed yet
- * (btrfs_finish_ordered_io() not invoked yet). This means we
- * didn't get their respective file extent item in the fs/subvol
- * tree yet, and need to let the next fast fsync (one which
- * consults the list of modified extent maps) find the em so
- * that it logs a matching file extent item and waits for the
- * respective ordered operation to complete (if it's still
- * running).
- *
- * Removing every em outside the range we're logging would make
- * the next fast fsync not log their matching file extent items,
- * therefore making us lose data after a log replay.
- */
- list_for_each_entry_safe(em, n, &em_tree->modified_extents,
- list) {
- const u64 mod_end = em->mod_start + em->mod_len - 1;
-
- if (em->mod_start >= start && mod_end <= end)
- list_del_init(&em->list);
- }
+ list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
+ list_del_init(&em->list);
write_unlock(&em_tree->lock);
}
@@ -5339,19 +5387,34 @@ log_extents:
}
/*
- * Don't update last_log_commit if we logged that an inode exists after
- * it was loaded to memory (full_sync bit set).
- * This is to prevent data loss when we do a write to the inode, then
- * the inode gets evicted after all delalloc was flushed, then we log
- * it exists (due to a rename for example) and then fsync it. This last
- * fsync would do nothing (not logging the extents previously written).
+ * If we are logging that an ancestor inode exists as part of logging a
+ * new name from a link or rename operation, don't mark the inode as
+ * logged - otherwise if an explicit fsync is made against an ancestor,
+ * the fsync considers the inode in the log and doesn't sync the log,
+ * resulting in the ancestor missing after a power failure unless the
+ * log was synced as part of an fsync against any other unrelated inode.
+ * So keep it simple for this case and just don't flag the ancestors as
+ * logged.
*/
- spin_lock(&inode->lock);
- inode->logged_trans = trans->transid;
- if (inode_only != LOG_INODE_EXISTS ||
- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
- inode->last_log_commit = inode->last_sub_trans;
- spin_unlock(&inode->lock);
+ if (!ctx ||
+ !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
+ &inode->vfs_inode != ctx->inode)) {
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
+ /*
+ * Don't update last_log_commit if we logged that an inode exists
+ * after it was loaded to memory (full_sync bit set).
+ * This is to prevent data loss when we do a write to the inode,
+ * then the inode gets evicted after all delalloc was flushed,
+ * then we log it exists (due to a rename for example) and then
+ * fsync it. This last fsync would do nothing (not logging the
+ * extents previously written).
+ */
+ if (inode_only != LOG_INODE_EXISTS ||
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
+ }
out_unlock:
mutex_unlock(&inode->log_mutex);
@@ -5591,7 +5654,7 @@ process_leaf:
if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
- log_mode, 0, LLONG_MAX, ctx);
+ log_mode, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
ret = 1;
@@ -5735,7 +5798,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (ctx)
ctx->log_new_dentries = false;
ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
- LOG_INODE_ALL, 0, LLONG_MAX, ctx);
+ LOG_INODE_ALL, ctx);
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
ret = 1;
@@ -5786,8 +5849,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (BTRFS_I(inode)->generation > last_committed)
ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
- LOG_INODE_EXISTS,
- 0, LLONG_MAX, ctx);
+ LOG_INODE_EXISTS, ctx);
btrfs_add_delayed_iput(inode);
if (ret)
return ret;
@@ -5842,7 +5904,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
if (inode->generation > fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode,
- LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
+ LOG_INODE_EXISTS, ctx);
if (ret)
break;
}
@@ -5950,8 +6012,6 @@ out:
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct dentry *parent,
- const loff_t start,
- const loff_t end,
int inode_only,
struct btrfs_log_ctx *ctx)
{
@@ -6004,7 +6064,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
+ ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
if (ret)
goto end_trans;
@@ -6100,15 +6160,13 @@ end_no_trans:
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
- start, end, LOG_INODE_ALL, ctx);
+ LOG_INODE_ALL, ctx);
dput(parent);
return ret;
@@ -6371,26 +6429,13 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
/*
* Call this after adding a new name for a file and it will properly
* update the log to reflect the new name.
- *
- * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
- * true (because it's not used).
- *
- * Return value depends on whether @sync_log is true or false.
- * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
- * otherwise.
- * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
- * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
- * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed (without attempting to sync the log).
*/
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx)
+ struct dentry *parent)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- int ret;
+ struct btrfs_log_ctx ctx;
/*
* this will force the logging code to walk the dentry chain
@@ -6405,34 +6450,17 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
*/
if (inode->logged_trans <= fs_info->last_trans_committed &&
(!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
- return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
- BTRFS_DONT_NEED_LOG_SYNC;
-
- if (sync_log) {
- struct btrfs_log_ctx ctx2;
-
- btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, &ctx2);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
-
- ret = btrfs_sync_log(trans, inode->root, &ctx2);
- if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- }
-
- ASSERT(ctx);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, ctx);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_LOG_SYNC;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
+ return;
- return BTRFS_NEED_LOG_SYNC;
+ btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
+ ctx.logging_new_name = true;
+ /*
+ * We don't care about the return value. If we fail to log the new name
+ * then we know the next attempt to sync the log will fallback to a full
+ * transaction commit (due to a call to btrfs_set_log_full_commit()), so
+ * we don't need to worry about getting a log committed that has an
+ * inconsistent state after a rename operation.
+ */
+ btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 132e43d29034..731bd9c029f5 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -16,8 +16,11 @@ struct btrfs_log_ctx {
int log_ret;
int log_transid;
bool log_new_dentries;
+ bool logging_new_name;
struct inode *inode;
struct list_head list;
+ /* Only used for fast fsyncs. */
+ struct list_head ordered_extents;
};
static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
@@ -26,8 +29,23 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->log_ret = 0;
ctx->log_transid = 0;
ctx->log_new_dentries = false;
+ ctx->logging_new_name = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
+ INIT_LIST_HEAD(&ctx->ordered_extents);
+}
+
+static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_ordered_extent *ordered;
+ struct btrfs_ordered_extent *tmp;
+
+ ASSERT(inode_is_locked(ctx->inode));
+
+ list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
}
static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans)
@@ -49,8 +67,6 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct dentry *dentry,
- const loff_t start,
- const loff_t end,
struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -67,16 +83,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
int for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
-/* Return values for btrfs_log_new_name() */
-enum {
- BTRFS_DONT_NEED_TRANS_COMMIT,
- BTRFS_NEED_TRANS_COMMIT,
- BTRFS_DONT_NEED_LOG_SYNC,
- BTRFS_NEED_LOG_SYNC,
-};
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx);
+ struct dentry *parent);
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ee96c5869f57..58b9c419a2b6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
@@ -290,8 +291,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* balance_mutex
*
*
- * Exclusive operations, BTRFS_FS_EXCL_OP
- * ======================================
+ * Exclusive operations
+ * ====================
*
* Maintains the exclusivity of the following operations that apply to the
* whole filesystem and cannot run in parallel.
@@ -317,11 +318,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* - system power-cycle and filesystem mounted as read-only
* - filesystem or device errors leading to forced read-only
*
- * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
- * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
+ * The status of exclusive operation is set and cleared atomically.
+ * During the course of Paused state, fs_info::exclusive_operation remains set.
* A device operation in Paused or Running state can be canceled or resumed
* either by ioctl (Balance only) or when remounted as read-write.
- * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
+ * The exclusive status is cleared when the device operation is canceled or
* completed.
*/
@@ -355,6 +356,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
INIT_LIST_HEAD(&fs_devs->devices);
INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->fs_list);
+ INIT_LIST_HEAD(&fs_devs->seed_list);
if (fsid)
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
@@ -405,7 +407,7 @@ void __exit btrfs_cleanup_fs_uuids(void)
* Returned struct is not linked onto any lists and must be destroyed using
* btrfs_free_device.
*/
-static struct btrfs_device *__alloc_device(void)
+static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
{
struct btrfs_device *dev;
@@ -432,7 +434,8 @@ static struct btrfs_device *__alloc_device(void)
btrfs_device_data_ordered_init(dev);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
- extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
+ extent_io_tree_init(fs_info, &dev->alloc_state,
+ IO_TREE_DEVICE_ALLOC_STATE, NULL);
return dev;
}
@@ -592,8 +595,6 @@ static int btrfs_free_stale_devices(const char *path,
btrfs_free_device(device);
ret = 0;
- if (fs_devices->num_devices == 0)
- break;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -940,16 +941,18 @@ static noinline struct btrfs_device *device_list_add(const char *path,
bdput(path_bdev);
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_warn_in_rcu(device->fs_info,
- "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
- disk_super->fsid, devid,
- rcu_str_deref(device->name), path);
+ "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
+ path, devid, found_transid,
+ current->comm,
+ task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
bdput(path_bdev);
btrfs_info_in_rcu(device->fs_info,
- "device fsid %pU devid %llu moved old:%s new:%s",
- disk_super->fsid, devid,
- rcu_str_deref(device->name), path);
+ "devid %llu device path %s changed to %s scanned by %s (%d)",
+ devid, rcu_str_deref(device->name),
+ path, current->comm,
+ task_pid_nr(current));
}
name = rcu_string_strdup(path, GFP_NOFS);
@@ -1034,28 +1037,21 @@ error:
return ERR_PTR(ret);
}
-/*
- * After we have read the system tree and know devids belonging to
- * this filesystem, remove the device which does not belong there.
- */
-void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
+ int step, struct btrfs_device **latest_dev)
{
struct btrfs_device *device, *next;
- struct btrfs_device *latest_dev = NULL;
- mutex_lock(&uuid_mutex);
-again:
/* This is the initialized path, it is safe to release the devices. */
list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
- &device->dev_state)) {
+ if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
- &device->dev_state) &&
+ &device->dev_state) &&
!test_bit(BTRFS_DEV_STATE_MISSING,
&device->dev_state) &&
- (!latest_dev ||
- device->generation > latest_dev->generation)) {
- latest_dev = device;
+ (!*latest_dev ||
+ device->generation > (*latest_dev)->generation)) {
+ *latest_dev = device;
}
continue;
}
@@ -1093,10 +1089,22 @@ again:
btrfs_free_device(device);
}
- if (fs_devices->seed) {
- fs_devices = fs_devices->seed;
- goto again;
- }
+}
+
+/*
+ * After we have read the system tree and know devids belonging to this
+ * filesystem, remove the device which does not belong there.
+ */
+void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
+{
+ struct btrfs_device *latest_dev = NULL;
+ struct btrfs_fs_devices *seed_dev;
+
+ mutex_lock(&uuid_mutex);
+ __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
+
+ list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
+ __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
fs_devices->latest_bdev = latest_dev->bdev;
@@ -1148,47 +1156,41 @@ static void btrfs_close_one_device(struct btrfs_device *device)
ASSERT(atomic_read(&device->reada_in_flight) == 0);
}
-static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
+static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *tmp;
+ lockdep_assert_held(&uuid_mutex);
+
if (--fs_devices->opened > 0)
- return 0;
+ return;
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
+ list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
btrfs_close_one_device(device);
- }
- mutex_unlock(&fs_devices->device_list_mutex);
WARN_ON(fs_devices->open_devices);
WARN_ON(fs_devices->rw_devices);
fs_devices->opened = 0;
fs_devices->seeding = false;
-
- return 0;
+ fs_devices->fs_info = NULL;
}
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_fs_devices *seed_devices = NULL;
- int ret;
+ LIST_HEAD(list);
+ struct btrfs_fs_devices *tmp;
mutex_lock(&uuid_mutex);
- ret = close_fs_devices(fs_devices);
- if (!fs_devices->opened) {
- seed_devices = fs_devices->seed;
- fs_devices->seed = NULL;
- }
- mutex_unlock(&uuid_mutex);
+ close_fs_devices(fs_devices);
+ if (!fs_devices->opened)
+ list_splice_init(&fs_devices->seed_list, &list);
- while (seed_devices) {
- fs_devices = seed_devices;
- seed_devices = fs_devices->seed;
+ list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
close_fs_devices(fs_devices);
+ list_del(&fs_devices->seed_list);
free_fs_devices(fs_devices);
}
- return ret;
+ mutex_unlock(&uuid_mutex);
}
static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
@@ -1196,17 +1198,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
{
struct btrfs_device *device;
struct btrfs_device *latest_dev = NULL;
+ struct btrfs_device *tmp_device;
flags |= FMODE_EXCL;
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
- /* Just open everything we can; ignore failures here */
- if (btrfs_open_one_device(fs_devices, device, flags, holder))
- continue;
+ list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
+ dev_list) {
+ int ret;
- if (!latest_dev ||
- device->generation > latest_dev->generation)
+ ret = btrfs_open_one_device(fs_devices, device, flags, holder);
+ if (ret == 0 &&
+ (!latest_dev || device->generation > latest_dev->generation)) {
latest_dev = device;
+ } else if (ret == -ENODATA) {
+ fs_devices->num_devices--;
+ list_del(&device->dev_list);
+ btrfs_free_device(device);
+ }
}
if (fs_devices->open_devices == 0)
return -EINVAL;
@@ -1960,16 +1968,13 @@ static struct btrfs_device * btrfs_find_next_active_device(
* this_dev) which is active.
*/
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
- struct btrfs_device *this_dev)
+ struct btrfs_device *next_device)
{
struct btrfs_fs_info *fs_info = device->fs_info;
- struct btrfs_device *next_device;
- if (this_dev)
- next_device = this_dev;
- else
+ if (!next_device)
next_device = btrfs_find_next_active_device(fs_info->fs_devices,
- device);
+ device);
ASSERT(next_device);
if (fs_info->sb->s_bdev &&
@@ -1998,9 +2003,9 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
return num_devices;
}
-static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
- struct block_device *bdev,
- const char *device_path)
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path)
{
struct btrfs_super_block *disk_super;
int copy_num;
@@ -2039,7 +2044,7 @@ static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
- u64 devid)
+ u64 devid)
{
struct btrfs_device *device;
struct btrfs_fs_devices *cur_devices;
@@ -2143,7 +2148,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (device->bdev) {
cur_devices->open_devices--;
/* remove sysfs entry */
- btrfs_sysfs_remove_devices_dir(fs_devices, device);
+ btrfs_sysfs_remove_device(device);
}
num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
@@ -2164,14 +2169,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
btrfs_free_device(device);
if (cur_devices->open_devices == 0) {
- while (fs_devices) {
- if (fs_devices->seed == cur_devices) {
- fs_devices->seed = cur_devices->seed;
- break;
- }
- fs_devices = fs_devices->seed;
- }
- cur_devices->seed = NULL;
+ list_del_init(&cur_devices->seed_list);
close_fs_devices(cur_devices);
free_fs_devices(cur_devices);
}
@@ -2220,14 +2218,9 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
{
- struct btrfs_fs_info *fs_info = srcdev->fs_info;
struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
- if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
- /* zero out the old super if it is writable */
- btrfs_scratch_superblocks(fs_info, srcdev->bdev,
- srcdev->name->str);
- }
+ mutex_lock(&uuid_mutex);
btrfs_close_bdev(srcdev);
synchronize_rcu();
@@ -2235,8 +2228,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
/* if this is no devs we rather delete the fs_devices */
if (!fs_devices->num_devices) {
- struct btrfs_fs_devices *tmp_fs_devices;
-
/*
* On a mounted FS, num_devices can't be zero unless it's a
* seed. In case of a seed device being replaced, the replace
@@ -2245,18 +2236,11 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
*/
ASSERT(fs_devices->seeding);
- tmp_fs_devices = fs_info->fs_devices;
- while (tmp_fs_devices) {
- if (tmp_fs_devices->seed == fs_devices) {
- tmp_fs_devices->seed = fs_devices->seed;
- break;
- }
- tmp_fs_devices = tmp_fs_devices->seed;
- }
- fs_devices->seed = NULL;
+ list_del_init(&fs_devices->seed_list);
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
}
+ mutex_unlock(&uuid_mutex);
}
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
@@ -2265,7 +2249,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
mutex_lock(&fs_devices->device_list_mutex);
- btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev);
+ btrfs_sysfs_remove_device(tgtdev);
if (tgtdev->bdev)
fs_devices->open_devices--;
@@ -2374,10 +2358,20 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
if (!fs_devices->seeding)
return -EINVAL;
+ /*
+ * Private copy of the seed devices, anchored at
+ * fs_info->fs_devices->seed_list
+ */
seed_devices = alloc_fs_devices(NULL, NULL);
if (IS_ERR(seed_devices))
return PTR_ERR(seed_devices);
+ /*
+ * It's necessary to retain a copy of the original seed fs_devices in
+ * fs_uuids so that filesystems which have been seeded can successfully
+ * reference the seed device from open_seed_devices. This also supports
+ * multiple fs seed.
+ */
old_devices = clone_fs_devices(fs_devices);
if (IS_ERR(old_devices)) {
kfree(seed_devices);
@@ -2398,16 +2392,12 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
list_for_each_entry(device, &seed_devices->devices, dev_list)
device->fs_devices = seed_devices;
- mutex_lock(&fs_info->chunk_mutex);
- list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
- mutex_unlock(&fs_info->chunk_mutex);
-
fs_devices->seeding = false;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
fs_devices->missing_devices = 0;
fs_devices->rotating = false;
- fs_devices->seed = seed_devices;
+ list_add(&seed_devices->seed_list, &fs_devices->seed_list);
generate_random_uuid(fs_devices->fsid);
memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
@@ -2510,7 +2500,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
u64 orig_super_num_devices;
int seeding_dev = 0;
int ret = 0;
- bool unlocked = false;
+ bool locked = false;
if (sb_rdonly(sb) && !fs_devices->seeding)
return -EROFS;
@@ -2524,20 +2514,20 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
seeding_dev = 1;
down_write(&sb->s_umount);
mutex_lock(&uuid_mutex);
+ locked = true;
}
- filemap_write_and_wait(bdev->bd_inode->i_mapping);
+ sync_blockdev(bdev);
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (device->bdev == bdev) {
ret = -EEXIST;
- mutex_unlock(
- &fs_devices->device_list_mutex);
+ rcu_read_unlock();
goto error;
}
}
- mutex_unlock(&fs_devices->device_list_mutex);
+ rcu_read_unlock();
device = btrfs_alloc_device(fs_info, NULL, NULL);
if (IS_ERR(device)) {
@@ -2612,9 +2602,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_set_super_num_devices(fs_info->super_copy,
orig_super_num_devices + 1);
- /* add sysfs device entry */
- btrfs_sysfs_add_devices_dir(fs_devices, device);
-
/*
* we've got more storage, clear any full flags on the space
* infos
@@ -2622,6 +2609,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
btrfs_clear_space_info_full(fs_info);
mutex_unlock(&fs_info->chunk_mutex);
+
+ /* Add sysfs device entry */
+ btrfs_sysfs_add_device(device);
+
mutex_unlock(&fs_devices->device_list_mutex);
if (seeding_dev) {
@@ -2647,8 +2638,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
goto error_sysfs;
}
- btrfs_sysfs_update_sprout_fsid(fs_devices,
- fs_info->fs_devices->fsid);
+ /*
+ * fs_devices now represents the newly sprouted filesystem and
+ * its fsid has been changed by btrfs_prepare_sprout
+ */
+ btrfs_sysfs_update_sprout_fsid(fs_devices);
}
ret = btrfs_commit_transaction(trans);
@@ -2656,7 +2650,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
if (seeding_dev) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
- unlocked = true;
+ locked = false;
if (ret) /* transaction commit */
return ret;
@@ -2691,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
return ret;
error_sysfs:
- btrfs_sysfs_remove_devices_dir(fs_devices, device);
+ btrfs_sysfs_remove_device(device);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_info->chunk_mutex);
list_del_rcu(&device->dev_list);
@@ -2717,7 +2711,7 @@ error_free_device:
btrfs_free_device(device);
error:
blkdev_put(bdev, FMODE_EXCL);
- if (seeding_dev && !unlocked) {
+ if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
}
@@ -4044,7 +4038,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
/*
* rw_devices will not change at the moment, device add/delete/replace
- * are excluded by EXCL_OP
+ * are exclusive
*/
num_devices = fs_info->fs_devices->rw_devices;
@@ -4180,7 +4174,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
balance_need_close(fs_info)) {
reset_balance_state(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
}
wake_up(&fs_info->balance_wait_q);
@@ -4191,7 +4185,7 @@ out:
reset_balance_state(fs_info);
else
kfree(bctl);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
return ret;
}
@@ -4293,7 +4287,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
* is in a paused state and must have fs_info::balance_ctl properly
* set up.
*/
- if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
btrfs_warn(fs_info,
"balance: cannot set exclusive op status, resume manually");
@@ -4375,7 +4369,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
if (fs_info->balance_ctl) {
reset_balance_state(fs_info);
- clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
+ btrfs_exclop_finish(fs_info);
btrfs_info(fs_info, "balance: canceled");
}
}
@@ -4462,6 +4456,7 @@ int btrfs_uuid_scan_kthread(void *data)
goto skip;
}
update_tree:
+ btrfs_release_path(path);
if (!btrfs_is_empty_uuid(root_item.uuid)) {
ret = btrfs_uuid_tree_add(trans, root_item.uuid,
BTRFS_UUID_KEY_SUBVOL,
@@ -4486,6 +4481,7 @@ update_tree:
}
skip:
+ btrfs_release_path(path);
if (trans) {
ret = btrfs_end_transaction(trans);
trans = NULL;
@@ -4493,7 +4489,6 @@ skip:
break;
}
- btrfs_release_path(path);
if (key.offset < (u64)-1) {
key.offset++;
} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
@@ -6459,11 +6454,21 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
bool seed)
{
struct btrfs_device *device;
+ struct btrfs_fs_devices *seed_devs;
+
+ if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
+ if (device->devid == devid &&
+ (!uuid || memcmp(device->uuid, uuid,
+ BTRFS_UUID_SIZE) == 0))
+ return device;
+ }
+ }
- while (fs_devices) {
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
if (!fsid ||
- !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
- list_for_each_entry(device, &fs_devices->devices,
+ !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
+ list_for_each_entry(device, &seed_devs->devices,
dev_list) {
if (device->devid == devid &&
(!uuid || memcmp(device->uuid, uuid,
@@ -6471,11 +6476,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
return device;
}
}
- if (seed)
- fs_devices = fs_devices->seed;
- else
- return NULL;
}
+
return NULL;
}
@@ -6483,8 +6485,17 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *dev_uuid)
{
struct btrfs_device *device;
+ unsigned int nofs_flag;
+ /*
+ * We call this under the chunk_mutex, so we want to use NOFS for this
+ * allocation, however we don't want to change btrfs_alloc_device() to
+ * always do NOFS because we use it in a lot of other GFP_KERNEL safe
+ * places.
+ */
+ nofs_flag = memalloc_nofs_save();
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device))
return device;
@@ -6521,7 +6532,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);
- dev = __alloc_device();
+ dev = __alloc_device(fs_info);
if (IS_ERR(dev))
return dev;
@@ -6717,13 +6728,11 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
lockdep_assert_held(&uuid_mutex);
ASSERT(fsid);
- fs_devices = fs_info->fs_devices->seed;
- while (fs_devices) {
+ /* This will match only for multi-device seed fs */
+ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
return fs_devices;
- fs_devices = fs_devices->seed;
- }
fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
@@ -6739,6 +6748,10 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
return fs_devices;
}
+ /*
+ * Upon first call for a seed fs fsid, just create a private copy of the
+ * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
+ */
fs_devices = clone_fs_devices(fs_devices);
if (IS_ERR(fs_devices))
return fs_devices;
@@ -6746,20 +6759,17 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
if (ret) {
free_fs_devices(fs_devices);
- fs_devices = ERR_PTR(ret);
- goto out;
+ return ERR_PTR(ret);
}
if (!fs_devices->seeding) {
close_fs_devices(fs_devices);
free_fs_devices(fs_devices);
- fs_devices = ERR_PTR(-EINVAL);
- goto out;
+ return ERR_PTR(-EINVAL);
}
- fs_devices->seed = fs_info->fs_devices->seed;
- fs_info->fs_devices->seed = fs_devices;
-out:
+ list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
+
return fs_devices;
}
@@ -7178,17 +7188,22 @@ error:
void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
- while (fs_devices) {
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list)
+ fs_devices->fs_info = fs_info;
+
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->fs_info = fs_info;
+
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list)
device->fs_info = fs_info;
- mutex_unlock(&fs_devices->device_list_mutex);
- fs_devices = fs_devices->seed;
+ seed_devs->fs_info = fs_info;
}
+ mutex_unlock(&fs_devices->device_list_mutex);
}
static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
@@ -7214,17 +7229,53 @@ static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
sizeof(val));
}
-int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+static int btrfs_device_init_dev_stats(struct btrfs_device *device,
+ struct btrfs_path *path)
{
- struct btrfs_key key;
- struct btrfs_root *dev_root = fs_info->dev_root;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_dev_stats_item *ptr;
struct extent_buffer *eb;
- int slot;
- int ret = 0;
+ struct btrfs_key key;
+ int item_size;
+ int i, ret, slot;
+
+ key.objectid = BTRFS_DEV_STATS_OBJECTID;
+ key.type = BTRFS_PERSISTENT_ITEM_KEY;
+ key.offset = device->devid;
+ ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
+ if (ret) {
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+ btrfs_dev_stat_set(device, i, 0);
+ device->dev_stats_valid = 1;
+ btrfs_release_path(path);
+ return ret < 0 ? ret : 0;
+ }
+ slot = path->slots[0];
+ eb = path->nodes[0];
+ item_size = btrfs_item_size_nr(eb, slot);
+
+ ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
+
+ for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+ if (item_size >= (1 + i) * sizeof(__le64))
+ btrfs_dev_stat_set(device, i,
+ btrfs_dev_stats_value(eb, ptr, i));
+ else
+ btrfs_dev_stat_set(device, i, 0);
+ }
+
+ device->dev_stats_valid = 1;
+ btrfs_dev_stat_print_on_load(device);
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
struct btrfs_device *device;
struct btrfs_path *path = NULL;
- int i;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
@@ -7232,43 +7283,22 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
- int item_size;
- struct btrfs_dev_stats_item *ptr;
-
- key.objectid = BTRFS_DEV_STATS_OBJECTID;
- key.type = BTRFS_PERSISTENT_ITEM_KEY;
- key.offset = device->devid;
- ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
- if (ret) {
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
- btrfs_dev_stat_set(device, i, 0);
- device->dev_stats_valid = 1;
- btrfs_release_path(path);
- continue;
- }
- slot = path->slots[0];
- eb = path->nodes[0];
- item_size = btrfs_item_size_nr(eb, slot);
-
- ptr = btrfs_item_ptr(eb, slot,
- struct btrfs_dev_stats_item);
-
- for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
- if (item_size >= (1 + i) * sizeof(__le64))
- btrfs_dev_stat_set(device, i,
- btrfs_dev_stats_value(eb, ptr, i));
- else
- btrfs_dev_stat_set(device, i, 0);
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
+ }
+ list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
+ list_for_each_entry(device, &seed_devs->devices, dev_list) {
+ ret = btrfs_device_init_dev_stats(device, path);
+ if (ret)
+ goto out;
}
-
- device->dev_stats_valid = 1;
- btrfs_dev_stat_print_on_load(device);
- btrfs_release_path(path);
}
+out:
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_free_path(path);
- return ret < 0 ? ret : 0;
+ return ret;
}
static int update_dev_stat_item(struct btrfs_trans_handle *trans,
@@ -7485,24 +7515,6 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
mutex_unlock(&trans->fs_info->chunk_mutex);
}
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- while (fs_devices) {
- fs_devices->fs_info = fs_info;
- fs_devices = fs_devices->seed;
- }
-}
-
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- while (fs_devices) {
- fs_devices->fs_info = NULL;
- fs_devices = fs_devices->seed;
- }
-}
-
/*
* Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
*/
@@ -7583,8 +7595,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
/* It's possible this device is a dummy for seed device */
if (dev->disk_total_bytes == 0) {
- dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
- NULL, false);
+ struct btrfs_fs_devices *devs;
+
+ devs = list_first_entry(&fs_info->fs_devices->seed_list,
+ struct btrfs_fs_devices, seed_list);
+ dev = btrfs_find_device(devs, devid, NULL, NULL, false);
if (!dev) {
btrfs_err(fs_info, "failed to find seed devid %llu",
devid);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5eea93916fbf..bf27ac07d315 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -58,7 +58,7 @@ struct btrfs_device {
struct btrfs_fs_devices *fs_devices;
struct btrfs_fs_info *fs_info;
- struct rcu_string *name;
+ struct rcu_string __rcu *name;
u64 generation;
@@ -246,7 +246,7 @@ struct btrfs_fs_devices {
*/
struct list_head alloc_list;
- struct btrfs_fs_devices *seed;
+ struct list_head seed_list;
bool seeding;
int opened;
@@ -435,7 +435,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *btrfs_scan_one_device(const char *path,
fmode_t flags, void *holder);
int btrfs_forget_devices(const char *path);
-int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev);
@@ -569,10 +569,11 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags)
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);
-void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
-void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev);
+void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev,
+ const char *device_path);
int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 3989d08396ac..1f75b25e559a 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1017,6 +1017,8 @@ handle_mnt_opt:
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) {
rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, true,
full_path, fid);
+ if (rc == -EREMOTE)
+ rc = 0;
if (rc) {
cifs_dbg(FYI, "%s: Get mode from SID failed. rc=%d\n",
__func__, rc);
@@ -1025,6 +1027,8 @@ handle_mnt_opt:
} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
rc = cifs_acl_to_fattr(cifs_sb, &fattr, *inode, false,
full_path, fid);
+ if (rc == -EREMOTE)
+ rc = 0;
if (rc) {
cifs_dbg(FYI, "%s: Getting ACL failed with error: %d\n",
__func__, rc);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 32f90dc82c84..d44df8f95bcd 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1208,7 +1208,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
rqst[1].rq_iov = si_iov;
rqst[1].rq_nvec = 1;
- len = sizeof(ea) + ea_name_len + ea_value_len + 1;
+ len = sizeof(*ea) + ea_name_len + ea_value_len + 1;
ea = kzalloc(len, GFP_KERNEL);
if (ea == NULL) {
rc = -ENOMEM;
diff --git a/fs/compat.c b/fs/compat.c
deleted file mode 100644
index 436d228cf71c..000000000000
--- a/fs/compat.c
+++ /dev/null
@@ -1,132 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/fs/compat.c
- *
- * Kernel compatibililty routines for e.g. 32 bit syscall support
- * on 64 bit kernels.
- *
- * Copyright (C) 2002 Stephen Rothwell, IBM Corporation
- * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com)
- * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
- * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs
- * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz)
- */
-
-#include <linux/compat.h>
-#include <linux/nfs4_mount.h>
-#include <linux/syscalls.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-#include "internal.h"
-
-struct compat_nfs_string {
- compat_uint_t len;
- compat_uptr_t data;
-};
-
-static inline void compat_nfs_string(struct nfs_string *dst,
- struct compat_nfs_string *src)
-{
- dst->data = compat_ptr(src->data);
- dst->len = src->len;
-}
-
-struct compat_nfs4_mount_data_v1 {
- compat_int_t version;
- compat_int_t flags;
- compat_int_t rsize;
- compat_int_t wsize;
- compat_int_t timeo;
- compat_int_t retrans;
- compat_int_t acregmin;
- compat_int_t acregmax;
- compat_int_t acdirmin;
- compat_int_t acdirmax;
- struct compat_nfs_string client_addr;
- struct compat_nfs_string mnt_path;
- struct compat_nfs_string hostname;
- compat_uint_t host_addrlen;
- compat_uptr_t host_addr;
- compat_int_t proto;
- compat_int_t auth_flavourlen;
- compat_uptr_t auth_flavours;
-};
-
-static int do_nfs4_super_data_conv(void *raw_data)
-{
- int version = *(compat_uint_t *) raw_data;
-
- if (version == 1) {
- struct compat_nfs4_mount_data_v1 *raw = raw_data;
- struct nfs4_mount_data *real = raw_data;
-
- /* copy the fields backwards */
- real->auth_flavours = compat_ptr(raw->auth_flavours);
- real->auth_flavourlen = raw->auth_flavourlen;
- real->proto = raw->proto;
- real->host_addr = compat_ptr(raw->host_addr);
- real->host_addrlen = raw->host_addrlen;
- compat_nfs_string(&real->hostname, &raw->hostname);
- compat_nfs_string(&real->mnt_path, &raw->mnt_path);
- compat_nfs_string(&real->client_addr, &raw->client_addr);
- real->acdirmax = raw->acdirmax;
- real->acdirmin = raw->acdirmin;
- real->acregmax = raw->acregmax;
- real->acregmin = raw->acregmin;
- real->retrans = raw->retrans;
- real->timeo = raw->timeo;
- real->wsize = raw->wsize;
- real->rsize = raw->rsize;
- real->flags = raw->flags;
- real->version = raw->version;
- }
-
- return 0;
-}
-
-#define NFS4_NAME "nfs4"
-
-COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
- const char __user *, dir_name,
- const char __user *, type, compat_ulong_t, flags,
- const void __user *, data)
-{
- char *kernel_type;
- void *options;
- char *kernel_dev;
- int retval;
-
- kernel_type = copy_mount_string(type);
- retval = PTR_ERR(kernel_type);
- if (IS_ERR(kernel_type))
- goto out;
-
- kernel_dev = copy_mount_string(dev_name);
- retval = PTR_ERR(kernel_dev);
- if (IS_ERR(kernel_dev))
- goto out1;
-
- options = copy_mount_options(data);
- retval = PTR_ERR(options);
- if (IS_ERR(options))
- goto out2;
-
- if (kernel_type && options) {
- if (!strcmp(kernel_type, NFS4_NAME)) {
- retval = -EINVAL;
- if (do_nfs4_super_data_conv(options))
- goto out3;
- }
- }
-
- retval = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
-
- out3:
- kfree(options);
- out2:
- kfree(kernel_dev);
- out1:
- kfree(kernel_type);
- out:
- return retval;
-}
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 9212325763b0..4ef3f714046a 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -343,9 +343,11 @@ void fscrypt_msg(const struct inode *inode, const char *level,
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- if (inode)
+ if (inode && inode->i_ino)
printk("%sfscrypt (%s, inode %lu): %pV\n",
level, inode->i_sb->s_id, inode->i_ino, &vaf);
+ else if (inode)
+ printk("%sfscrypt (%s): %pV\n", level, inode->i_sb->s_id, &vaf);
else
printk("%sfscrypt: %pV\n", level, &vaf);
va_end(args);
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 011830f84d8d..1fbe6c24d705 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -61,15 +61,6 @@ struct fscrypt_nokey_name {
*/
#define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256)
-static void fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result)
-{
- struct sha256_state sctx;
-
- sha256_init(&sctx);
- sha256_update(&sctx, data, data_len);
- sha256_final(&sctx, result);
-}
-
static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
{
if (str->len == 1 && str->name[0] == '.')
@@ -242,11 +233,11 @@ static int base64_decode(const char *src, int len, u8 *dst)
return cp - dst;
}
-bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
- u32 max_len, u32 *encrypted_len_ret)
+bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret)
{
- const struct fscrypt_info *ci = inode->i_crypt_info;
- int padding = 4 << (fscrypt_policy_flags(&ci->ci_policy) &
+ int padding = 4 << (fscrypt_policy_flags(policy) &
FSCRYPT_POLICY_FLAGS_PAD_MASK);
u32 encrypted_len;
@@ -260,8 +251,6 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
/**
* fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames
- * @inode: inode of the parent directory (for regular filenames)
- * or of the symlink (for symlink targets)
* @max_encrypted_len: maximum length of encrypted filenames the buffer will be
* used to present
* @crypto_str: (output) buffer to allocate
@@ -271,8 +260,7 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
*
* Return: 0 on success, -errno on failure
*/
-int fscrypt_fname_alloc_buffer(const struct inode *inode,
- u32 max_encrypted_len,
+int fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
struct fscrypt_str *crypto_str)
{
const u32 max_encoded_len = BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX);
@@ -369,9 +357,9 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode,
} else {
memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes));
/* Compute strong hash of remaining part of name. */
- fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)],
- iname->len - sizeof(nokey_name.bytes),
- nokey_name.sha256);
+ sha256(&iname->name[sizeof(nokey_name.bytes)],
+ iname->len - sizeof(nokey_name.bytes),
+ nokey_name.sha256);
size = FSCRYPT_NOKEY_NAME_MAX;
}
oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name);
@@ -394,9 +382,9 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
* directory's encryption key, then @iname is the plaintext, so we encrypt it to
* get the disk_name.
*
- * Else, for keyless @lookup operations, @iname is the presented ciphertext, so
- * we decode it to get the fscrypt_nokey_name. Non-@lookup operations will be
- * impossible in this case, so we fail them with ENOKEY.
+ * Else, for keyless @lookup operations, @iname should be a no-key name, so we
+ * decode it to get the struct fscrypt_nokey_name. Non-@lookup operations will
+ * be impossible in this case, so we fail them with ENOKEY.
*
* If successful, fscrypt_free_filename() must be called later to clean up.
*
@@ -421,7 +409,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
return ret;
if (fscrypt_has_encryption_key(dir)) {
- if (!fscrypt_fname_encrypted_size(dir, iname->len,
+ if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy,
+ iname->len,
dir->i_sb->s_cop->max_namelen,
&fname->crypto_buf.len))
return -ENAMETOOLONG;
@@ -440,7 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
}
if (!lookup)
return -ENOKEY;
- fname->is_ciphertext_name = true;
+ fname->is_nokey_name = true;
/*
* We don't have the key and we are doing a lookup; decode the
@@ -499,7 +488,7 @@ bool fscrypt_match_name(const struct fscrypt_name *fname,
{
const struct fscrypt_nokey_name *nokey_name =
(const void *)fname->crypto_buf.name;
- u8 sha256[SHA256_DIGEST_SIZE];
+ u8 digest[SHA256_DIGEST_SIZE];
if (likely(fname->disk_name.name)) {
if (de_name_len != fname->disk_name.len)
@@ -510,9 +499,9 @@ bool fscrypt_match_name(const struct fscrypt_name *fname,
return false;
if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes)))
return false;
- fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)],
- de_name_len - sizeof(nokey_name->bytes), sha256);
- return !memcmp(sha256, nokey_name->sha256, sizeof(sha256));
+ sha256(&de_name[sizeof(nokey_name->bytes)],
+ de_name_len - sizeof(nokey_name->bytes), digest);
+ return !memcmp(digest, nokey_name->sha256, sizeof(digest));
}
EXPORT_SYMBOL_GPL(fscrypt_match_name);
@@ -541,7 +530,7 @@ EXPORT_SYMBOL_GPL(fscrypt_fname_siphash);
* Validate dentries in encrypted directories to make sure we aren't potentially
* caching stale dentries after a key has been added.
*/
-static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
+int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
{
struct dentry *dir;
int err;
@@ -549,17 +538,17 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
/*
* Plaintext names are always valid, since fscrypt doesn't support
- * reverting to ciphertext names without evicting the directory's inode
+ * reverting to no-key names without evicting the directory's inode
* -- which implies eviction of the dentries in the directory.
*/
- if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME))
+ if (!(dentry->d_flags & DCACHE_NOKEY_NAME))
return 1;
/*
- * Ciphertext name; valid if the directory's key is still unavailable.
+ * No-key name; valid if the directory's key is still unavailable.
*
- * Although fscrypt forbids rename() on ciphertext names, we still must
- * use dget_parent() here rather than use ->d_parent directly. That's
+ * Although fscrypt forbids rename() on no-key names, we still must use
+ * dget_parent() here rather than use ->d_parent directly. That's
* because a corrupted fs image may contain directory hard links, which
* the VFS handles by moving the directory's dentry tree in the dcache
* each time ->lookup() finds the directory and it already has a dentry
@@ -580,6 +569,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
return valid;
}
+EXPORT_SYMBOL_GPL(fscrypt_d_revalidate);
const struct dentry_operations fscrypt_d_ops = {
.d_revalidate = fscrypt_d_revalidate,
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 8117a61b6f55..4f5806a3b73d 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -97,7 +97,6 @@ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx)
return NULL;
}
-#undef fscrypt_policy
union fscrypt_policy {
u8 version;
struct fscrypt_policy_v1 v1;
@@ -292,8 +291,9 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
/* fname.c */
int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
u8 *out, unsigned int olen);
-bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
- u32 max_len, u32 *encrypted_len_ret);
+bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
+ u32 orig_len, u32 max_len,
+ u32 *encrypted_len_ret);
extern const struct dentry_operations fscrypt_d_ops;
/* hkdf.c */
@@ -572,6 +572,9 @@ int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key);
int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
const struct fscrypt_master_key *mk);
+void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+ const struct fscrypt_master_key *mk);
+
/* keysetup_v1.c */
void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);
@@ -590,5 +593,6 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
const union fscrypt_context *ctx_u,
int ctx_size);
+const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir);
#endif /* _FSCRYPT_PRIVATE_H */
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 09fb8aa0f2e9..20b0df47fe6a 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -60,8 +60,8 @@ int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
if (err)
return err;
- /* ... in case we looked up ciphertext name before key was added */
- if (dentry->d_flags & DCACHE_ENCRYPTED_NAME)
+ /* ... in case we looked up no-key name before key was added */
+ if (dentry->d_flags & DCACHE_NOKEY_NAME)
return -ENOKEY;
if (!fscrypt_has_permitted_context(dir, inode))
@@ -85,9 +85,8 @@ int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
if (err)
return err;
- /* ... in case we looked up ciphertext name(s) before key was added */
- if ((old_dentry->d_flags | new_dentry->d_flags) &
- DCACHE_ENCRYPTED_NAME)
+ /* ... in case we looked up no-key name(s) before key was added */
+ if ((old_dentry->d_flags | new_dentry->d_flags) & DCACHE_NOKEY_NAME)
return -ENOKEY;
if (old_dir != new_dir) {
@@ -114,9 +113,9 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
if (err && err != -ENOENT)
return err;
- if (fname->is_ciphertext_name) {
+ if (fname->is_nokey_name) {
spin_lock(&dentry->d_lock);
- dentry->d_flags |= DCACHE_ENCRYPTED_NAME;
+ dentry->d_flags |= DCACHE_NOKEY_NAME;
spin_unlock(&dentry->d_lock);
d_set_d_op(dentry, &fscrypt_d_ops);
}
@@ -166,26 +165,51 @@ int fscrypt_prepare_setflags(struct inode *inode,
return 0;
}
-int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
- unsigned int max_len,
- struct fscrypt_str *disk_link)
+/**
+ * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink
+ * @dir: directory in which the symlink is being created
+ * @target: plaintext symlink target
+ * @len: length of @target excluding null terminator
+ * @max_len: space the filesystem has available to store the symlink target
+ * @disk_link: (out) the on-disk symlink target being prepared
+ *
+ * This function computes the size the symlink target will require on-disk,
+ * stores it in @disk_link->len, and validates it against @max_len. An
+ * encrypted symlink may be longer than the original.
+ *
+ * Additionally, @disk_link->name is set to @target if the symlink will be
+ * unencrypted, but left NULL if the symlink will be encrypted. For encrypted
+ * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the
+ * on-disk target later. (The reason for the two-step process is that some
+ * filesystems need to know the size of the symlink target before creating the
+ * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.)
+ *
+ * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long,
+ * -ENOKEY if the encryption key is missing, or another -errno code if a problem
+ * occurred while setting up the encryption key.
+ */
+int fscrypt_prepare_symlink(struct inode *dir, const char *target,
+ unsigned int len, unsigned int max_len,
+ struct fscrypt_str *disk_link)
{
- int err;
+ const union fscrypt_policy *policy;
/*
* To calculate the size of the encrypted symlink target we need to know
* the amount of NUL padding, which is determined by the flags set in
* the encryption policy which will be inherited from the directory.
- * The easiest way to get access to this is to just load the directory's
- * fscrypt_info, since we'll need it to create the dir_entry anyway.
- *
- * Note: in test_dummy_encryption mode, @dir may be unencrypted.
*/
- err = fscrypt_get_encryption_info(dir);
- if (err)
- return err;
- if (!fscrypt_has_encryption_key(dir))
- return -ENOKEY;
+ policy = fscrypt_policy_to_inherit(dir);
+ if (policy == NULL) {
+ /* Not encrypted */
+ disk_link->name = (unsigned char *)target;
+ disk_link->len = len + 1;
+ if (disk_link->len > max_len)
+ return -ENAMETOOLONG;
+ return 0;
+ }
+ if (IS_ERR(policy))
+ return PTR_ERR(policy);
/*
* Calculate the size of the encrypted symlink and verify it won't
@@ -198,7 +222,7 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
* counting it (even though it is meaningless for ciphertext) is simpler
* for now since filesystems will assume it is there and subtract it.
*/
- if (!fscrypt_fname_encrypted_size(dir, len,
+ if (!fscrypt_fname_encrypted_size(policy, len,
max_len - sizeof(struct fscrypt_symlink_data),
&disk_link->len))
return -ENAMETOOLONG;
@@ -207,7 +231,7 @@ int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len,
disk_link->name = NULL;
return 0;
}
-EXPORT_SYMBOL_GPL(__fscrypt_prepare_symlink);
+EXPORT_SYMBOL_GPL(fscrypt_prepare_symlink);
int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
unsigned int len, struct fscrypt_str *disk_link)
@@ -217,9 +241,13 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
struct fscrypt_symlink_data *sd;
unsigned int ciphertext_len;
- err = fscrypt_require_key(inode);
- if (err)
- return err;
+ /*
+ * fscrypt_prepare_new_inode() should have already set up the new
+ * symlink inode's encryption key. We don't wait until now to do it,
+ * since we may be in a filesystem transaction now.
+ */
+ if (WARN_ON_ONCE(!fscrypt_has_encryption_key(inode)))
+ return -ENOKEY;
if (disk_link->name) {
/* filesystem-provided buffer */
@@ -319,7 +347,7 @@ const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
if (cstr.len + sizeof(*sd) - 1 > max_size)
return ERR_PTR(-EUCLEAN);
- err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
+ err = fscrypt_fname_alloc_buffer(cstr.len, &pstr);
if (err)
return ERR_PTR(err);
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index faa25541ccb6..89bffa82ed74 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -106,7 +106,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
crypto_cfg.data_unit_size = sb->s_blocksize;
crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
num_devs = fscrypt_get_num_devices(sb);
- devs = kmalloc_array(num_devs, sizeof(*devs), GFP_NOFS);
+ devs = kmalloc_array(num_devs, sizeof(*devs), GFP_KERNEL);
if (!devs)
return -ENOMEM;
fscrypt_get_devices(sb, num_devs, devs);
@@ -135,9 +135,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
struct fscrypt_blk_crypto_key *blk_key;
int err;
int i;
- unsigned int flags;
- blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_NOFS);
+ blk_key = kzalloc(struct_size(blk_key, devs, num_devs), GFP_KERNEL);
if (!blk_key)
return -ENOMEM;
@@ -166,10 +165,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
}
queue_refs++;
- flags = memalloc_nofs_save();
err = blk_crypto_start_using_key(&blk_key->base,
blk_key->devs[i]);
- memalloc_nofs_restore(flags);
if (err) {
fscrypt_err(inode,
"error %d starting to use blk-crypto", err);
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index e74f239c4428..53cc552a7b8f 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -817,6 +817,7 @@ static int check_for_busy_inodes(struct super_block *sb,
struct list_head *pos;
size_t busy_count = 0;
unsigned long ino;
+ char ino_str[50] = "";
spin_lock(&mk->mk_decrypted_inodes_lock);
@@ -838,11 +839,15 @@ static int check_for_busy_inodes(struct super_block *sb,
}
spin_unlock(&mk->mk_decrypted_inodes_lock);
+ /* If the inode is currently being created, ino may still be 0. */
+ if (ino)
+ snprintf(ino_str, sizeof(ino_str), ", including ino %lu", ino);
+
fscrypt_warn(NULL,
- "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu",
+ "%s: %zu inode(s) still busy after removing key with %s %*phN%s",
sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec),
master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u,
- ino);
+ ino_str);
return -EBUSY;
}
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index fea6226afc2b..d3c3e5d9b41f 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -10,6 +10,7 @@
#include <crypto/skcipher.h>
#include <linux/key.h>
+#include <linux/random.h>
#include "fscrypt_private.h"
@@ -222,6 +223,16 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci,
return 0;
}
+void fscrypt_hash_inode_number(struct fscrypt_info *ci,
+ const struct fscrypt_master_key *mk)
+{
+ WARN_ON(ci->ci_inode->i_ino == 0);
+ WARN_ON(!mk->mk_ino_hash_key_initialized);
+
+ ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino,
+ &mk->mk_ino_hash_key);
+}
+
static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci,
struct fscrypt_master_key *mk)
{
@@ -254,13 +265,20 @@ unlock:
return err;
}
- ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino,
- &mk->mk_ino_hash_key);
+ /*
+ * New inodes may not have an inode number assigned yet.
+ * Hashing their inode number is delayed until later.
+ */
+ if (ci->ci_inode->i_ino == 0)
+ WARN_ON(!(ci->ci_inode->i_state & I_CREATING));
+ else
+ fscrypt_hash_inode_number(ci, mk);
return 0;
}
static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
- struct fscrypt_master_key *mk)
+ struct fscrypt_master_key *mk,
+ bool need_dirhash_key)
{
int err;
@@ -306,7 +324,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
return err;
/* Derive a secret dirhash key for directories that need it. */
- if (S_ISDIR(ci->ci_inode->i_mode) && IS_CASEFOLDED(ci->ci_inode)) {
+ if (need_dirhash_key) {
err = fscrypt_derive_dirhash_key(ci, mk);
if (err)
return err;
@@ -326,6 +344,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci,
* key being removed with a new inode starting to use it.
*/
static int setup_file_encryption_key(struct fscrypt_info *ci,
+ bool need_dirhash_key,
struct key **master_key_ret)
{
struct key *key;
@@ -400,7 +419,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci,
err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw);
break;
case FSCRYPT_POLICY_V2:
- err = fscrypt_setup_v2_file_key(ci, mk);
+ err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key);
break;
default:
WARN_ON(1);
@@ -454,57 +473,28 @@ static void put_crypt_info(struct fscrypt_info *ci)
kmem_cache_free(fscrypt_info_cachep, ci);
}
-int fscrypt_get_encryption_info(struct inode *inode)
+static int
+fscrypt_setup_encryption_info(struct inode *inode,
+ const union fscrypt_policy *policy,
+ const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
+ bool need_dirhash_key)
{
struct fscrypt_info *crypt_info;
- union fscrypt_context ctx;
struct fscrypt_mode *mode;
struct key *master_key = NULL;
int res;
- if (fscrypt_has_encryption_key(inode))
- return 0;
-
res = fscrypt_initialize(inode->i_sb->s_cop->flags);
if (res)
return res;
- res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
- if (res < 0) {
- const union fscrypt_context *dummy_ctx =
- fscrypt_get_dummy_context(inode->i_sb);
-
- if (IS_ENCRYPTED(inode) || !dummy_ctx) {
- fscrypt_warn(inode,
- "Error %d getting encryption context",
- res);
- return res;
- }
- /* Fake up a context for an unencrypted directory */
- res = fscrypt_context_size(dummy_ctx);
- memcpy(&ctx, dummy_ctx, res);
- }
-
- crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS);
+ crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_KERNEL);
if (!crypt_info)
return -ENOMEM;
crypt_info->ci_inode = inode;
-
- res = fscrypt_policy_from_context(&crypt_info->ci_policy, &ctx, res);
- if (res) {
- fscrypt_warn(inode,
- "Unrecognized or corrupt encryption context");
- goto out;
- }
-
- memcpy(crypt_info->ci_nonce, fscrypt_context_nonce(&ctx),
- FSCRYPT_FILE_NONCE_SIZE);
-
- if (!fscrypt_supported_policy(&crypt_info->ci_policy, inode)) {
- res = -EINVAL;
- goto out;
- }
+ crypt_info->ci_policy = *policy;
+ memcpy(crypt_info->ci_nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
mode = select_encryption_mode(&crypt_info->ci_policy, inode);
if (IS_ERR(mode)) {
@@ -514,13 +504,14 @@ int fscrypt_get_encryption_info(struct inode *inode)
WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
crypt_info->ci_mode = mode;
- res = setup_file_encryption_key(crypt_info, &master_key);
+ res = setup_file_encryption_key(crypt_info, need_dirhash_key,
+ &master_key);
if (res)
goto out;
/*
- * Multiple tasks may race to set ->i_crypt_info, so use
- * cmpxchg_release(). This pairs with the smp_load_acquire() in
+ * For existing inodes, multiple tasks may race to set ->i_crypt_info.
+ * So use cmpxchg_release(). This pairs with the smp_load_acquire() in
* fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a
* RELEASE barrier so that other tasks can ACQUIRE it.
*/
@@ -550,14 +541,113 @@ out:
up_read(&mk->mk_secret_sem);
key_put(master_key);
}
+ put_crypt_info(crypt_info);
+ return res;
+}
+
+/**
+ * fscrypt_get_encryption_info() - set up an inode's encryption key
+ * @inode: the inode to set up the key for. Must be encrypted.
+ *
+ * Set up ->i_crypt_info, if it hasn't already been done.
+ *
+ * Note: unless ->i_crypt_info is already set, this isn't %GFP_NOFS-safe. So
+ * generally this shouldn't be called from within a filesystem transaction.
+ *
+ * Return: 0 if ->i_crypt_info was set or was already set, *or* if the
+ * encryption key is unavailable. (Use fscrypt_has_encryption_key() to
+ * distinguish these cases.) Also can return another -errno code.
+ */
+int fscrypt_get_encryption_info(struct inode *inode)
+{
+ int res;
+ union fscrypt_context ctx;
+ union fscrypt_policy policy;
+
+ if (fscrypt_has_encryption_key(inode))
+ return 0;
+
+ res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+ if (res < 0) {
+ fscrypt_warn(inode, "Error %d getting encryption context", res);
+ return res;
+ }
+
+ res = fscrypt_policy_from_context(&policy, &ctx, res);
+ if (res) {
+ fscrypt_warn(inode,
+ "Unrecognized or corrupt encryption context");
+ return res;
+ }
+
+ if (!fscrypt_supported_policy(&policy, inode))
+ return -EINVAL;
+
+ res = fscrypt_setup_encryption_info(inode, &policy,
+ fscrypt_context_nonce(&ctx),
+ IS_CASEFOLDED(inode) &&
+ S_ISDIR(inode->i_mode));
if (res == -ENOKEY)
res = 0;
- put_crypt_info(crypt_info);
return res;
}
EXPORT_SYMBOL(fscrypt_get_encryption_info);
/**
+ * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory
+ * @dir: a possibly-encrypted directory
+ * @inode: the new inode. ->i_mode must be set already.
+ * ->i_ino doesn't need to be set yet.
+ * @encrypt_ret: (output) set to %true if the new inode will be encrypted
+ *
+ * If the directory is encrypted, set up its ->i_crypt_info in preparation for
+ * encrypting the name of the new file. Also, if the new inode will be
+ * encrypted, set up its ->i_crypt_info and set *encrypt_ret=true.
+ *
+ * This isn't %GFP_NOFS-safe, and therefore it should be called before starting
+ * any filesystem transaction to create the inode. For this reason, ->i_ino
+ * isn't required to be set yet, as the filesystem may not have set it yet.
+ *
+ * This doesn't persist the new inode's encryption context. That still needs to
+ * be done later by calling fscrypt_set_context().
+ *
+ * Return: 0 on success, -ENOKEY if the encryption key is missing, or another
+ * -errno code
+ */
+int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
+ bool *encrypt_ret)
+{
+ const union fscrypt_policy *policy;
+ u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
+
+ policy = fscrypt_policy_to_inherit(dir);
+ if (policy == NULL)
+ return 0;
+ if (IS_ERR(policy))
+ return PTR_ERR(policy);
+
+ if (WARN_ON_ONCE(inode->i_mode == 0))
+ return -EINVAL;
+
+ /*
+ * Only regular files, directories, and symlinks are encrypted.
+ * Special files like device nodes and named pipes aren't.
+ */
+ if (!S_ISREG(inode->i_mode) &&
+ !S_ISDIR(inode->i_mode) &&
+ !S_ISLNK(inode->i_mode))
+ return 0;
+
+ *encrypt_ret = true;
+
+ get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE);
+ return fscrypt_setup_encryption_info(inode, policy, nonce,
+ IS_CASEFOLDED(dir) &&
+ S_ISDIR(inode->i_mode));
+}
+EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode);
+
+/**
* fscrypt_put_encryption_info() - free most of an inode's fscrypt data
* @inode: an inode being evicted
*
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index a3cb52572b05..2762c5350432 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -60,7 +60,7 @@ static int derive_key_aes(const u8 *master_key,
goto out;
}
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
- req = skcipher_request_alloc(tfm, GFP_NOFS);
+ req = skcipher_request_alloc(tfm, GFP_KERNEL);
if (!req) {
res = -ENOMEM;
goto out;
@@ -99,7 +99,7 @@ find_and_lock_process_key(const char *prefix,
const struct user_key_payload *ukp;
const struct fscrypt_key *payload;
- description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
+ description = kasprintf(GFP_KERNEL, "%s%*phN", prefix,
FSCRYPT_KEY_DESCRIPTOR_SIZE, descriptor);
if (!description)
return ERR_PTR(-ENOMEM);
@@ -228,7 +228,7 @@ fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key)
return dk;
/* Nope, allocate one. */
- dk = kzalloc(sizeof(*dk), GFP_NOFS);
+ dk = kzalloc(sizeof(*dk), GFP_KERNEL);
if (!dk)
return ERR_PTR(-ENOMEM);
refcount_set(&dk->dk_refcount, 1);
@@ -272,7 +272,7 @@ static int setup_v1_file_key_derived(struct fscrypt_info *ci,
* This cannot be a stack buffer because it will be passed to the
* scatterlist crypto API during derive_key_aes().
*/
- derived_key = kmalloc(ci->ci_mode->keysize, GFP_NOFS);
+ derived_key = kmalloc(ci->ci_mode->keysize, GFP_KERNEL);
if (!derived_key)
return -ENOMEM;
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 2d73fd39ad96..4441d9944b9e 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -32,6 +32,14 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
return !memcmp(policy1, policy2, fscrypt_policy_size(policy1));
}
+static const union fscrypt_policy *
+fscrypt_get_dummy_policy(struct super_block *sb)
+{
+ if (!sb->s_cop->get_dummy_policy)
+ return NULL;
+ return sb->s_cop->get_dummy_policy(sb);
+}
+
static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode)
{
if (contents_mode == FSCRYPT_MODE_AES_256_XTS &&
@@ -192,10 +200,15 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy,
32, 32))
return false;
+ /*
+ * IV_INO_LBLK_32 hashes the inode number, so in principle it can
+ * support any ino_bits. However, currently the inode number is gotten
+ * from inode::i_ino which is 'unsigned long'. So for now the
+ * implementation limit is 32 bits.
+ */
if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) &&
- /* This uses hashed inode numbers, so ino_bits doesn't matter. */
!supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32",
- INT_MAX, 32))
+ 32, 32))
return false;
if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) {
@@ -231,18 +244,19 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
}
/**
- * fscrypt_new_context_from_policy() - create a new fscrypt_context from
- * an fscrypt_policy
+ * fscrypt_new_context() - create a new fscrypt_context
* @ctx_u: output context
* @policy_u: input policy
+ * @nonce: nonce to use
*
* Create an fscrypt_context for an inode that is being assigned the given
- * encryption policy. A new nonce is randomly generated.
+ * encryption policy. @nonce must be a new random nonce.
*
* Return: the size of the new context in bytes.
*/
-static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
- const union fscrypt_policy *policy_u)
+static int fscrypt_new_context(union fscrypt_context *ctx_u,
+ const union fscrypt_policy *policy_u,
+ const u8 nonce[FSCRYPT_FILE_NONCE_SIZE])
{
memset(ctx_u, 0, sizeof(*ctx_u));
@@ -260,7 +274,7 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
memcpy(ctx->master_key_descriptor,
policy->master_key_descriptor,
sizeof(ctx->master_key_descriptor));
- get_random_bytes(ctx->nonce, sizeof(ctx->nonce));
+ memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
return sizeof(*ctx);
}
case FSCRYPT_POLICY_V2: {
@@ -276,7 +290,7 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u,
memcpy(ctx->master_key_identifier,
policy->master_key_identifier,
sizeof(ctx->master_key_identifier));
- get_random_bytes(ctx->nonce, sizeof(ctx->nonce));
+ memcpy(ctx->nonce, nonce, FSCRYPT_FILE_NONCE_SIZE);
return sizeof(*ctx);
}
}
@@ -372,6 +386,7 @@ static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy)
static int set_encryption_policy(struct inode *inode,
const union fscrypt_policy *policy)
{
+ u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
union fscrypt_context ctx;
int ctxsize;
int err;
@@ -409,7 +424,8 @@ static int set_encryption_policy(struct inode *inode,
return -EINVAL;
}
- ctxsize = fscrypt_new_context_from_policy(&ctx, policy);
+ get_random_bytes(nonce, FSCRYPT_FILE_NONCE_SIZE);
+ ctxsize = fscrypt_new_context(&ctx, policy, nonce);
return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, NULL);
}
@@ -620,86 +636,99 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
}
EXPORT_SYMBOL(fscrypt_has_permitted_context);
+/*
+ * Return the encryption policy that new files in the directory will inherit, or
+ * NULL if none, or an ERR_PTR() on error. If the directory is encrypted, also
+ * ensure that its key is set up, so that the new filename can be encrypted.
+ */
+const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir)
+{
+ int err;
+
+ if (IS_ENCRYPTED(dir)) {
+ err = fscrypt_require_key(dir);
+ if (err)
+ return ERR_PTR(err);
+ return &dir->i_crypt_info->ci_policy;
+ }
+
+ return fscrypt_get_dummy_policy(dir->i_sb);
+}
+
/**
- * fscrypt_inherit_context() - Sets a child context from its parent
- * @parent: Parent inode from which the context is inherited.
- * @child: Child inode that inherits the context from @parent.
- * @fs_data: private data given by FS.
- * @preload: preload child i_crypt_info if true
+ * fscrypt_set_context() - Set the fscrypt context of a new inode
+ * @inode: a new inode
+ * @fs_data: private data given by FS and passed to ->set_context()
+ *
+ * This should be called after fscrypt_prepare_new_inode(), generally during a
+ * filesystem transaction. Everything here must be %GFP_NOFS-safe.
*
* Return: 0 on success, -errno on failure
*/
-int fscrypt_inherit_context(struct inode *parent, struct inode *child,
- void *fs_data, bool preload)
+int fscrypt_set_context(struct inode *inode, void *fs_data)
{
+ struct fscrypt_info *ci = inode->i_crypt_info;
union fscrypt_context ctx;
int ctxsize;
- struct fscrypt_info *ci;
- int res;
-
- res = fscrypt_get_encryption_info(parent);
- if (res < 0)
- return res;
- ci = fscrypt_get_info(parent);
- if (ci == NULL)
+ /* fscrypt_prepare_new_inode() should have set up the key already. */
+ if (WARN_ON_ONCE(!ci))
return -ENOKEY;
- ctxsize = fscrypt_new_context_from_policy(&ctx, &ci->ci_policy);
-
BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE);
- res = parent->i_sb->s_cop->set_context(child, &ctx, ctxsize, fs_data);
- if (res)
- return res;
- return preload ? fscrypt_get_encryption_info(child): 0;
+ ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce);
+
+ /*
+ * This may be the first time the inode number is available, so do any
+ * delayed key setup that requires the inode number.
+ */
+ if (ci->ci_policy.version == FSCRYPT_POLICY_V2 &&
+ (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
+ const struct fscrypt_master_key *mk =
+ ci->ci_master_key->payload.data[0];
+
+ fscrypt_hash_inode_number(ci, mk);
+ }
+
+ return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data);
}
-EXPORT_SYMBOL(fscrypt_inherit_context);
+EXPORT_SYMBOL_GPL(fscrypt_set_context);
/**
* fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption'
* @sb: the filesystem on which test_dummy_encryption is being specified
- * @arg: the argument to the test_dummy_encryption option.
- * If no argument was specified, then @arg->from == NULL.
- * @dummy_ctx: the filesystem's current dummy context (input/output, see below)
+ * @arg: the argument to the test_dummy_encryption option. May be NULL.
+ * @dummy_policy: the filesystem's current dummy policy (input/output, see
+ * below)
*
* Handle the test_dummy_encryption mount option by creating a dummy encryption
- * context, saving it in @dummy_ctx, and adding the corresponding dummy
- * encryption key to the filesystem. If the @dummy_ctx is already set, then
+ * policy, saving it in @dummy_policy, and adding the corresponding dummy
+ * encryption key to the filesystem. If the @dummy_policy is already set, then
* instead validate that it matches @arg. Don't support changing it via
* remount, as that is difficult to do safely.
*
- * The reason we use an fscrypt_context rather than an fscrypt_policy is because
- * we mustn't generate a new nonce each time we access a dummy-encrypted
- * directory, as that would change the way filenames are encrypted.
- *
- * Return: 0 on success (dummy context set, or the same context is already set);
- * -EEXIST if a different dummy context is already set;
+ * Return: 0 on success (dummy policy set, or the same policy is already set);
+ * -EEXIST if a different dummy policy is already set;
* or another -errno value.
*/
-int fscrypt_set_test_dummy_encryption(struct super_block *sb,
- const substring_t *arg,
- struct fscrypt_dummy_context *dummy_ctx)
+int fscrypt_set_test_dummy_encryption(struct super_block *sb, const char *arg,
+ struct fscrypt_dummy_policy *dummy_policy)
{
- const char *argstr = "v2";
- const char *argstr_to_free = NULL;
struct fscrypt_key_specifier key_spec = { 0 };
int version;
- union fscrypt_context *ctx = NULL;
+ union fscrypt_policy *policy = NULL;
int err;
- if (arg->from) {
- argstr = argstr_to_free = match_strdup(arg);
- if (!argstr)
- return -ENOMEM;
- }
+ if (!arg)
+ arg = "v2";
- if (!strcmp(argstr, "v1")) {
- version = FSCRYPT_CONTEXT_V1;
+ if (!strcmp(arg, "v1")) {
+ version = FSCRYPT_POLICY_V1;
key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR;
memset(key_spec.u.descriptor, 0x42,
FSCRYPT_KEY_DESCRIPTOR_SIZE);
- } else if (!strcmp(argstr, "v2")) {
- version = FSCRYPT_CONTEXT_V2;
+ } else if (!strcmp(arg, "v2")) {
+ version = FSCRYPT_POLICY_V2;
key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
/* key_spec.u.identifier gets filled in when adding the key */
} else {
@@ -707,21 +736,8 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb,
goto out;
}
- if (dummy_ctx->ctx) {
- /*
- * Note: if we ever make test_dummy_encryption support
- * specifying other encryption settings, such as the encryption
- * modes, we'll need to compare those settings here.
- */
- if (dummy_ctx->ctx->version == version)
- err = 0;
- else
- err = -EEXIST;
- goto out;
- }
-
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx) {
+ policy = kzalloc(sizeof(*policy), GFP_KERNEL);
+ if (!policy) {
err = -ENOMEM;
goto out;
}
@@ -730,18 +746,18 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb,
if (err)
goto out;
- ctx->version = version;
- switch (ctx->version) {
- case FSCRYPT_CONTEXT_V1:
- ctx->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
- ctx->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- memcpy(ctx->v1.master_key_descriptor, key_spec.u.descriptor,
+ policy->version = version;
+ switch (policy->version) {
+ case FSCRYPT_POLICY_V1:
+ policy->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+ policy->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+ memcpy(policy->v1.master_key_descriptor, key_spec.u.descriptor,
FSCRYPT_KEY_DESCRIPTOR_SIZE);
break;
- case FSCRYPT_CONTEXT_V2:
- ctx->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
- ctx->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
- memcpy(ctx->v2.master_key_identifier, key_spec.u.identifier,
+ case FSCRYPT_POLICY_V2:
+ policy->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS;
+ policy->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS;
+ memcpy(policy->v2.master_key_identifier, key_spec.u.identifier,
FSCRYPT_KEY_IDENTIFIER_SIZE);
break;
default:
@@ -749,12 +765,19 @@ int fscrypt_set_test_dummy_encryption(struct super_block *sb,
err = -EINVAL;
goto out;
}
- dummy_ctx->ctx = ctx;
- ctx = NULL;
+
+ if (dummy_policy->policy) {
+ if (fscrypt_policies_equal(policy, dummy_policy->policy))
+ err = 0;
+ else
+ err = -EEXIST;
+ goto out;
+ }
+ dummy_policy->policy = policy;
+ policy = NULL;
err = 0;
out:
- kfree(ctx);
- kfree(argstr_to_free);
+ kfree(policy);
return err;
}
EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption);
@@ -771,10 +794,16 @@ EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption);
void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep,
struct super_block *sb)
{
- const union fscrypt_context *ctx = fscrypt_get_dummy_context(sb);
+ const union fscrypt_policy *policy = fscrypt_get_dummy_policy(sb);
+ int vers;
- if (!ctx)
+ if (!policy)
return;
- seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, ctx->version);
+
+ vers = policy->version;
+ if (vers == FSCRYPT_POLICY_V1) /* Handle numbering quirk */
+ vers = 1;
+
+ seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, vers);
}
EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index b167d2d02148..a768a09430c3 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -177,7 +177,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp)
goto out;
if (!fops_get(real_fops)) {
-#ifdef MODULE
+#ifdef CONFIG_MODULES
if (real_fops->owner &&
real_fops->owner->state == MODULE_STATE_GOING)
goto out;
@@ -312,7 +312,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
goto out;
if (!fops_get(real_fops)) {
-#ifdef MODULE
+#ifdef CONFIG_MODULES
if (real_fops->owner &&
real_fops->owner->state == MODULE_STATE_GOING)
goto out;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 183299892465..abf535b036ab 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -386,25 +386,6 @@ static void dio_bio_end_io(struct bio *bio)
spin_unlock_irqrestore(&dio->bio_lock, flags);
}
-/**
- * dio_end_io - handle the end io action for the given bio
- * @bio: The direct io bio thats being completed
- *
- * This is meant to be called by any filesystem that uses their own dio_submit_t
- * so that the DIO specific endio actions are dealt with after the filesystem
- * has done it's completion work.
- */
-void dio_end_io(struct bio *bio)
-{
- struct dio *dio = bio->bi_private;
-
- if (dio->is_async)
- dio_bio_end_aio(bio);
- else
- dio_bio_end_io(bio);
-}
-EXPORT_SYMBOL_GPL(dio_end_io);
-
static inline void
dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
struct block_device *bdev,
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index f82a4952769d..ee92634196a8 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -4,6 +4,7 @@ menuconfig DLM
depends on INET
depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
select IP_SCTP
+ select SRCU
help
A general purpose distributed lock manager for kernel or userspace
applications.
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 47f0b98b707f..49c5f9407098 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -125,7 +125,7 @@ static ssize_t cluster_cluster_name_store(struct config_item *item,
CONFIGFS_ATTR(cluster_, cluster_name);
static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
- int *info_field, int check_zero,
+ int *info_field, bool (*check_cb)(unsigned int x),
const char *buf, size_t len)
{
unsigned int x;
@@ -137,7 +137,7 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
if (rc)
return rc;
- if (check_zero && !x)
+ if (check_cb && check_cb(x))
return -EINVAL;
*cl_field = x;
@@ -146,13 +146,13 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
return len;
}
-#define CLUSTER_ATTR(name, check_zero) \
+#define CLUSTER_ATTR(name, check_cb) \
static ssize_t cluster_##name##_store(struct config_item *item, \
const char *buf, size_t len) \
{ \
struct dlm_cluster *cl = config_item_to_cluster(item); \
return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \
- check_zero, buf, len); \
+ check_cb, buf, len); \
} \
static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \
{ \
@@ -161,20 +161,30 @@ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \
} \
CONFIGFS_ATTR(cluster_, name);
-CLUSTER_ATTR(tcp_port, 1);
-CLUSTER_ATTR(buffer_size, 1);
-CLUSTER_ATTR(rsbtbl_size, 1);
-CLUSTER_ATTR(recover_timer, 1);
-CLUSTER_ATTR(toss_secs, 1);
-CLUSTER_ATTR(scan_secs, 1);
-CLUSTER_ATTR(log_debug, 0);
-CLUSTER_ATTR(log_info, 0);
-CLUSTER_ATTR(protocol, 0);
-CLUSTER_ATTR(mark, 0);
-CLUSTER_ATTR(timewarn_cs, 1);
-CLUSTER_ATTR(waitwarn_us, 0);
-CLUSTER_ATTR(new_rsb_count, 0);
-CLUSTER_ATTR(recover_callbacks, 0);
+static bool dlm_check_zero(unsigned int x)
+{
+ return !x;
+}
+
+static bool dlm_check_buffer_size(unsigned int x)
+{
+ return (x < DEFAULT_BUFFER_SIZE);
+}
+
+CLUSTER_ATTR(tcp_port, dlm_check_zero);
+CLUSTER_ATTR(buffer_size, dlm_check_buffer_size);
+CLUSTER_ATTR(rsbtbl_size, dlm_check_zero);
+CLUSTER_ATTR(recover_timer, dlm_check_zero);
+CLUSTER_ATTR(toss_secs, dlm_check_zero);
+CLUSTER_ATTR(scan_secs, dlm_check_zero);
+CLUSTER_ATTR(log_debug, NULL);
+CLUSTER_ATTR(log_info, NULL);
+CLUSTER_ATTR(protocol, NULL);
+CLUSTER_ATTR(mark, NULL);
+CLUSTER_ATTR(timewarn_cs, dlm_check_zero);
+CLUSTER_ATTR(waitwarn_us, NULL);
+CLUSTER_ATTR(new_rsb_count, NULL);
+CLUSTER_ATTR(recover_callbacks, NULL);
static struct configfs_attribute *cluster_attrs[] = {
[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port,
@@ -221,6 +231,7 @@ struct dlm_space {
struct list_head members;
struct mutex members_lock;
int members_count;
+ struct dlm_nodes *nds;
};
struct dlm_comms {
@@ -430,6 +441,7 @@ static struct config_group *make_space(struct config_group *g, const char *name)
INIT_LIST_HEAD(&sp->members);
mutex_init(&sp->members_lock);
sp->members_count = 0;
+ sp->nds = nds;
return &sp->group;
fail:
@@ -451,6 +463,7 @@ static void drop_space(struct config_group *g, struct config_item *i)
static void release_space(struct config_item *i)
{
struct dlm_space *sp = config_item_to_space(i);
+ kfree(sp->nds);
kfree(sp);
}
@@ -857,18 +870,22 @@ int dlm_comm_seq(int nodeid, uint32_t *seq)
return 0;
}
-int dlm_comm_mark(int nodeid, unsigned int *mark)
+void dlm_comm_mark(int nodeid, unsigned int *mark)
{
struct dlm_comm *cm;
cm = get_comm(nodeid);
- if (!cm)
- return -ENOENT;
+ if (!cm) {
+ *mark = dlm_config.ci_mark;
+ return;
+ }
- *mark = cm->mark;
- put_comm(cm);
+ if (cm->mark)
+ *mark = cm->mark;
+ else
+ *mark = dlm_config.ci_mark;
- return 0;
+ put_comm(cm);
}
int dlm_our_nodeid(void)
@@ -889,7 +906,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
/* Config file defaults */
#define DEFAULT_TCP_PORT 21064
-#define DEFAULT_BUFFER_SIZE 4096
#define DEFAULT_RSBTBL_SIZE 1024
#define DEFAULT_RECOVER_TIMER 5
#define DEFAULT_TOSS_SECS 10
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index f62996cad561..c210250a2581 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -12,6 +12,8 @@
#ifndef __CONFIG_DOT_H__
#define __CONFIG_DOT_H__
+#define DEFAULT_BUFFER_SIZE 4096
+
struct dlm_config_node {
int nodeid;
int weight;
@@ -46,7 +48,7 @@ void dlm_config_exit(void);
int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
int *count_out);
int dlm_comm_seq(int nodeid, uint32_t *seq);
-int dlm_comm_mark(int nodeid, unsigned int *mark);
+void dlm_comm_mark(int nodeid, unsigned int *mark);
int dlm_our_nodeid(void);
int dlm_our_addr(struct sockaddr_storage *addr, int num);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 5050fe05769b..79f56f16bc2c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -65,40 +65,6 @@
#define MAX_SEND_MSG_COUNT 25
#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000)
-struct cbuf {
- unsigned int base;
- unsigned int len;
- unsigned int mask;
-};
-
-static void cbuf_add(struct cbuf *cb, int n)
-{
- cb->len += n;
-}
-
-static int cbuf_data(struct cbuf *cb)
-{
- return ((cb->base + cb->len) & cb->mask);
-}
-
-static void cbuf_init(struct cbuf *cb, int size)
-{
- cb->base = cb->len = 0;
- cb->mask = size-1;
-}
-
-static void cbuf_eat(struct cbuf *cb, int n)
-{
- cb->len -= n;
- cb->base += n;
- cb->base &= cb->mask;
-}
-
-static bool cbuf_empty(struct cbuf *cb)
-{
- return cb->len == 0;
-}
-
struct connection {
struct socket *sock; /* NULL if not connected */
uint32_t nodeid; /* So we know who we are in the list */
@@ -117,8 +83,6 @@ struct connection {
int (*rx_action) (struct connection *); /* What to do when active */
void (*connect_action) (struct connection *); /* What to do to connect */
void (*shutdown_action)(struct connection *con); /* What to do to shutdown */
- struct page *rx_page;
- struct cbuf cb;
int retries;
#define MAX_CONNECT_RETRIES 3
struct hlist_node list;
@@ -126,6 +90,10 @@ struct connection {
struct work_struct rwork; /* Receive workqueue */
struct work_struct swork; /* Send workqueue */
wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */
+ unsigned char *rx_buf;
+ int rx_buflen;
+ int rx_leftover;
+ struct rcu_head rcu;
};
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
@@ -167,8 +135,8 @@ static struct workqueue_struct *recv_workqueue;
static struct workqueue_struct *send_workqueue;
static struct hlist_head connection_hash[CONN_HASH_SIZE];
-static DEFINE_MUTEX(connections_lock);
-static struct kmem_cache *con_cache;
+static DEFINE_SPINLOCK(connections_lock);
+DEFINE_STATIC_SRCU(connections_srcu);
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
@@ -184,15 +152,20 @@ static inline int nodeid_hash(int nodeid)
static struct connection *__find_con(int nodeid)
{
- int r;
+ int r, idx;
struct connection *con;
r = nodeid_hash(nodeid);
- hlist_for_each_entry(con, &connection_hash[r], list) {
- if (con->nodeid == nodeid)
+ idx = srcu_read_lock(&connections_srcu);
+ hlist_for_each_entry_rcu(con, &connection_hash[r], list) {
+ if (con->nodeid == nodeid) {
+ srcu_read_unlock(&connections_srcu, idx);
return con;
+ }
}
+ srcu_read_unlock(&connections_srcu, idx);
+
return NULL;
}
@@ -200,21 +173,25 @@ static struct connection *__find_con(int nodeid)
* If 'allocation' is zero then we don't attempt to create a new
* connection structure for this node.
*/
-static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
+static struct connection *nodeid2con(int nodeid, gfp_t alloc)
{
- struct connection *con = NULL;
+ struct connection *con, *tmp;
int r;
con = __find_con(nodeid);
if (con || !alloc)
return con;
- con = kmem_cache_zalloc(con_cache, alloc);
+ con = kzalloc(sizeof(*con), alloc);
if (!con)
return NULL;
- r = nodeid_hash(nodeid);
- hlist_add_head(&con->list, &connection_hash[r]);
+ con->rx_buflen = dlm_config.ci_buffer_size;
+ con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS);
+ if (!con->rx_buf) {
+ kfree(con);
+ return NULL;
+ }
con->nodeid = nodeid;
mutex_init(&con->sock_mutex);
@@ -233,31 +210,41 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
con->rx_action = zerocon->rx_action;
}
+ r = nodeid_hash(nodeid);
+
+ spin_lock(&connections_lock);
+ /* Because multiple workqueues/threads calls this function it can
+ * race on multiple cpu's. Instead of locking hot path __find_con()
+ * we just check in rare cases of recently added nodes again
+ * under protection of connections_lock. If this is the case we
+ * abort our connection creation and return the existing connection.
+ */
+ tmp = __find_con(nodeid);
+ if (tmp) {
+ spin_unlock(&connections_lock);
+ kfree(con->rx_buf);
+ kfree(con);
+ return tmp;
+ }
+
+ hlist_add_head_rcu(&con->list, &connection_hash[r]);
+ spin_unlock(&connections_lock);
+
return con;
}
/* Loop round all connections */
static void foreach_conn(void (*conn_func)(struct connection *c))
{
- int i;
- struct hlist_node *n;
+ int i, idx;
struct connection *con;
+ idx = srcu_read_lock(&connections_srcu);
for (i = 0; i < CONN_HASH_SIZE; i++) {
- hlist_for_each_entry_safe(con, n, &connection_hash[i], list)
+ hlist_for_each_entry_rcu(con, &connection_hash[i], list)
conn_func(con);
}
-}
-
-static struct connection *nodeid2con(int nodeid, gfp_t allocation)
-{
- struct connection *con;
-
- mutex_lock(&connections_lock);
- con = __nodeid2con(nodeid, allocation);
- mutex_unlock(&connections_lock);
-
- return con;
+ srcu_read_unlock(&connections_srcu, idx);
}
static struct dlm_node_addr *find_node_addr(int nodeid)
@@ -614,11 +601,8 @@ static void close_connection(struct connection *con, bool and_other,
/* Will only re-enter once. */
close_connection(con->othercon, false, true, true);
}
- if (con->rx_page) {
- __free_page(con->rx_page);
- con->rx_page = NULL;
- }
+ con->rx_leftover = 0;
con->retries = 0;
mutex_unlock(&con->sock_mutex);
clear_bit(CF_CLOSING, &con->flags);
@@ -672,16 +656,33 @@ static void dlm_tcp_shutdown(struct connection *con)
shutdown_connection(con);
}
+static int con_realloc_receive_buf(struct connection *con, int newlen)
+{
+ unsigned char *newbuf;
+
+ newbuf = kmalloc(newlen, GFP_NOFS);
+ if (!newbuf)
+ return -ENOMEM;
+
+ /* copy any leftover from last receive */
+ if (con->rx_leftover)
+ memmove(newbuf, con->rx_buf, con->rx_leftover);
+
+ /* swap to new buffer space */
+ kfree(con->rx_buf);
+ con->rx_buflen = newlen;
+ con->rx_buf = newbuf;
+
+ return 0;
+}
+
/* Data received from remote end */
static int receive_from_sock(struct connection *con)
{
- int ret = 0;
- struct msghdr msg = {};
- struct kvec iov[2];
- unsigned len;
- int r;
int call_again_soon = 0;
- int nvec;
+ struct msghdr msg;
+ struct kvec iov;
+ int ret, buflen;
mutex_lock(&con->sock_mutex);
@@ -689,71 +690,55 @@ static int receive_from_sock(struct connection *con)
ret = -EAGAIN;
goto out_close;
}
+
if (con->nodeid == 0) {
ret = -EINVAL;
goto out_close;
}
- if (con->rx_page == NULL) {
- /*
- * This doesn't need to be atomic, but I think it should
- * improve performance if it is.
- */
- con->rx_page = alloc_page(GFP_ATOMIC);
- if (con->rx_page == NULL)
+ /* realloc if we get new buffer size to read out */
+ buflen = dlm_config.ci_buffer_size;
+ if (con->rx_buflen != buflen && con->rx_leftover <= buflen) {
+ ret = con_realloc_receive_buf(con, buflen);
+ if (ret < 0)
goto out_resched;
- cbuf_init(&con->cb, PAGE_SIZE);
}
- /*
- * iov[0] is the bit of the circular buffer between the current end
- * point (cb.base + cb.len) and the end of the buffer.
- */
- iov[0].iov_len = con->cb.base - cbuf_data(&con->cb);
- iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb);
- iov[1].iov_len = 0;
- nvec = 1;
-
- /*
- * iov[1] is the bit of the circular buffer between the start of the
- * buffer and the start of the currently used section (cb.base)
+ /* calculate new buffer parameter regarding last receive and
+ * possible leftover bytes
*/
- if (cbuf_data(&con->cb) >= con->cb.base) {
- iov[0].iov_len = PAGE_SIZE - cbuf_data(&con->cb);
- iov[1].iov_len = con->cb.base;
- iov[1].iov_base = page_address(con->rx_page);
- nvec = 2;
- }
- len = iov[0].iov_len + iov[1].iov_len;
- iov_iter_kvec(&msg.msg_iter, READ, iov, nvec, len);
+ iov.iov_base = con->rx_buf + con->rx_leftover;
+ iov.iov_len = con->rx_buflen - con->rx_leftover;
- r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL);
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+ ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len,
+ msg.msg_flags);
if (ret <= 0)
goto out_close;
- else if (ret == len)
+ else if (ret == iov.iov_len)
call_again_soon = 1;
- cbuf_add(&con->cb, ret);
- ret = dlm_process_incoming_buffer(con->nodeid,
- page_address(con->rx_page),
- con->cb.base, con->cb.len,
- PAGE_SIZE);
- if (ret < 0) {
- log_print("lowcomms err %d: addr=%p, base=%u, len=%u, read=%d",
- ret, page_address(con->rx_page), con->cb.base,
- con->cb.len, r);
- cbuf_eat(&con->cb, r);
- } else {
- cbuf_eat(&con->cb, ret);
- }
+ /* new buflen according readed bytes and leftover from last receive */
+ buflen = ret + con->rx_leftover;
+ ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen);
+ if (ret < 0)
+ goto out_close;
- if (cbuf_empty(&con->cb) && !call_again_soon) {
- __free_page(con->rx_page);
- con->rx_page = NULL;
+ /* calculate leftover bytes from process and put it into begin of
+ * the receive buffer, so next receive we have the full message
+ * at the start address of the receive buffer.
+ */
+ con->rx_leftover = buflen - ret;
+ if (con->rx_leftover) {
+ memmove(con->rx_buf, con->rx_buf + ret,
+ con->rx_leftover);
+ call_again_soon = true;
}
if (call_again_soon)
goto out_resched;
+
mutex_unlock(&con->sock_mutex);
return 0;
@@ -791,13 +776,11 @@ static int accept_from_sock(struct connection *con)
int nodeid;
struct connection *newcon;
struct connection *addcon;
+ unsigned int mark;
- mutex_lock(&connections_lock);
if (!dlm_allow_conn) {
- mutex_unlock(&connections_lock);
return -1;
}
- mutex_unlock(&connections_lock);
mutex_lock_nested(&con->sock_mutex, 0);
@@ -830,6 +813,9 @@ static int accept_from_sock(struct connection *con)
return -1;
}
+ dlm_comm_mark(nodeid, &mark);
+ sock_set_mark(newsock->sk, mark);
+
log_print("got connection from %d", nodeid);
/* Check to see if we already have a connection to this node. This
@@ -847,13 +833,24 @@ static int accept_from_sock(struct connection *con)
struct connection *othercon = newcon->othercon;
if (!othercon) {
- othercon = kmem_cache_zalloc(con_cache, GFP_NOFS);
+ othercon = kzalloc(sizeof(*othercon), GFP_NOFS);
if (!othercon) {
log_print("failed to allocate incoming socket");
mutex_unlock(&newcon->sock_mutex);
result = -ENOMEM;
goto accept_err;
}
+
+ othercon->rx_buflen = dlm_config.ci_buffer_size;
+ othercon->rx_buf = kmalloc(othercon->rx_buflen, GFP_NOFS);
+ if (!othercon->rx_buf) {
+ mutex_unlock(&newcon->sock_mutex);
+ kfree(othercon);
+ log_print("failed to allocate incoming socket receive buffer");
+ result = -ENOMEM;
+ goto accept_err;
+ }
+
othercon->nodeid = nodeid;
othercon->rx_action = receive_from_sock;
mutex_init(&othercon->sock_mutex);
@@ -975,6 +972,8 @@ static void sctp_connect_to_sock(struct connection *con)
return;
}
+ dlm_comm_mark(con->nodeid, &mark);
+
mutex_lock(&con->sock_mutex);
/* Some odd races can cause double-connects, ignore them */
@@ -999,11 +998,6 @@ static void sctp_connect_to_sock(struct connection *con)
if (result < 0)
goto socket_err;
- /* set skb mark */
- result = dlm_comm_mark(con->nodeid, &mark);
- if (result < 0)
- goto bind_err;
-
sock_set_mark(sock->sk, mark);
con->rx_action = receive_from_sock;
@@ -1076,6 +1070,8 @@ static void tcp_connect_to_sock(struct connection *con)
return;
}
+ dlm_comm_mark(con->nodeid, &mark);
+
mutex_lock(&con->sock_mutex);
if (con->retries++ > MAX_CONNECT_RETRIES)
goto out;
@@ -1090,11 +1086,6 @@ static void tcp_connect_to_sock(struct connection *con)
if (result < 0)
goto out_err;
- /* set skb mark */
- result = dlm_comm_mark(con->nodeid, &mark);
- if (result < 0)
- goto out_err;
-
sock_set_mark(sock->sk, mark);
memset(&saddr, 0, sizeof(saddr));
@@ -1238,6 +1229,14 @@ static void init_local(void)
}
}
+static void deinit_local(void)
+{
+ int i;
+
+ for (i = 0; i < dlm_local_count; i++)
+ kfree(dlm_local_addr[i]);
+}
+
/* Initialise SCTP socket and bind to all interfaces */
static int sctp_listen_for_all(void)
{
@@ -1546,13 +1545,6 @@ static void process_send_sockets(struct work_struct *work)
send_to_sock(con);
}
-
-/* Discard all entries on the write queues */
-static void clean_writequeues(void)
-{
- foreach_conn(clean_one_writequeue);
-}
-
static void work_stop(void)
{
if (recv_workqueue)
@@ -1608,26 +1600,34 @@ static void shutdown_conn(struct connection *con)
con->shutdown_action(con);
}
+static void connection_release(struct rcu_head *rcu)
+{
+ struct connection *con = container_of(rcu, struct connection, rcu);
+
+ kfree(con->rx_buf);
+ kfree(con);
+}
+
static void free_conn(struct connection *con)
{
close_connection(con, true, true, true);
- if (con->othercon)
- kmem_cache_free(con_cache, con->othercon);
- hlist_del(&con->list);
- kmem_cache_free(con_cache, con);
+ spin_lock(&connections_lock);
+ hlist_del_rcu(&con->list);
+ spin_unlock(&connections_lock);
+ if (con->othercon) {
+ clean_one_writequeue(con->othercon);
+ call_rcu(&con->othercon->rcu, connection_release);
+ }
+ clean_one_writequeue(con);
+ call_rcu(&con->rcu, connection_release);
}
static void work_flush(void)
{
- int ok;
+ int ok, idx;
int i;
- struct hlist_node *n;
struct connection *con;
- if (recv_workqueue)
- flush_workqueue(recv_workqueue);
- if (send_workqueue)
- flush_workqueue(send_workqueue);
do {
ok = 1;
foreach_conn(stop_conn);
@@ -1635,9 +1635,10 @@ static void work_flush(void)
flush_workqueue(recv_workqueue);
if (send_workqueue)
flush_workqueue(send_workqueue);
+ idx = srcu_read_lock(&connections_srcu);
for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
- hlist_for_each_entry_safe(con, n,
- &connection_hash[i], list) {
+ hlist_for_each_entry_rcu(con, &connection_hash[i],
+ list) {
ok &= test_bit(CF_READ_PENDING, &con->flags);
ok &= test_bit(CF_WRITE_PENDING, &con->flags);
if (con->othercon) {
@@ -1648,6 +1649,7 @@ static void work_flush(void)
}
}
}
+ srcu_read_unlock(&connections_srcu, idx);
} while (!ok);
}
@@ -1656,16 +1658,18 @@ void dlm_lowcomms_stop(void)
/* Set all the flags to prevent any
socket activity.
*/
- mutex_lock(&connections_lock);
dlm_allow_conn = 0;
- mutex_unlock(&connections_lock);
+
+ if (recv_workqueue)
+ flush_workqueue(recv_workqueue);
+ if (send_workqueue)
+ flush_workqueue(send_workqueue);
+
foreach_conn(shutdown_conn);
work_flush();
- clean_writequeues();
foreach_conn(free_conn);
work_stop();
-
- kmem_cache_destroy(con_cache);
+ deinit_local();
}
int dlm_lowcomms_start(void)
@@ -1684,16 +1688,9 @@ int dlm_lowcomms_start(void)
goto fail;
}
- error = -ENOMEM;
- con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
- __alignof__(struct connection), 0,
- NULL);
- if (!con_cache)
- goto fail;
-
error = work_start();
if (error)
- goto fail_destroy;
+ goto fail;
dlm_allow_conn = 1;
@@ -1710,12 +1707,8 @@ int dlm_lowcomms_start(void)
fail_unlisten:
dlm_allow_conn = 0;
con = nodeid2con(0,0);
- if (con) {
- close_connection(con, false, true, true);
- kmem_cache_free(con_cache, con);
- }
-fail_destroy:
- kmem_cache_destroy(con_cache);
+ if (con)
+ free_conn(con);
fail:
return error;
}
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index 921322d133e3..fde3a6afe4be 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -22,114 +22,84 @@
* into packets and sends them to the comms layer.
*/
+#include <asm/unaligned.h>
+
#include "dlm_internal.h"
#include "lowcomms.h"
#include "config.h"
#include "lock.h"
#include "midcomms.h"
-
-static void copy_from_cb(void *dst, const void *base, unsigned offset,
- unsigned len, unsigned limit)
-{
- unsigned copy = len;
-
- if ((copy + offset) > limit)
- copy = limit - offset;
- memcpy(dst, base + offset, copy);
- len -= copy;
- if (len)
- memcpy(dst + copy, base, len);
-}
-
/*
* Called from the low-level comms layer to process a buffer of
* commands.
- *
- * Only complete messages are processed here, any "spare" bytes from
- * the end of a buffer are saved and tacked onto the front of the next
- * message that comes in. I doubt this will happen very often but we
- * need to be able to cope with it and I don't want the task to be waiting
- * for packets to come in when there is useful work to be done.
*/
-int dlm_process_incoming_buffer(int nodeid, const void *base,
- unsigned offset, unsigned len, unsigned limit)
+int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
{
- union {
- unsigned char __buf[DLM_INBUF_LEN];
- /* this is to force proper alignment on some arches */
- union dlm_packet p;
- } __tmp;
- union dlm_packet *p = &__tmp.p;
- int ret = 0;
- int err = 0;
+ const unsigned char *ptr = buf;
+ const struct dlm_header *hd;
uint16_t msglen;
- uint32_t lockspace;
-
- while (len > sizeof(struct dlm_header)) {
-
- /* Copy just the header to check the total length. The
- message may wrap around the end of the buffer back to the
- start, so we need to use a temp buffer and copy_from_cb. */
-
- copy_from_cb(p, base, offset, sizeof(struct dlm_header),
- limit);
-
- msglen = le16_to_cpu(p->header.h_length);
- lockspace = p->header.h_lockspace;
+ int ret = 0;
- err = -EINVAL;
- if (msglen < sizeof(struct dlm_header))
- break;
- if (p->header.h_cmd == DLM_MSG) {
- if (msglen < sizeof(struct dlm_message))
- break;
- } else {
- if (msglen < sizeof(struct dlm_rcom))
- break;
- }
- err = -E2BIG;
- if (msglen > dlm_config.ci_buffer_size) {
- log_print("message size %d from %d too big, buf len %d",
- msglen, nodeid, len);
- break;
+ while (len >= sizeof(struct dlm_header)) {
+ hd = (struct dlm_header *)ptr;
+
+ /* no message should be more than this otherwise we
+ * cannot deliver this message to upper layers
+ */
+ msglen = get_unaligned_le16(&hd->h_length);
+ if (msglen > DEFAULT_BUFFER_SIZE) {
+ log_print("received invalid length header: %u, will abort message parsing",
+ msglen);
+ return -EBADMSG;
}
- err = 0;
-
- /* If only part of the full message is contained in this
- buffer, then do nothing and wait for lowcomms to call
- us again later with more data. We return 0 meaning
- we've consumed none of the input buffer. */
+ /* caller will take care that leftover
+ * will be parsed next call with more data
+ */
if (msglen > len)
break;
- /* Allocate a larger temp buffer if the full message won't fit
- in the buffer on the stack (which should work for most
- ordinary messages). */
-
- if (msglen > sizeof(__tmp) && p == &__tmp.p) {
- p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
- if (p == NULL)
- return ret;
- }
+ switch (hd->h_cmd) {
+ case DLM_MSG:
+ if (msglen < sizeof(struct dlm_message)) {
+ log_print("dlm msg too small: %u, will skip this message",
+ msglen);
+ goto skip;
+ }
- copy_from_cb(p, base, offset, msglen, limit);
+ break;
+ case DLM_RCOM:
+ if (msglen < sizeof(struct dlm_rcom)) {
+ log_print("dlm rcom msg too small: %u, will skip this message",
+ msglen);
+ goto skip;
+ }
- BUG_ON(lockspace != p->header.h_lockspace);
+ break;
+ default:
+ log_print("unsupported h_cmd received: %u, will skip this message",
+ hd->h_cmd);
+ goto skip;
+ }
+ /* for aligned memory access, we just copy current message
+ * to begin of the buffer which contains already parsed buffer
+ * data and should provide align access for upper layers
+ * because the start address of the buffer has a aligned
+ * address. This memmove can be removed when the upperlayer
+ * is capable of unaligned memory access.
+ */
+ memmove(buf, ptr, msglen);
+ dlm_receive_buffer((union dlm_packet *)buf, nodeid);
+
+skip:
ret += msglen;
- offset += msglen;
- offset &= (limit - 1);
len -= msglen;
-
- dlm_receive_buffer(p, nodeid);
+ ptr += msglen;
}
- if (p != &__tmp.p)
- kfree(p);
-
- return err ? err : ret;
+ return ret;
}
diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h
index 2e122e81c8d0..61e90a921849 100644
--- a/fs/dlm/midcomms.h
+++ b/fs/dlm/midcomms.h
@@ -12,8 +12,7 @@
#ifndef __MIDCOMMS_DOT_H__
#define __MIDCOMMS_DOT_H__
-int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
- unsigned len, unsigned limit);
+int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen);
#endif /* __MIDCOMMS_DOT_H__ */
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 28bb5689333a..15880a68faad 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -141,6 +141,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
+ /* replace invalid slashes like kobject_set_name_vargs does for /sys/firmware/efi/vars. */
+ strreplace(name, '/', '!');
+
inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0,
is_removable);
if (!inode)
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 459ecb42cbd3..347be146884c 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -224,7 +224,7 @@ submit_bio_retry:
bio_set_dev(bio, sb->s_bdev);
bio->bi_iter.bi_sector = (sector_t)blknr <<
LOG_SECTORS_PER_BLOCK;
- bio->bi_opf = REQ_OP_READ;
+ bio->bi_opf = REQ_OP_READ | (ra ? REQ_RAHEAD : 0);
}
err = bio_add_page(bio, page, PAGE_SIZE, 0);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index ddaa516c008a..b9a09806512a 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -211,9 +211,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx)
enum {
Opt_user_xattr,
- Opt_nouser_xattr,
Opt_acl,
- Opt_noacl,
Opt_cache_strategy,
Opt_err
};
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index c8c381eadcd6..5bde77d70852 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -473,8 +473,6 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
return -EOPNOTSUPP;
break;
case EROFS_XATTR_INDEX_TRUSTED:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
break;
case EROFS_XATTR_INDEX_SECURITY:
break;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 6c939def00f9..50912a5420b4 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -135,6 +135,7 @@ struct z_erofs_decompress_frontend {
struct z_erofs_collector clt;
struct erofs_map_blocks map;
+ bool readahead;
/* used for applying cache strategy on the fly */
bool backmost;
erofs_off_t headoffset;
@@ -153,8 +154,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock);
static void preload_compressed_pages(struct z_erofs_collector *clt,
struct address_space *mc,
- enum z_erofs_cache_alloctype type,
- struct list_head *pagepool)
+ enum z_erofs_cache_alloctype type)
{
const struct z_erofs_pcluster *pcl = clt->pcl;
const unsigned int clusterpages = BIT(pcl->clusterbits);
@@ -562,8 +562,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page,
- struct list_head *pagepool)
+ struct page *page)
{
struct inode *const inode = fe->inode;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
@@ -620,8 +619,7 @@ restart_now:
else
cache_strategy = DONTALLOC;
- preload_compressed_pages(clt, MNGD_MAPPING(sbi),
- cache_strategy, pagepool);
+ preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy);
hitted:
/*
@@ -653,7 +651,7 @@ retry:
/* should allocate an additional staging page for pagevec */
if (err == -EAGAIN) {
struct page *const newpage =
- erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL);
+ alloc_page(GFP_NOFS | __GFP_NOFAIL);
newpage->mapping = Z_EROFS_MAPPING_STAGING;
err = z_erofs_attach_page(clt, newpage,
@@ -1151,7 +1149,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
}
static void z_erofs_submit_queue(struct super_block *sb,
- z_erofs_next_pcluster_t owned_head,
+ struct z_erofs_decompress_frontend *f,
struct list_head *pagepool,
struct z_erofs_decompressqueue *fgq,
bool *force_fg)
@@ -1160,6 +1158,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
+ z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
/* since bio will be NULL, no need to initialize last_index */
pgoff_t last_index;
unsigned int nr_bios = 0;
@@ -1193,7 +1192,6 @@ static void z_erofs_submit_queue(struct super_block *sb,
do {
struct page *page;
- int err;
page = pickup_page_for_submission(pcl, i++, pagepool,
MNGD_MAPPING(sbi),
@@ -1216,11 +1214,12 @@ submit_bio_retry:
LOG_SECTORS_PER_BLOCK;
bio->bi_private = bi_private;
bio->bi_opf = REQ_OP_READ;
+ if (f->readahead)
+ bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
}
- err = bio_add_page(bio, page, PAGE_SIZE, 0);
- if (err < PAGE_SIZE)
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
goto submit_bio_retry;
last_index = cur;
@@ -1248,14 +1247,14 @@ submit_bio_retry:
}
static void z_erofs_runqueue(struct super_block *sb,
- struct z_erofs_collector *clt,
+ struct z_erofs_decompress_frontend *f,
struct list_head *pagepool, bool force_fg)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
- z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg);
+ z_erofs_submit_queue(sb, f, pagepool, io, &force_fg);
/* handle bypass queue (no i/o pclusters) immediately */
z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
@@ -1282,11 +1281,11 @@ static int z_erofs_readpage(struct file *file, struct page *page)
f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
- err = z_erofs_do_read_page(&f, page, &pagepool);
+ err = z_erofs_do_read_page(&f, page);
(void)z_erofs_collector_end(&f.clt);
/* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true);
+ z_erofs_runqueue(inode->i_sb, &f, &pagepool, true);
if (err)
erofs_err(inode->i_sb, "failed to read, err [%d]", err);
@@ -1299,25 +1298,20 @@ static int z_erofs_readpage(struct file *file, struct page *page)
return err;
}
-static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
- unsigned int nr)
-{
- return nr <= sbi->ctx.max_sync_decompress_pages;
-}
-
static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
- bool sync = should_decompress_synchronously(sbi, readahead_count(rac));
+ unsigned int nr_pages = readahead_count(rac);
+ bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
struct page *page, *head = NULL;
LIST_HEAD(pagepool);
- trace_erofs_readpages(inode, readahead_index(rac),
- readahead_count(rac), false);
+ trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ f.readahead = true;
f.headoffset = readahead_pos(rac);
while ((page = readahead_page(rac))) {
@@ -1341,7 +1335,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
/* traversal in reverse order */
head = (void *)page_private(page);
- err = z_erofs_do_read_page(&f, page, &pagepool);
+ err = z_erofs_do_read_page(&f, page);
if (err)
erofs_err(inode->i_sb,
"readahead error at page %lu @ nid %llu",
@@ -1351,7 +1345,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
(void)z_erofs_collector_end(&f.clt);
- z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync);
+ z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync);
if (f.map.mpage)
put_page(f.map.mpage);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index e0decff22ae2..4df61129566d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -218,8 +218,7 @@ struct eventpoll {
struct file *file;
/* used to optimize loop detection check */
- struct list_head visited_list_link;
- int visited;
+ u64 gen;
#ifdef CONFIG_NET_RX_BUSY_POLL
/* used to track busy poll napi_id */
@@ -274,6 +273,8 @@ static long max_user_watches __read_mostly;
*/
static DEFINE_MUTEX(epmutex);
+static u64 loop_check_gen = 0;
+
/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;
@@ -283,9 +284,6 @@ static struct kmem_cache *epi_cache __read_mostly;
/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;
-/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
-static LIST_HEAD(visited_list);
-
/*
* List of files with newly added links, where we may need to limit the number
* of emanating paths. Protected by the epmutex.
@@ -1450,7 +1448,7 @@ static int reverse_path_check(void)
static int ep_create_wakeup_source(struct epitem *epi)
{
- const char *name;
+ struct name_snapshot n;
struct wakeup_source *ws;
if (!epi->ep->ws) {
@@ -1459,8 +1457,9 @@ static int ep_create_wakeup_source(struct epitem *epi)
return -ENOMEM;
}
- name = epi->ffd.file->f_path.dentry->d_name.name;
- ws = wakeup_source_register(NULL, name);
+ take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
+ ws = wakeup_source_register(NULL, n.name.name);
+ release_dentry_name_snapshot(&n);
if (!ws)
return -ENOMEM;
@@ -1522,6 +1521,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
RCU_INIT_POINTER(epi->ws, NULL);
}
+ /* Add the current item to the list of active epoll hook for this file */
+ spin_lock(&tfile->f_lock);
+ list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
+ spin_unlock(&tfile->f_lock);
+
+ /*
+ * Add the current item to the RB tree. All RB tree operations are
+ * protected by "mtx", and ep_insert() is called with "mtx" held.
+ */
+ ep_rbtree_insert(ep, epi);
+
+ /* now check if we've created too many backpaths */
+ error = -EINVAL;
+ if (full_check && reverse_path_check())
+ goto error_remove_epi;
+
/* Initialize the poll table using the queue callback */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
@@ -1544,22 +1559,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
if (epi->nwait < 0)
goto error_unregister;
- /* Add the current item to the list of active epoll hook for this file */
- spin_lock(&tfile->f_lock);
- list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
- spin_unlock(&tfile->f_lock);
-
- /*
- * Add the current item to the RB tree. All RB tree operations are
- * protected by "mtx", and ep_insert() is called with "mtx" held.
- */
- ep_rbtree_insert(ep, epi);
-
- /* now check if we've created too many backpaths */
- error = -EINVAL;
- if (full_check && reverse_path_check())
- goto error_remove_epi;
-
/* We have to drop the new item inside our item list to keep track of it */
write_lock_irq(&ep->lock);
@@ -1588,6 +1587,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
return 0;
+error_unregister:
+ ep_unregister_pollwait(ep, epi);
error_remove_epi:
spin_lock(&tfile->f_lock);
list_del_rcu(&epi->fllink);
@@ -1595,9 +1596,6 @@ error_remove_epi:
rb_erase_cached(&epi->rbn, &ep->rbr);
-error_unregister:
- ep_unregister_pollwait(ep, epi);
-
/*
* We need to do this because an event could have been arrived on some
* allocated wait queue. Note that we don't care about the ep->ovflist
@@ -1972,13 +1970,12 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
struct epitem *epi;
mutex_lock_nested(&ep->mtx, call_nests + 1);
- ep->visited = 1;
- list_add(&ep->visited_list_link, &visited_list);
+ ep->gen = loop_check_gen;
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
epi = rb_entry(rbp, struct epitem, rbn);
if (unlikely(is_file_epoll(epi->ffd.file))) {
ep_tovisit = epi->ffd.file->private_data;
- if (ep_tovisit->visited)
+ if (ep_tovisit->gen == loop_check_gen)
continue;
error = ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, epi->ffd.file,
@@ -1995,9 +1992,9 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
* during ep_insert().
*/
if (list_empty(&epi->ffd.file->f_tfile_llink)) {
- get_file(epi->ffd.file);
- list_add(&epi->ffd.file->f_tfile_llink,
- &tfile_check_list);
+ if (get_file_rcu(epi->ffd.file))
+ list_add(&epi->ffd.file->f_tfile_llink,
+ &tfile_check_list);
}
}
}
@@ -2019,18 +2016,8 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
*/
static int ep_loop_check(struct eventpoll *ep, struct file *file)
{
- int ret;
- struct eventpoll *ep_cur, *ep_next;
-
- ret = ep_call_nested(&poll_loop_ncalls,
+ return ep_call_nested(&poll_loop_ncalls,
ep_loop_check_proc, file, ep, current);
- /* clear visited list */
- list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
- visited_list_link) {
- ep_cur->visited = 0;
- list_del(&ep_cur->visited_list_link);
- }
- return ret;
}
static void clear_tfile_check_list(void)
@@ -2195,11 +2182,13 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
if (!list_empty(&f.file->f_ep_links) ||
+ ep->gen == loop_check_gen ||
is_file_epoll(tf.file)) {
mutex_unlock(&ep->mtx);
error = epoll_mutex_lock(&epmutex, 0, nonblock);
if (error)
goto error_tgt_fput;
+ loop_check_gen++;
full_check = 1;
if (is_file_epoll(tf.file)) {
error = -ELOOP;
@@ -2263,6 +2252,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
error_tgt_fput:
if (full_check) {
clear_tfile_check_list();
+ loop_check_gen++;
mutex_unlock(&epmutex);
}
diff --git a/fs/exfat/cache.c b/fs/exfat/cache.c
index 03d0824fc368..5a2f119b7e8c 100644
--- a/fs/exfat/cache.c
+++ b/fs/exfat/cache.c
@@ -17,7 +17,6 @@
#include "exfat_raw.h"
#include "exfat_fs.h"
-#define EXFAT_CACHE_VALID 0
#define EXFAT_MAX_CACHE 16
struct exfat_cache {
@@ -61,16 +60,6 @@ void exfat_cache_shutdown(void)
kmem_cache_destroy(exfat_cachep);
}
-void exfat_cache_init_inode(struct inode *inode)
-{
- struct exfat_inode_info *ei = EXFAT_I(inode);
-
- spin_lock_init(&ei->cache_lru_lock);
- ei->nr_caches = 0;
- ei->cache_valid_id = EXFAT_CACHE_VALID + 1;
- INIT_LIST_HEAD(&ei->cache_lru);
-}
-
static inline struct exfat_cache *exfat_cache_alloc(void)
{
return kmem_cache_alloc(exfat_cachep, GFP_NOFS);
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 95d717f8620c..c013fe931d9c 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -248,6 +248,8 @@ struct exfat_sb_info {
struct rcu_head rcu;
};
+#define EXFAT_CACHE_VALID 0
+
/*
* EXFAT file system inode in-memory data
*/
@@ -428,7 +430,6 @@ extern const struct dentry_operations exfat_utf8_dentry_ops;
/* cache.c */
int exfat_cache_init(void);
void exfat_cache_shutdown(void);
-void exfat_cache_init_inode(struct inode *inode);
void exfat_cache_inval_inode(struct inode *inode);
int exfat_get_cluster(struct inode *inode, unsigned int cluster,
unsigned int *fclus, unsigned int *dclus,
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 7f90204adef5..a6de17cac3df 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -611,8 +611,6 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
ei->i_crtime = info->crtime;
inode->i_atime = info->atime;
- exfat_cache_init_inode(inode);
-
return 0;
}
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index e73f20f66cb2..c94ac239f740 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -578,7 +578,8 @@ static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
i_pos = exfat_make_i_pos(&info);
inode = exfat_build_inode(sb, &info, i_pos);
- if (IS_ERR(inode))
+ err = PTR_ERR_OR_ZERO(inode);
+ if (err)
goto unlock;
inode_inc_iversion(inode);
@@ -745,10 +746,9 @@ static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry,
i_pos = exfat_make_i_pos(&info);
inode = exfat_build_inode(sb, &info, i_pos);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
+ err = PTR_ERR_OR_ZERO(inode);
+ if (err)
goto unlock;
- }
i_mode = inode->i_mode;
alias = d_find_alias(inode);
@@ -890,10 +890,9 @@ static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
i_pos = exfat_make_i_pos(&info);
inode = exfat_build_inode(sb, &info, i_pos);
- if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
+ err = PTR_ERR_OR_ZERO(inode);
+ if (err)
goto unlock;
- }
inode_inc_iversion(inode);
inode->i_mtime = inode->i_atime = inode->i_ctime =
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 3b6a1659892f..60b941ba557b 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -376,7 +376,6 @@ static int exfat_read_root(struct inode *inode)
inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime =
current_time(inode);
exfat_truncate_atime(&inode->i_atime);
- exfat_cache_init_inode(inode);
return 0;
}
@@ -763,6 +762,10 @@ static void exfat_inode_init_once(void *foo)
{
struct exfat_inode_info *ei = (struct exfat_inode_info *)foo;
+ spin_lock_init(&ei->cache_lru_lock);
+ ei->nr_caches = 0;
+ ei->cache_valid_id = EXFAT_CACHE_VALID + 1;
+ INIT_LIST_HEAD(&ei->cache_lru);
INIT_HLIST_NODE(&ei->i_hash_fat);
inode_init_once(&ei->vfs_inode);
}
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 60378ddf1424..96044f5dbc0e 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -93,8 +93,10 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
struct inode *inode = file_inode(vmf->vma->vm_file);
struct ext2_inode_info *ei = EXT2_I(inode);
vm_fault_t ret;
+ bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
+ (vmf->vma->vm_flags & VM_SHARED);
- if (vmf->flags & FAULT_FLAG_WRITE) {
+ if (write) {
sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);
}
@@ -103,7 +105,7 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops);
up_read(&ei->dax_sem);
- if (vmf->flags & FAULT_FLAG_WRITE)
+ if (write)
sb_end_pagefault(inode->i_sb);
return ret;
}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 1d82336b1cd4..efe77cffc322 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -148,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
}
if (IS_ENCRYPTED(inode)) {
- err = fscrypt_fname_alloc_buffer(inode, EXT4_NAME_LEN, &fstr);
+ err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN, &fstr);
if (err < 0)
return err;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 523e00d7b392..f9a692c0a66c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1401,7 +1401,7 @@ struct ext4_super_block {
#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
#ifdef CONFIG_FS_ENCRYPTION
-#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_ctx.ctx != NULL)
+#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL)
#else
#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
#endif
@@ -1596,8 +1596,8 @@ struct ext4_sb_info {
atomic_t s_warning_count;
atomic_t s_msg_count;
- /* Encryption context for '-o test_dummy_encryption' */
- struct fscrypt_dummy_context s_dummy_enc_ctx;
+ /* Encryption policy for '-o test_dummy_encryption' */
+ struct fscrypt_dummy_policy s_dummy_enc_policy;
/*
* Barrier between writepages ops and changing any inode's JOURNAL_DATA
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index df25d38d6539..698ca4a4db5f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -742,6 +742,53 @@ not_found:
return 1;
}
+static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode,
+ bool encrypt)
+{
+ struct super_block *sb = dir->i_sb;
+ int nblocks = 0;
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+ struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
+
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+ if (p) {
+ int acl_size = p->a_count * sizeof(ext4_acl_entry);
+
+ nblocks += (S_ISDIR(mode) ? 2 : 1) *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, acl_size,
+ true /* is_create */);
+ posix_acl_release(p);
+ }
+#endif
+
+#ifdef CONFIG_SECURITY
+ {
+ int num_security_xattrs = 1;
+
+#ifdef CONFIG_INTEGRITY
+ num_security_xattrs++;
+#endif
+ /*
+ * We assume that security xattrs are never more than 1k.
+ * In practice they are under 128 bytes.
+ */
+ nblocks += num_security_xattrs *
+ __ext4_xattr_set_credits(sb, NULL /* inode */,
+ NULL /* block_bh */, 1024,
+ true /* is_create */);
+ }
+#endif
+ if (encrypt)
+ nblocks += __ext4_xattr_set_credits(sb,
+ NULL /* inode */,
+ NULL /* block_bh */,
+ FSCRYPT_SET_CONTEXT_MAX_SIZE,
+ true /* is_create */);
+ return nblocks;
+}
+
/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
@@ -772,7 +819,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ext4_group_t i;
ext4_group_t flex_group;
struct ext4_group_info *grp;
- int encrypt = 0;
+ bool encrypt = false;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -784,59 +831,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
if (unlikely(ext4_forced_shutdown(sbi)))
return ERR_PTR(-EIO);
- if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) &&
- !(i_flags & EXT4_EA_INODE_FL)) {
- err = fscrypt_get_encryption_info(dir);
- if (err)
- return ERR_PTR(err);
- if (!fscrypt_has_encryption_key(dir))
- return ERR_PTR(-ENOKEY);
- encrypt = 1;
- }
-
- if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
- struct posix_acl *p = get_acl(dir, ACL_TYPE_DEFAULT);
-
- if (IS_ERR(p))
- return ERR_CAST(p);
- if (p) {
- int acl_size = p->a_count * sizeof(ext4_acl_entry);
-
- nblocks += (S_ISDIR(mode) ? 2 : 1) *
- __ext4_xattr_set_credits(sb, NULL /* inode */,
- NULL /* block_bh */, acl_size,
- true /* is_create */);
- posix_acl_release(p);
- }
-#endif
-
-#ifdef CONFIG_SECURITY
- {
- int num_security_xattrs = 1;
-
-#ifdef CONFIG_INTEGRITY
- num_security_xattrs++;
-#endif
- /*
- * We assume that security xattrs are never
- * more than 1k. In practice they are under
- * 128 bytes.
- */
- nblocks += num_security_xattrs *
- __ext4_xattr_set_credits(sb, NULL /* inode */,
- NULL /* block_bh */, 1024,
- true /* is_create */);
- }
-#endif
- if (encrypt)
- nblocks += __ext4_xattr_set_credits(sb,
- NULL /* inode */, NULL /* block_bh */,
- FSCRYPT_SET_CONTEXT_MAX_SIZE,
- true /* is_create */);
- }
-
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
inode = new_inode(sb);
@@ -866,10 +860,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
else
ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);
+ if (!(i_flags & EXT4_EA_INODE_FL)) {
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
+ if (err)
+ goto out;
+ }
+
err = dquot_initialize(inode);
if (err)
goto out;
+ if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
+ ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt);
+ if (ret2 < 0) {
+ err = ret2;
+ goto out;
+ }
+ nblocks += ret2;
+ }
+
if (!goal)
goal = sbi->s_inode_goal;
@@ -1162,7 +1171,7 @@ got:
* prevent its deduplication.
*/
if (encrypt) {
- err = fscrypt_inherit_context(dir, inode, handle, true);
+ err = fscrypt_set_context(inode, handle);
if (err)
goto fail_free_drop;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 153a9fbe1dd0..0d74615fcce3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -663,8 +663,7 @@ static struct stats dx_show_leaf(struct inode *dir,
/* Directory is encrypted */
res = fscrypt_fname_alloc_buffer(
- dir, len,
- &fname_crypto_str);
+ len, &fname_crypto_str);
if (res)
printk(KERN_WARNING "Error "
"allocating crypto "
@@ -1016,8 +1015,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
brelse(bh);
return err;
}
- err = fscrypt_fname_alloc_buffer(dir, EXT4_NAME_LEN,
- &fname_crypto_str);
+ err = fscrypt_fname_alloc_buffer(EXT4_NAME_LEN,
+ &fname_crypto_str);
if (err < 0) {
brelse(bh);
return err;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ea425b49b345..8b2736283481 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1104,7 +1104,7 @@ static void ext4_put_super(struct super_block *sb)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->s_blockgroup_lock);
fs_put_dax(sbi->s_daxdev);
- fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx);
+ fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
#ifdef CONFIG_UNICODE
utf8_unload(sbi->s_encoding);
#endif
@@ -1392,10 +1392,9 @@ retry:
return res;
}
-static const union fscrypt_context *
-ext4_get_dummy_context(struct super_block *sb)
+static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb)
{
- return EXT4_SB(sb)->s_dummy_enc_ctx.ctx;
+ return EXT4_SB(sb)->s_dummy_enc_policy.policy;
}
static bool ext4_has_stable_inodes(struct super_block *sb)
@@ -1414,7 +1413,7 @@ static const struct fscrypt_operations ext4_cryptops = {
.key_prefix = "ext4:",
.get_context = ext4_get_context,
.set_context = ext4_set_context,
- .get_dummy_context = ext4_get_dummy_context,
+ .get_dummy_policy = ext4_get_dummy_policy,
.empty_dir = ext4_empty_dir,
.max_namelen = EXT4_NAME_LEN,
.has_stable_inodes = ext4_has_stable_inodes,
@@ -1888,12 +1887,13 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb,
* needed to allow it to be set or changed during remount. We do allow
* it to be specified during remount, but only if there is no change.
*/
- if (is_remount && !sbi->s_dummy_enc_ctx.ctx) {
+ if (is_remount && !sbi->s_dummy_enc_policy.policy) {
ext4_msg(sb, KERN_WARNING,
"Can't set test_dummy_encryption on remount");
return -1;
}
- err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx);
+ err = fscrypt_set_test_dummy_encryption(sb, arg->from,
+ &sbi->s_dummy_enc_policy);
if (err) {
if (err == -EEXIST)
ext4_msg(sb, KERN_WARNING,
@@ -4935,7 +4935,7 @@ failed_mount:
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(get_qf_name(sb, sbi, i));
#endif
- fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx);
+ fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
ext4_blkdev_remove(sbi);
brelse(bh);
out_fail:
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ed2bca0fce92..73683e58a08d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -3550,6 +3550,9 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
unsigned long align = offset | iov_iter_alignment(iter);
struct block_device *bdev = inode->i_sb->s_bdev;
+ if (iov_iter_rw(iter) == READ && offset >= i_size_read(inode))
+ return 1;
+
if (align & blocksize_mask) {
if (bdev)
blkbits = blksize_bits(bdev_logical_block_size(bdev));
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 069f498af1e3..53fbc4dd6e48 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -111,7 +111,7 @@ static int __f2fs_setup_filename(const struct inode *dir,
#ifdef CONFIG_FS_ENCRYPTION
fname->crypto_buf = crypt_name->crypto_buf;
#endif
- if (crypt_name->is_ciphertext_name) {
+ if (crypt_name->is_nokey_name) {
/* hash was decoded from the no-key name */
fname->hash = cpu_to_le32(crypt_name->hash);
} else {
@@ -537,7 +537,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
goto put_error;
if (IS_ENCRYPTED(inode)) {
- err = fscrypt_inherit_context(dir, inode, page, false);
+ err = fscrypt_set_context(inode, page);
if (err)
goto put_error;
}
@@ -1032,7 +1032,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
if (err)
goto out;
- err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr);
+ err = fscrypt_fname_alloc_buffer(F2FS_NAME_LEN, &fstr);
if (err < 0)
goto out;
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index d9e52a7f3702..7c089ff7ff94 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -138,7 +138,7 @@ struct f2fs_mount_info {
int fsync_mode; /* fsync policy */
int fs_mode; /* fs mode: LFS or ADAPTIVE */
int bggc_mode; /* bggc mode: off, on or sync */
- struct fscrypt_dummy_context dummy_enc_ctx; /* test dummy encryption */
+ struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption */
block_t unusable_cap_perc; /* percentage for cap */
block_t unusable_cap; /* Amount of space allowed to be
* unusable when disabling checkpoint
@@ -1315,13 +1315,6 @@ enum fsync_mode {
#define IS_IO_TRACED_PAGE(page) (0)
#endif
-#ifdef CONFIG_FS_ENCRYPTION
-#define DUMMY_ENCRYPTION_ENABLED(sbi) \
- (unlikely(F2FS_OPTION(sbi).dummy_enc_ctx.ctx != NULL))
-#else
-#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
-#endif
-
/* For compression */
enum compress_algorithm_type {
COMPRESS_LZO,
@@ -4022,22 +4015,6 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
}
-static inline bool f2fs_may_encrypt(struct inode *dir, struct inode *inode)
-{
-#ifdef CONFIG_FS_ENCRYPTION
- struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- umode_t mode = inode->i_mode;
-
- /*
- * If the directory encrypted or dummy encryption enabled,
- * then we should encrypt the inode.
- */
- if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi))
- return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
-#endif
- return false;
-}
-
static inline bool f2fs_may_compress(struct inode *inode)
{
if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) ||
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 84e4bbc1a64d..45f324511a19 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -28,6 +28,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
nid_t ino;
struct inode *inode;
bool nid_free = false;
+ bool encrypt = false;
int xattr_size = 0;
int err;
@@ -69,13 +70,17 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns,
F2FS_DEF_PROJID);
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
+ if (err)
+ goto fail_drop;
+
err = dquot_initialize(inode);
if (err)
goto fail_drop;
set_inode_flag(inode, FI_NEW_INODE);
- if (f2fs_may_encrypt(dir, inode))
+ if (encrypt)
f2fs_set_encrypted_inode(inode);
if (f2fs_sb_has_extra_attr(sbi)) {
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3ad7bdbda5ca..cb1b5b61a1da 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -2373,6 +2373,9 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
if (unlikely(nid >= nm_i->max_nid))
nid = 0;
+ if (unlikely(nid % NAT_ENTRY_PER_BLOCK))
+ nid = NAT_BLOCK_OFFSET(nid) * NAT_ENTRY_PER_BLOCK;
+
/* Enough entries */
if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
return 0;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index a65d357f89a9..e247a5ef3713 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -799,7 +799,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
if (__is_large_section(sbi)) {
unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
- unsigned short valid_blocks =
+ block_t valid_blocks =
get_valid_blocks(sbi, segno, true);
f2fs_bug_on(sbi, unlikely(!valid_blocks ||
@@ -815,7 +815,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
enum dirty_type dirty_type)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned short valid_blocks;
+ block_t valid_blocks;
if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type]))
dirty_i->nr_dirty[dirty_type]--;
@@ -4316,8 +4316,8 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int segno = 0, offset = 0, secno;
- unsigned short valid_blocks;
- unsigned short blks_per_sec = BLKS_PER_SEC(sbi);
+ block_t valid_blocks;
+ block_t blks_per_sec = BLKS_PER_SEC(sbi);
while (1) {
/* find dirty segment based on free segmap */
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index dfa072fa8081..bef2be3fa3d0 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -433,12 +433,12 @@ static int f2fs_set_test_dummy_encryption(struct super_block *sb,
* needed to allow it to be set or changed during remount. We do allow
* it to be specified during remount, but only if there is no change.
*/
- if (is_remount && !F2FS_OPTION(sbi).dummy_enc_ctx.ctx) {
+ if (is_remount && !F2FS_OPTION(sbi).dummy_enc_policy.policy) {
f2fs_warn(sbi, "Can't set test_dummy_encryption on remount");
return -EINVAL;
}
err = fscrypt_set_test_dummy_encryption(
- sb, arg, &F2FS_OPTION(sbi).dummy_enc_ctx);
+ sb, arg->from, &F2FS_OPTION(sbi).dummy_enc_policy);
if (err) {
if (err == -EEXIST)
f2fs_warn(sbi,
@@ -1275,7 +1275,7 @@ static void f2fs_put_super(struct super_block *sb)
for (i = 0; i < MAXQUOTAS; i++)
kfree(F2FS_OPTION(sbi).s_qf_names[i]);
#endif
- fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx);
+ fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy);
destroy_percpu_info(sbi);
for (i = 0; i < NR_PAGE_TYPE; i++)
kvfree(sbi->write_io[i]);
@@ -2482,10 +2482,9 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
ctx, len, fs_data, XATTR_CREATE);
}
-static const union fscrypt_context *
-f2fs_get_dummy_context(struct super_block *sb)
+static const union fscrypt_policy *f2fs_get_dummy_policy(struct super_block *sb)
{
- return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_ctx.ctx;
+ return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_policy.policy;
}
static bool f2fs_has_stable_inodes(struct super_block *sb)
@@ -2523,7 +2522,7 @@ static const struct fscrypt_operations f2fs_cryptops = {
.key_prefix = "f2fs:",
.get_context = f2fs_get_context,
.set_context = f2fs_set_context,
- .get_dummy_context = f2fs_get_dummy_context,
+ .get_dummy_policy = f2fs_get_dummy_policy,
.empty_dir = f2fs_empty_dir,
.max_namelen = F2FS_NAME_LEN,
.has_stable_inodes = f2fs_has_stable_inodes,
@@ -3864,7 +3863,7 @@ free_options:
for (i = 0; i < MAXQUOTAS; i++)
kfree(F2FS_OPTION(sbi).s_qf_names[i]);
#endif
- fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx);
+ fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy);
kvfree(options);
free_sb_buf:
kfree(raw_super);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d4f84a2fe087..e6005c78bfa9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2184,7 +2184,7 @@ static int __init start_dirtytime_writeback(void)
__initcall(start_dirtytime_writeback);
int dirtytime_interval_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 6611ef3269a8..43c165e796da 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -3091,11 +3091,10 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
ssize_t ret = 0;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
- bool async_dio = ff->fc->async_dio;
loff_t pos = 0;
struct inode *inode;
loff_t i_size;
- size_t count = iov_iter_count(iter);
+ size_t count = iov_iter_count(iter), shortened = 0;
loff_t offset = iocb->ki_pos;
struct fuse_io_priv *io;
@@ -3103,17 +3102,9 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
inode = file->f_mapping->host;
i_size = i_size_read(inode);
- if ((iov_iter_rw(iter) == READ) && (offset > i_size))
+ if ((iov_iter_rw(iter) == READ) && (offset >= i_size))
return 0;
- /* optimization for short read */
- if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
- if (offset >= i_size)
- return 0;
- iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
- count = iov_iter_count(iter);
- }
-
io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
if (!io)
return -ENOMEM;
@@ -3129,15 +3120,22 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* By default, we want to optimize all I/Os with async request
* submission to the client filesystem if supported.
*/
- io->async = async_dio;
+ io->async = ff->fc->async_dio;
io->iocb = iocb;
io->blocking = is_sync_kiocb(iocb);
+ /* optimization for short read */
+ if (io->async && !io->write && offset + count > i_size) {
+ iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
+ shortened = count - iov_iter_count(iter);
+ count -= shortened;
+ }
+
/*
* We cannot asynchronously extend the size of a file.
* In such case the aio will behave exactly like sync io.
*/
- if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE)
+ if ((offset + count > i_size) && io->write)
io->blocking = true;
if (io->async && io->blocking) {
@@ -3155,6 +3153,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
} else {
ret = __fuse_direct_read(io, iter, &pos);
}
+ iov_iter_reexpand(iter, iov_iter_count(iter) + shortened);
if (io->async) {
bool blocking = io->blocking;
diff --git a/fs/internal.h b/fs/internal.h
index 10517ece4516..a7cd0f64faa4 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -82,9 +82,6 @@ int may_linkat(struct path *link);
/*
* namespace.c
*/
-extern void *copy_mount_options(const void __user *);
-extern char *copy_mount_string(const void __user *);
-
extern struct vfsmount *lookup_mnt(const struct path *);
extern int finish_automount(struct vfsmount *, struct path *);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index ce69bd9b0838..f58b3d6bba8a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1753,6 +1753,9 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb,
struct io_ring_ctx *ctx = req->ctx;
int ret, notify;
+ if (tsk->flags & PF_EXITING)
+ return -ESRCH;
+
/*
* SQPOLL kernel thread doesn't need notification, just a wakeup. For
* all other cases, use TWA_SIGNAL unconditionally to ensure we're
@@ -1787,8 +1790,10 @@ static void __io_req_task_cancel(struct io_kiocb *req, int error)
static void io_req_task_cancel(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct io_ring_ctx *ctx = req->ctx;
__io_req_task_cancel(req, -ECANCELED);
+ percpu_ref_put(&ctx->refs);
}
static void __io_req_task_submit(struct io_kiocb *req)
@@ -2010,6 +2015,12 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
static inline bool io_run_task_work(void)
{
+ /*
+ * Not safe to run on exiting task, and the task_work handling will
+ * not add work to such a task.
+ */
+ if (unlikely(current->flags & PF_EXITING))
+ return false;
if (current->task_works) {
__set_current_state(TASK_RUNNING);
task_work_run();
@@ -2283,13 +2294,17 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)
goto end_req;
}
- ret = io_import_iovec(rw, req, &iovec, &iter, false);
- if (ret < 0)
- goto end_req;
- ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
- if (!ret)
+ if (!req->io) {
+ ret = io_import_iovec(rw, req, &iovec, &iter, false);
+ if (ret < 0)
+ goto end_req;
+ ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
+ if (!ret)
+ return true;
+ kfree(iovec);
+ } else {
return true;
- kfree(iovec);
+ }
end_req:
req_set_fail_links(req);
io_req_complete(req, ret);
@@ -2300,8 +2315,11 @@ end_req:
static bool io_rw_reissue(struct io_kiocb *req, long res)
{
#ifdef CONFIG_BLOCK
+ umode_t mode = file_inode(req->file)->i_mode;
int ret;
+ if (!S_ISBLK(mode) && !S_ISREG(mode))
+ return false;
if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
return false;
@@ -2834,13 +2852,8 @@ static ssize_t __io_import_iovec(int rw, struct io_kiocb *req,
return ret;
}
-#ifdef CONFIG_COMPAT
- if (req->ctx->compat)
- return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV,
- iovec, iter);
-#endif
-
- return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter);
+ return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter,
+ req->ctx->compat);
}
static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
@@ -2977,14 +2990,15 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw,
bool force_nonblock)
{
struct io_async_rw *iorw = &req->io->rw;
+ struct iovec *iov;
ssize_t ret;
- iorw->iter.iov = iorw->fast_iov;
- ret = __io_import_iovec(rw, req, (struct iovec **) &iorw->iter.iov,
- &iorw->iter, !force_nonblock);
+ iorw->iter.iov = iov = iorw->fast_iov;
+ ret = __io_import_iovec(rw, req, &iov, &iorw->iter, !force_nonblock);
if (unlikely(ret < 0))
return ret;
+ iorw->iter.iov = iov;
io_req_map_rw(req, iorw->iter.iov, iorw->fast_iov, &iorw->iter);
return 0;
}
@@ -3030,6 +3044,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
if (!wake_page_match(wpq, key))
return 0;
+ req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
list_del_init(&wait->entry);
init_task_work(&req->task_work, io_req_task_submit);
@@ -3087,6 +3102,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
wait->wait.flags = 0;
INIT_LIST_HEAD(&wait->wait.entry);
kiocb->ki_flags |= IOCB_WAITQ;
+ kiocb->ki_flags &= ~IOCB_NOWAIT;
kiocb->ki_waitq = wait;
io_get_req_task(req);
@@ -3111,6 +3127,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
struct iov_iter __iter, *iter = &__iter;
ssize_t io_size, ret, ret2;
size_t iov_count;
+ bool no_async;
if (req->io)
iter = &req->io->rw.iter;
@@ -3128,7 +3145,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
kiocb->ki_flags &= ~IOCB_NOWAIT;
/* If the file doesn't support async, just async punt */
- if (force_nonblock && !io_file_supports_async(req->file, READ))
+ no_async = force_nonblock && !io_file_supports_async(req->file, READ);
+ if (no_async)
goto copy_iov;
ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), iov_count);
@@ -3146,12 +3164,13 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
goto done;
+ /* no retry on NONBLOCK marked file */
+ if (req->file->f_flags & O_NONBLOCK)
+ goto done;
/* some cases will consume bytes even on error returns */
iov_iter_revert(iter, iov_count - iov_iter_count(iter));
- ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
- if (ret)
- goto out_free;
- return -EAGAIN;
+ ret = 0;
+ goto copy_iov;
} else if (ret < 0) {
/* make sure -ERESTARTSYS -> -EINTR is done */
goto done;
@@ -3169,6 +3188,8 @@ copy_iov:
ret = ret2;
goto out_free;
}
+ if (no_async)
+ return -EAGAIN;
/* it's copied and will be cleaned with ->io */
iovec = NULL;
/* now use our persistent iterator, if we aren't already */
@@ -3287,10 +3308,14 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
*/
if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
ret2 = -EAGAIN;
+ /* no retry on NONBLOCK marked file */
+ if (ret2 == -EAGAIN && (req->file->f_flags & O_NONBLOCK))
+ goto done;
if (!force_nonblock || ret2 != -EAGAIN) {
/* IOPOLL retry should happen for io-wq threads */
if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN)
goto copy_iov;
+done:
kiocb_done(kiocb, ret2, cs);
} else {
copy_iov:
@@ -3497,8 +3522,6 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
const char __user *fname;
int ret;
- if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
- return -EINVAL;
if (unlikely(sqe->ioprio || sqe->buf_index))
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
@@ -3525,6 +3548,8 @@ static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
u64 flags, mode;
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ return -EINVAL;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
mode = READ_ONCE(sqe->len);
@@ -3539,6 +3564,8 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
size_t len;
int ret;
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL)))
+ return -EINVAL;
if (req->flags & REQ_F_NEED_CLEANUP)
return 0;
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
@@ -3756,7 +3783,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
#if defined(CONFIG_EPOLL)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
return -EINVAL;
req->epoll.epfd = READ_ONCE(sqe->fd);
@@ -3871,7 +3898,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
- if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+ if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL)))
return -EINVAL;
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
@@ -4168,8 +4195,9 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
sr->len);
iomsg->iov = NULL;
} else {
- ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
- &iomsg->iov, &iomsg->msg.msg_iter);
+ ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
+ &iomsg->iov, &iomsg->msg.msg_iter,
+ false);
if (ret > 0)
ret = 0;
}
@@ -4209,9 +4237,9 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
sr->len = iomsg->iov[0].iov_len;
iomsg->iov = NULL;
} else {
- ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
- &iomsg->iov,
- &iomsg->msg.msg_iter);
+ ret = __import_iovec(READ, (struct iovec __user *)uiov, len,
+ UIO_FASTIOV, &iomsg->iov,
+ &iomsg->msg.msg_iter, true);
if (ret < 0)
return ret;
}
@@ -4713,6 +4741,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
if (mask && !(mask & poll->events))
return 0;
+ list_del_init(&wait->entry);
+
if (poll && poll->head) {
bool done;
@@ -5388,6 +5418,8 @@ static int io_async_cancel(struct io_kiocb *req)
static int io_files_update_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
+ if (unlikely(req->ctx->flags & IORING_SETUP_SQPOLL))
+ return -EINVAL;
if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT)))
return -EINVAL;
if (sqe->ioprio || sqe->rw_flags)
@@ -5438,6 +5470,8 @@ static int io_req_defer_prep(struct io_kiocb *req,
if (unlikely(ret))
return ret;
+ io_prep_async_work(req);
+
switch (req->opcode) {
case IORING_OP_NOP:
break;
@@ -5635,6 +5669,11 @@ static void __io_clean_op(struct io_kiocb *req)
io_put_file(req, req->splice.file_in,
(req->splice.flags & SPLICE_F_FD_IN_FIXED));
break;
+ case IORING_OP_OPENAT:
+ case IORING_OP_OPENAT2:
+ if (req->open.filename)
+ putname(req->open.filename);
+ break;
}
req->flags &= ~REQ_F_NEED_CLEANUP;
}
@@ -6312,9 +6351,6 @@ static void io_submit_state_start(struct io_submit_state *state,
struct io_ring_ctx *ctx, unsigned int max_ios)
{
blk_start_plug(&state->plug);
-#ifdef CONFIG_BLOCK
- state->plug.nowait = true;
-#endif
state->comp.nr = 0;
INIT_LIST_HEAD(&state->comp.list);
state->comp.ctx = ctx;
@@ -7324,7 +7360,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
index = i & IORING_FILE_TABLE_MASK;
if (table->files[index]) {
- file = io_file_from_index(ctx, index);
+ file = table->files[index];
err = io_queue_file_removal(data, file);
if (err)
break;
@@ -7353,6 +7389,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
table->files[index] = file;
err = io_sqe_file_register(ctx, file, i);
if (err) {
+ table->files[index] = NULL;
fput(file);
break;
}
@@ -8012,6 +8049,28 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
return false;
}
+static inline bool io_match_files(struct io_kiocb *req,
+ struct files_struct *files)
+{
+ return (req->flags & REQ_F_WORK_INITIALIZED) && req->work.files == files;
+}
+
+static bool io_match_link_files(struct io_kiocb *req,
+ struct files_struct *files)
+{
+ struct io_kiocb *link;
+
+ if (io_match_files(req, files))
+ return true;
+ if (req->flags & REQ_F_LINK_HEAD) {
+ list_for_each_entry(link, &req->link_list, link_list) {
+ if (io_match_files(link, files))
+ return true;
+ }
+ }
+ return false;
+}
+
/*
* We're looking to cancel 'req' because it's holding on to our files, but
* 'req' could be a link to another request. See if it is, and cancel that
@@ -8086,12 +8145,38 @@ static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
io_timeout_remove_link(ctx, req);
}
+static void io_cancel_defer_files(struct io_ring_ctx *ctx,
+ struct files_struct *files)
+{
+ struct io_defer_entry *de = NULL;
+ LIST_HEAD(list);
+
+ spin_lock_irq(&ctx->completion_lock);
+ list_for_each_entry_reverse(de, &ctx->defer_list, list) {
+ if (io_match_link_files(de->req, files)) {
+ list_cut_position(&list, &ctx->defer_list, &de->list);
+ break;
+ }
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+
+ while (!list_empty(&list)) {
+ de = list_first_entry(&list, struct io_defer_entry, list);
+ list_del_init(&de->list);
+ req_set_fail_links(de->req);
+ io_put_req(de->req);
+ io_req_complete(de->req, -ECANCELED);
+ kfree(de);
+ }
+}
+
static void io_uring_cancel_files(struct io_ring_ctx *ctx,
struct files_struct *files)
{
if (list_empty_careful(&ctx->inflight_list))
return;
+ io_cancel_defer_files(ctx, files);
/* cancel all at once, should be faster than doing it one by one*/
io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true);
@@ -8120,6 +8205,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
/* cancel this request, or head link requests */
io_attempt_cancel(ctx, cancel_req);
io_put_req(cancel_req);
+ /* cancellations _may_ trigger task work */
+ io_run_task_work();
schedule();
finish_wait(&ctx->inflight_wait, &wait);
}
@@ -8325,11 +8412,19 @@ static int io_uring_show_cred(int id, void *p, void *data)
static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
{
+ bool has_lock;
int i;
- mutex_lock(&ctx->uring_lock);
+ /*
+ * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
+ * since fdinfo case grabs it in the opposite direction of normal use
+ * cases. If we fail to get the lock, we just don't iterate any
+ * structures that could be going away outside the io_uring mutex.
+ */
+ has_lock = mutex_trylock(&ctx->uring_lock);
+
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
- for (i = 0; i < ctx->nr_user_files; i++) {
+ for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
struct fixed_file_table *table;
struct file *f;
@@ -8341,13 +8436,13 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
seq_printf(m, "%5u: <none>\n", i);
}
seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
- for (i = 0; i < ctx->nr_user_bufs; i++) {
+ for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
(unsigned int) buf->len);
}
- if (!idr_is_empty(&ctx->personality_idr)) {
+ if (has_lock && !idr_is_empty(&ctx->personality_idr)) {
seq_printf(m, "Personalities:\n");
idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
}
@@ -8362,7 +8457,8 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
req->task->task_works != NULL);
}
spin_unlock_irq(&ctx->completion_lock);
- mutex_unlock(&ctx->uring_lock);
+ if (has_lock)
+ mutex_unlock(&ctx->uring_lock);
}
static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
diff --git a/fs/namespace.c b/fs/namespace.c
index bae0e95b3713..294e05a13d17 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3072,10 +3072,10 @@ static void shrink_submounts(struct mount *mnt)
}
}
-void *copy_mount_options(const void __user * data)
+static void *copy_mount_options(const void __user * data)
{
char *copy;
- unsigned size;
+ unsigned left, offset;
if (!data)
return NULL;
@@ -3084,20 +3084,31 @@ void *copy_mount_options(const void __user * data)
if (!copy)
return ERR_PTR(-ENOMEM);
- size = PAGE_SIZE - offset_in_page(data);
+ left = copy_from_user(copy, data, PAGE_SIZE);
- if (copy_from_user(copy, data, size)) {
+ /*
+ * Not all architectures have an exact copy_from_user(). Resort to
+ * byte at a time.
+ */
+ offset = PAGE_SIZE - left;
+ while (left) {
+ char c;
+ if (get_user(c, (const char __user *)data + offset))
+ break;
+ copy[offset] = c;
+ left--;
+ offset++;
+ }
+
+ if (left == PAGE_SIZE) {
kfree(copy);
return ERR_PTR(-EFAULT);
}
- if (size != PAGE_SIZE) {
- if (copy_from_user(copy + size, data + size, PAGE_SIZE - size))
- memset(copy + size, 0, PAGE_SIZE - size);
- }
+
return copy;
}
-char *copy_mount_string(const void __user *data)
+static char *copy_mount_string(const void __user *data)
{
return data ? strndup_user(data, PATH_MAX) : NULL;
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e732580fe47b..cb52db9a0cfb 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -579,6 +579,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
do {
+ if (entry->label)
+ entry->label->len = NFS4_MAXLABELLEN;
+
status = xdr_decode(desc, entry, &stream);
if (status != 0) {
if (status == -EAGAIN)
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index ff8965d1a4d4..a163533446fa 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -715,7 +715,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
}
static void
-ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
+ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx)
{
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
@@ -724,7 +724,7 @@ ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
}
static void
-ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx)
+ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx)
{
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
@@ -734,14 +734,14 @@ ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, int idx)
static struct nfs4_pnfs_ds *
ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx, int *best_idx,
+ u32 start_idx, u32 *best_idx,
bool check_device)
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_pnfs_ds *ds;
bool fail_return = false;
- int idx;
+ u32 idx;
/* mirrors are initially sorted by efficiency */
for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
@@ -766,21 +766,21 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
static struct nfs4_pnfs_ds *
ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx, int *best_idx)
+ u32 start_idx, u32 *best_idx)
{
return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false);
}
static struct nfs4_pnfs_ds *
ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx, int *best_idx)
+ u32 start_idx, u32 *best_idx)
{
return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true);
}
static struct nfs4_pnfs_ds *
ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
- int start_idx, int *best_idx)
+ u32 start_idx, u32 *best_idx)
{
struct nfs4_pnfs_ds *ds;
@@ -791,7 +791,8 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
}
static struct nfs4_pnfs_ds *
-ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, int *best_idx)
+ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio,
+ u32 *best_idx)
{
struct pnfs_layout_segment *lseg = pgio->pg_lseg;
struct nfs4_pnfs_ds *ds;
@@ -837,7 +838,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_pgio_mirror *pgm;
struct nfs4_ff_layout_mirror *mirror;
struct nfs4_pnfs_ds *ds;
- int ds_idx;
+ u32 ds_idx, i;
retry:
ff_layout_pg_check_layout(pgio, req);
@@ -863,14 +864,14 @@ retry:
goto retry;
}
- mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
+ for (i = 0; i < pgio->pg_mirror_count; i++) {
+ mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
+ pgm = &pgio->pg_mirrors[i];
+ pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
+ }
pgio->pg_mirror_idx = ds_idx;
- /* read always uses only one mirror - idx 0 for pgio layer */
- pgm = &pgio->pg_mirrors[0];
- pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
-
if (NFS_SERVER(pgio->pg_inode)->flags &
(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
pgio->pg_maxretrans = io_maxretrans;
@@ -894,7 +895,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs4_ff_layout_mirror *mirror;
struct nfs_pgio_mirror *pgm;
struct nfs4_pnfs_ds *ds;
- int i;
+ u32 i;
retry:
ff_layout_pg_check_layout(pgio, req);
@@ -1038,7 +1039,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr)
{
u32 idx = hdr->pgio_mirror_idx + 1;
- int new_idx = 0;
+ u32 new_idx = 0;
if (ff_layout_choose_any_ds_for_read(hdr->lseg, idx + 1, &new_idx))
ff_layout_send_layouterror(hdr->lseg);
@@ -1075,7 +1076,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- int idx)
+ u32 idx)
{
struct pnfs_layout_hdr *lo = lseg->pls_layout;
struct inode *inode = lo->plh_inode;
@@ -1149,7 +1150,7 @@ reset:
/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
static int ff_layout_async_handle_error_v3(struct rpc_task *task,
struct pnfs_layout_segment *lseg,
- int idx)
+ u32 idx)
{
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
@@ -1184,7 +1185,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
- int idx)
+ u32 idx)
{
int vers = clp->cl_nfs_mod->rpc_vers->number;
@@ -1211,7 +1212,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
}
static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
- int idx, u64 offset, u64 length,
+ u32 idx, u64 offset, u64 length,
u32 *op_status, int opnum, int error)
{
struct nfs4_ff_layout_mirror *mirror;
@@ -1809,7 +1810,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
loff_t offset = hdr->args.offset;
int vers;
struct nfs_fh *fh;
- int idx = hdr->pgio_mirror_idx;
+ u32 idx = hdr->pgio_mirror_idx;
mirror = FF_LAYOUT_COMP(lseg, idx);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 524812984e2d..222afba70bc0 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -1039,6 +1039,65 @@ out_invalid_fh:
}
#if IS_ENABLED(CONFIG_NFS_V4)
+struct compat_nfs_string {
+ compat_uint_t len;
+ compat_uptr_t data;
+};
+
+static inline void compat_nfs_string(struct nfs_string *dst,
+ struct compat_nfs_string *src)
+{
+ dst->data = compat_ptr(src->data);
+ dst->len = src->len;
+}
+
+struct compat_nfs4_mount_data_v1 {
+ compat_int_t version;
+ compat_int_t flags;
+ compat_int_t rsize;
+ compat_int_t wsize;
+ compat_int_t timeo;
+ compat_int_t retrans;
+ compat_int_t acregmin;
+ compat_int_t acregmax;
+ compat_int_t acdirmin;
+ compat_int_t acdirmax;
+ struct compat_nfs_string client_addr;
+ struct compat_nfs_string mnt_path;
+ struct compat_nfs_string hostname;
+ compat_uint_t host_addrlen;
+ compat_uptr_t host_addr;
+ compat_int_t proto;
+ compat_int_t auth_flavourlen;
+ compat_uptr_t auth_flavours;
+};
+
+static void nfs4_compat_mount_data_conv(struct nfs4_mount_data *data)
+{
+ struct compat_nfs4_mount_data_v1 *compat =
+ (struct compat_nfs4_mount_data_v1 *)data;
+
+ /* copy the fields backwards */
+ data->auth_flavours = compat_ptr(compat->auth_flavours);
+ data->auth_flavourlen = compat->auth_flavourlen;
+ data->proto = compat->proto;
+ data->host_addr = compat_ptr(compat->host_addr);
+ data->host_addrlen = compat->host_addrlen;
+ compat_nfs_string(&data->hostname, &compat->hostname);
+ compat_nfs_string(&data->mnt_path, &compat->mnt_path);
+ compat_nfs_string(&data->client_addr, &compat->client_addr);
+ data->acdirmax = compat->acdirmax;
+ data->acdirmin = compat->acdirmin;
+ data->acregmax = compat->acregmax;
+ data->acregmin = compat->acregmin;
+ data->retrans = compat->retrans;
+ data->timeo = compat->timeo;
+ data->wsize = compat->wsize;
+ data->rsize = compat->rsize;
+ data->flags = compat->flags;
+ data->version = compat->version;
+}
+
/*
* Validate NFSv4 mount options
*/
@@ -1049,89 +1108,83 @@ static int nfs4_parse_monolithic(struct fs_context *fc,
struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address;
char *c;
- if (data == NULL)
- goto out_no_data;
+ if (!data) {
+ if (is_remount_fc(fc))
+ goto done;
+ return nfs_invalf(fc,
+ "NFS4: mount program didn't pass any mount data");
+ }
ctx->version = 4;
- switch (data->version) {
- case 1:
- if (data->host_addrlen > sizeof(ctx->nfs_server.address))
- goto out_no_address;
- if (data->host_addrlen == 0)
- goto out_no_address;
- ctx->nfs_server.addrlen = data->host_addrlen;
- if (copy_from_user(sap, data->host_addr, data->host_addrlen))
- return -EFAULT;
- if (!nfs_verify_server_address(sap))
- goto out_no_address;
- ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
-
- if (data->auth_flavourlen) {
- rpc_authflavor_t pseudoflavor;
- if (data->auth_flavourlen > 1)
- goto out_inval_auth;
- if (copy_from_user(&pseudoflavor,
- data->auth_flavours,
- sizeof(pseudoflavor)))
- return -EFAULT;
- ctx->selected_flavor = pseudoflavor;
- } else
- ctx->selected_flavor = RPC_AUTH_UNIX;
-
- c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
- if (IS_ERR(c))
- return PTR_ERR(c);
- ctx->nfs_server.hostname = c;
+ if (data->version != 1)
+ return generic_parse_monolithic(fc, data);
- c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN);
- if (IS_ERR(c))
- return PTR_ERR(c);
- ctx->nfs_server.export_path = c;
- dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c);
+ if (in_compat_syscall())
+ nfs4_compat_mount_data_conv(data);
- c = strndup_user(data->client_addr.data, 16);
- if (IS_ERR(c))
- return PTR_ERR(c);
- ctx->client_address = c;
-
- /*
- * Translate to nfs_fs_context, which nfs_fill_super
- * can deal with.
- */
+ if (data->host_addrlen > sizeof(ctx->nfs_server.address))
+ goto out_no_address;
+ if (data->host_addrlen == 0)
+ goto out_no_address;
+ ctx->nfs_server.addrlen = data->host_addrlen;
+ if (copy_from_user(sap, data->host_addr, data->host_addrlen))
+ return -EFAULT;
+ if (!nfs_verify_server_address(sap))
+ goto out_no_address;
+ ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port);
- ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK;
- ctx->rsize = data->rsize;
- ctx->wsize = data->wsize;
- ctx->timeo = data->timeo;
- ctx->retrans = data->retrans;
- ctx->acregmin = data->acregmin;
- ctx->acregmax = data->acregmax;
- ctx->acdirmin = data->acdirmin;
- ctx->acdirmax = data->acdirmax;
- ctx->nfs_server.protocol = data->proto;
- nfs_validate_transport_protocol(ctx);
- if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
- goto out_invalid_transport_udp;
+ if (data->auth_flavourlen) {
+ rpc_authflavor_t pseudoflavor;
- break;
- default:
- goto generic;
+ if (data->auth_flavourlen > 1)
+ goto out_inval_auth;
+ if (copy_from_user(&pseudoflavor, data->auth_flavours,
+ sizeof(pseudoflavor)))
+ return -EFAULT;
+ ctx->selected_flavor = pseudoflavor;
+ } else {
+ ctx->selected_flavor = RPC_AUTH_UNIX;
}
+ c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->nfs_server.hostname = c;
+
+ c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->nfs_server.export_path = c;
+ dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c);
+
+ c = strndup_user(data->client_addr.data, 16);
+ if (IS_ERR(c))
+ return PTR_ERR(c);
+ ctx->client_address = c;
+
+ /*
+ * Translate to nfs_fs_context, which nfs_fill_super
+ * can deal with.
+ */
+
+ ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK;
+ ctx->rsize = data->rsize;
+ ctx->wsize = data->wsize;
+ ctx->timeo = data->timeo;
+ ctx->retrans = data->retrans;
+ ctx->acregmin = data->acregmin;
+ ctx->acregmax = data->acregmax;
+ ctx->acdirmin = data->acdirmin;
+ ctx->acdirmax = data->acdirmax;
+ ctx->nfs_server.protocol = data->proto;
+ nfs_validate_transport_protocol(ctx);
+ if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP)
+ goto out_invalid_transport_udp;
+done:
ctx->skip_reconfig_option_check = true;
return 0;
-generic:
- return generic_parse_monolithic(fc, data);
-
-out_no_data:
- if (is_remount_fc(fc)) {
- ctx->skip_reconfig_option_check = true;
- return 0;
- }
- return nfs_invalf(fc, "NFS4: mount program didn't pass any mount data");
-
out_inval_auth:
return nfs_invalf(fc, "NFS4: Invalid number of RPC auth flavours %d",
data->auth_flavourlen);
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 142225f0af59..2b2211d1234e 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -356,7 +356,15 @@ static ssize_t _nfs42_proc_copy(struct file *src,
truncate_pagecache_range(dst_inode, pos_dst,
pos_dst + res->write_res.count);
-
+ spin_lock(&dst_inode->i_lock);
+ NFS_I(dst_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE |
+ NFS_INO_REVAL_FORCED | NFS_INO_INVALID_SIZE |
+ NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA);
+ spin_unlock(&dst_inode->i_lock);
+ spin_lock(&src_inode->i_lock);
+ NFS_I(src_inode)->cache_validity |= (NFS_INO_REVAL_PAGECACHE |
+ NFS_INO_REVAL_FORCED | NFS_INO_INVALID_ATIME);
+ spin_unlock(&src_inode->i_lock);
status = res->write_res.count;
out:
if (args->sync)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f8946b9468ef..6e95c85fe395 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3293,8 +3293,10 @@ static int _nfs4_do_setattr(struct inode *inode,
/* Servers should only apply open mode checks for file size changes */
truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false;
- if (!truncate)
+ if (!truncate) {
+ nfs4_inode_make_writeable(inode);
goto zero_stateid;
+ }
if (nfs4_copy_delegation_stateid(inode, FMODE_WRITE, &arg->stateid, &delegation_cred)) {
/* Use that stateid */
@@ -7298,7 +7300,12 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state,
err = nfs4_set_lock_state(state, fl);
if (err != 0)
return err;
- err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
+ do {
+ err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
+ if (err != -NFS4ERR_DELAY)
+ break;
+ ssleep(1);
+ } while (err == -NFS4ERR_DELAY);
return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err);
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 60dbee457143..0ac197658a2d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -106,25 +106,6 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
}
}
-/* Drop the inode semaphore and wait for a pipe event, atomically */
-void pipe_wait(struct pipe_inode_info *pipe)
-{
- DEFINE_WAIT(rdwait);
- DEFINE_WAIT(wrwait);
-
- /*
- * Pipes are system-local resources, so sleeping on them
- * is considered a noninteractive wait:
- */
- prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
- prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
- pipe_unlock(pipe);
- schedule();
- finish_wait(&pipe->rd_wait, &rdwait);
- finish_wait(&pipe->wr_wait, &wrwait);
- pipe_lock(pipe);
-}
-
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
@@ -913,19 +894,18 @@ int create_pipe_files(struct file **res, int flags)
{
struct inode *inode = get_pipe_inode();
struct file *f;
+ int error;
if (!inode)
return -ENFILE;
if (flags & O_NOTIFICATION_PIPE) {
-#ifdef CONFIG_WATCH_QUEUE
- if (watch_queue_init(inode->i_pipe) < 0) {
+ error = watch_queue_init(inode->i_pipe);
+ if (error) {
+ free_pipe_info(inode->i_pipe);
iput(inode);
- return -ENOMEM;
+ return error;
}
-#else
- return -ENOPKG;
-#endif
}
f = alloc_file_pseudo(inode, pipe_mnt, "",
@@ -1035,12 +1015,52 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
return do_pipe2(fildes, 0);
}
+/*
+ * This is the stupid "wait for pipe to be readable or writable"
+ * model.
+ *
+ * See pipe_read/write() for the proper kind of exclusive wait,
+ * but that requires that we wake up any other readers/writers
+ * if we then do not end up reading everything (ie the whole
+ * "wake_next_reader/writer" logic in pipe_read/write()).
+ */
+void pipe_wait_readable(struct pipe_inode_info *pipe)
+{
+ pipe_unlock(pipe);
+ wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
+ pipe_lock(pipe);
+}
+
+void pipe_wait_writable(struct pipe_inode_info *pipe)
+{
+ pipe_unlock(pipe);
+ wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
+ pipe_lock(pipe);
+}
+
+/*
+ * This depends on both the wait (here) and the wakeup (wake_up_partner)
+ * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
+ * race with the count check and waitqueue prep.
+ *
+ * Normally in order to avoid races, you'd do the prepare_to_wait() first,
+ * then check the condition you're waiting for, and only then sleep. But
+ * because of the pipe lock, we can check the condition before being on
+ * the wait queue.
+ *
+ * We use the 'rd_wait' waitqueue for pipe partner waiting.
+ */
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
+ DEFINE_WAIT(rdwait);
int cur = *cnt;
while (cur == *cnt) {
- pipe_wait(pipe);
+ prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
+ pipe_unlock(pipe);
+ schedule();
+ finish_wait(&pipe->rd_wait, &rdwait);
+ pipe_lock(pipe);
if (signal_pending(current))
break;
}
@@ -1050,7 +1070,6 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
static void wake_up_partner(struct pipe_inode_info *pipe)
{
wake_up_interruptible_all(&pipe->rd_wait);
- wake_up_interruptible_all(&pipe->wr_wait);
}
static int fifo_open(struct inode *inode, struct file *filp)
diff --git a/fs/proc/page.c b/fs/proc/page.c
index f909243d4a66..9f1077d94cde 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -217,6 +217,9 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
+#ifdef CONFIG_64BIT
+ u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+#endif
return u;
};
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 5066b0251ed8..35172a91148e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -653,6 +653,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MERGEABLE)] = "mg",
[ilog2(VM_UFFD_MISSING)]= "um",
[ilog2(VM_UFFD_WP)] = "uw",
+#ifdef CONFIG_ARM64_MTE
+ [ilog2(VM_MTE)] = "mt",
+ [ilog2(VM_MTE_ALLOWED)] = "",
+#endif
#ifdef CONFIG_ARCH_HAS_PKEYS
/* These come out via ProtectionKey: */
[ilog2(VM_PKEY_BIT0)] = "",
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index d1ceb76adb71..b59cd172b5f9 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -70,8 +70,3 @@ config QFMT_V2
config QUOTACTL
bool
default n
-
-config QUOTACTL_COMPAT
- bool
- depends on QUOTACTL && COMPAT_FOR_U64_ALIGNMENT
- default y
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
index f2b49d0f0287..9160639daffa 100644
--- a/fs/quota/Makefile
+++ b/fs/quota/Makefile
@@ -4,5 +4,4 @@ obj-$(CONFIG_QFMT_V1) += quota_v1.o
obj-$(CONFIG_QFMT_V2) += quota_v2.o
obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
obj-$(CONFIG_QUOTACTL) += quota.o kqid.o
-obj-$(CONFIG_QUOTACTL_COMPAT) += compat.o
obj-$(CONFIG_QUOTA_NETLINK_INTERFACE) += netlink.o
diff --git a/fs/quota/compat.c b/fs/quota/compat.c
deleted file mode 100644
index c30572857619..000000000000
--- a/fs/quota/compat.c
+++ /dev/null
@@ -1,120 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/syscalls.h>
-#include <linux/compat.h>
-#include <linux/quotaops.h>
-
-/*
- * This code works only for 32 bit quota tools over 64 bit OS (x86_64, ia64)
- * and is necessary due to alignment problems.
- */
-struct compat_if_dqblk {
- compat_u64 dqb_bhardlimit;
- compat_u64 dqb_bsoftlimit;
- compat_u64 dqb_curspace;
- compat_u64 dqb_ihardlimit;
- compat_u64 dqb_isoftlimit;
- compat_u64 dqb_curinodes;
- compat_u64 dqb_btime;
- compat_u64 dqb_itime;
- compat_uint_t dqb_valid;
-};
-
-/* XFS structures */
-struct compat_fs_qfilestat {
- compat_u64 dqb_bhardlimit;
- compat_u64 qfs_nblks;
- compat_uint_t qfs_nextents;
-};
-
-struct compat_fs_quota_stat {
- __s8 qs_version;
- __u16 qs_flags;
- __s8 qs_pad;
- struct compat_fs_qfilestat qs_uquota;
- struct compat_fs_qfilestat qs_gquota;
- compat_uint_t qs_incoredqs;
- compat_int_t qs_btimelimit;
- compat_int_t qs_itimelimit;
- compat_int_t qs_rtbtimelimit;
- __u16 qs_bwarnlimit;
- __u16 qs_iwarnlimit;
-};
-
-COMPAT_SYSCALL_DEFINE4(quotactl32, unsigned int, cmd,
- const char __user *, special, qid_t, id,
- void __user *, addr)
-{
- unsigned int cmds;
- struct if_dqblk __user *dqblk;
- struct compat_if_dqblk __user *compat_dqblk;
- struct fs_quota_stat __user *fsqstat;
- struct compat_fs_quota_stat __user *compat_fsqstat;
- compat_uint_t data;
- u16 xdata;
- long ret;
-
- cmds = cmd >> SUBCMDSHIFT;
-
- switch (cmds) {
- case Q_GETQUOTA:
- dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
- compat_dqblk = addr;
- ret = kernel_quotactl(cmd, special, id, dqblk);
- if (ret)
- break;
- if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) ||
- get_user(data, &dqblk->dqb_valid) ||
- put_user(data, &compat_dqblk->dqb_valid))
- ret = -EFAULT;
- break;
- case Q_SETQUOTA:
- dqblk = compat_alloc_user_space(sizeof(struct if_dqblk));
- compat_dqblk = addr;
- ret = -EFAULT;
- if (copy_in_user(dqblk, compat_dqblk, sizeof(*compat_dqblk)) ||
- get_user(data, &compat_dqblk->dqb_valid) ||
- put_user(data, &dqblk->dqb_valid))
- break;
- ret = kernel_quotactl(cmd, special, id, dqblk);
- break;
- case Q_XGETQSTAT:
- fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat));
- compat_fsqstat = addr;
- ret = kernel_quotactl(cmd, special, id, fsqstat);
- if (ret)
- break;
- ret = -EFAULT;
- /* Copying qs_version, qs_flags, qs_pad */
- if (copy_in_user(compat_fsqstat, fsqstat,
- offsetof(struct compat_fs_quota_stat, qs_uquota)))
- break;
- /* Copying qs_uquota */
- if (copy_in_user(&compat_fsqstat->qs_uquota,
- &fsqstat->qs_uquota,
- sizeof(compat_fsqstat->qs_uquota)) ||
- get_user(data, &fsqstat->qs_uquota.qfs_nextents) ||
- put_user(data, &compat_fsqstat->qs_uquota.qfs_nextents))
- break;
- /* Copying qs_gquota */
- if (copy_in_user(&compat_fsqstat->qs_gquota,
- &fsqstat->qs_gquota,
- sizeof(compat_fsqstat->qs_gquota)) ||
- get_user(data, &fsqstat->qs_gquota.qfs_nextents) ||
- put_user(data, &compat_fsqstat->qs_gquota.qfs_nextents))
- break;
- /* Copying the rest */
- if (copy_in_user(&compat_fsqstat->qs_incoredqs,
- &fsqstat->qs_incoredqs,
- sizeof(struct compat_fs_quota_stat) -
- offsetof(struct compat_fs_quota_stat, qs_incoredqs)) ||
- get_user(xdata, &fsqstat->qs_iwarnlimit) ||
- put_user(xdata, &compat_fsqstat->qs_iwarnlimit))
- break;
- ret = 0;
- break;
- default:
- ret = kernel_quotactl(cmd, special, id, addr);
- }
- return ret;
-}
diff --git a/fs/quota/compat.h b/fs/quota/compat.h
new file mode 100644
index 000000000000..ef7d1e12d650
--- /dev/null
+++ b/fs/quota/compat.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/compat.h>
+
+struct compat_if_dqblk {
+ compat_u64 dqb_bhardlimit;
+ compat_u64 dqb_bsoftlimit;
+ compat_u64 dqb_curspace;
+ compat_u64 dqb_ihardlimit;
+ compat_u64 dqb_isoftlimit;
+ compat_u64 dqb_curinodes;
+ compat_u64 dqb_btime;
+ compat_u64 dqb_itime;
+ compat_uint_t dqb_valid;
+};
+
+struct compat_fs_qfilestat {
+ compat_u64 dqb_bhardlimit;
+ compat_u64 qfs_nblks;
+ compat_uint_t qfs_nextents;
+};
+
+struct compat_fs_quota_stat {
+ __s8 qs_version;
+ __u16 qs_flags;
+ __s8 qs_pad;
+ struct compat_fs_qfilestat qs_uquota;
+ struct compat_fs_qfilestat qs_gquota;
+ compat_uint_t qs_incoredqs;
+ compat_int_t qs_btimelimit;
+ compat_int_t qs_itimelimit;
+ compat_int_t qs_rtbtimelimit;
+ __u16 qs_bwarnlimit;
+ __u16 qs_iwarnlimit;
+};
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 47f9e151988b..6b37d58f1067 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -19,6 +19,7 @@
#include <linux/types.h>
#include <linux/writeback.h>
#include <linux/nospec.h>
+#include "compat.h"
static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
qid_t id)
@@ -211,8 +212,18 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
if (ret)
return ret;
copy_to_if_dqblk(&idq, &fdq);
- if (copy_to_user(addr, &idq, sizeof(idq)))
- return -EFAULT;
+
+ if (compat_need_64bit_alignment_fixup()) {
+ struct compat_if_dqblk __user *compat_dqblk = addr;
+
+ if (copy_to_user(compat_dqblk, &idq, sizeof(*compat_dqblk)))
+ return -EFAULT;
+ if (put_user(idq.dqb_valid, &compat_dqblk->dqb_valid))
+ return -EFAULT;
+ } else {
+ if (copy_to_user(addr, &idq, sizeof(idq)))
+ return -EFAULT;
+ }
return 0;
}
@@ -277,8 +288,16 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id,
struct if_dqblk idq;
struct kqid qid;
- if (copy_from_user(&idq, addr, sizeof(idq)))
- return -EFAULT;
+ if (compat_need_64bit_alignment_fixup()) {
+ struct compat_if_dqblk __user *compat_dqblk = addr;
+
+ if (copy_from_user(&idq, compat_dqblk, sizeof(*compat_dqblk)) ||
+ get_user(idq.dqb_valid, &compat_dqblk->dqb_valid))
+ return -EFAULT;
+ } else {
+ if (copy_from_user(&idq, addr, sizeof(idq)))
+ return -EFAULT;
+ }
if (!sb->s_qcop->set_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
@@ -382,6 +401,33 @@ static int quota_getstate(struct super_block *sb, int type,
return 0;
}
+static int compat_copy_fs_qfilestat(struct compat_fs_qfilestat __user *to,
+ struct fs_qfilestat *from)
+{
+ if (copy_to_user(to, from, sizeof(*to)) ||
+ put_user(from->qfs_nextents, &to->qfs_nextents))
+ return -EFAULT;
+ return 0;
+}
+
+static int compat_copy_fs_quota_stat(struct compat_fs_quota_stat __user *to,
+ struct fs_quota_stat *from)
+{
+ if (put_user(from->qs_version, &to->qs_version) ||
+ put_user(from->qs_flags, &to->qs_flags) ||
+ put_user(from->qs_pad, &to->qs_pad) ||
+ compat_copy_fs_qfilestat(&to->qs_uquota, &from->qs_uquota) ||
+ compat_copy_fs_qfilestat(&to->qs_gquota, &from->qs_gquota) ||
+ put_user(from->qs_incoredqs, &to->qs_incoredqs) ||
+ put_user(from->qs_btimelimit, &to->qs_btimelimit) ||
+ put_user(from->qs_itimelimit, &to->qs_itimelimit) ||
+ put_user(from->qs_rtbtimelimit, &to->qs_rtbtimelimit) ||
+ put_user(from->qs_bwarnlimit, &to->qs_bwarnlimit) ||
+ put_user(from->qs_iwarnlimit, &to->qs_iwarnlimit))
+ return -EFAULT;
+ return 0;
+}
+
static int quota_getxstate(struct super_block *sb, int type, void __user *addr)
{
struct fs_quota_stat fqs;
@@ -390,9 +436,14 @@ static int quota_getxstate(struct super_block *sb, int type, void __user *addr)
if (!sb->s_qcop->get_state)
return -ENOSYS;
ret = quota_getstate(sb, type, &fqs);
- if (!ret && copy_to_user(addr, &fqs, sizeof(fqs)))
+ if (ret)
+ return ret;
+
+ if (compat_need_64bit_alignment_fixup())
+ return compat_copy_fs_quota_stat(addr, &fqs);
+ if (copy_to_user(addr, &fqs, sizeof(fqs)))
return -EFAULT;
- return ret;
+ return 0;
}
static int quota_getstatev(struct super_block *sb, int type,
@@ -816,8 +867,8 @@ static struct super_block *quotactl_block(const char __user *special, int cmd)
* calls. Maybe we need to add the process quotas etc. in the future,
* but we probably should use rlimits for that.
*/
-int kernel_quotactl(unsigned int cmd, const char __user *special,
- qid_t id, void __user *addr)
+SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
+ qid_t, id, void __user *, addr)
{
uint cmds, type;
struct super_block *sb = NULL;
@@ -871,9 +922,3 @@ out:
path_put(pathp);
return ret;
}
-
-SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
- qid_t, id, void __user *, addr)
-{
- return kernel_quotactl(cmd, special, id, addr);
-}
diff --git a/fs/read_write.c b/fs/read_write.c
index 5db58b8c78d0..19f5c4bf75aa 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -538,6 +538,14 @@ ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t
inc_syscw(current);
return ret;
}
+/*
+ * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
+ * but autofs is one of the few internal kernel users that actually
+ * wants this _and_ can be built as a module. So we need to export
+ * this symbol for autofs, even though it really isn't appropriate
+ * for any other kernel modules.
+ */
+EXPORT_SYMBOL_GPL(__kernel_write);
ssize_t kernel_write(struct file *file, const void *buf, size_t count,
loff_t *pos)
@@ -752,185 +760,6 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
return ret;
}
-/**
- * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
- * into the kernel and check that it is valid.
- *
- * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
- * @uvector: Pointer to the userspace array.
- * @nr_segs: Number of elements in userspace array.
- * @fast_segs: Number of elements in @fast_pointer.
- * @fast_pointer: Pointer to (usually small on-stack) kernel array.
- * @ret_pointer: (output parameter) Pointer to a variable that will point to
- * either @fast_pointer, a newly allocated kernel array, or NULL,
- * depending on which array was used.
- *
- * This function copies an array of &struct iovec of @nr_segs from
- * userspace into the kernel and checks that each element is valid (e.g.
- * it does not point to a kernel address or cause overflow by being too
- * large, etc.).
- *
- * As an optimization, the caller may provide a pointer to a small
- * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
- * (the size of this array, or 0 if unused, should be given in @fast_segs).
- *
- * @ret_pointer will always point to the array that was used, so the
- * caller must take care not to call kfree() on it e.g. in case the
- * @fast_pointer array was used and it was allocated on the stack.
- *
- * Return: The total number of bytes covered by the iovec array on success
- * or a negative error code on error.
- */
-ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
- unsigned long nr_segs, unsigned long fast_segs,
- struct iovec *fast_pointer,
- struct iovec **ret_pointer)
-{
- unsigned long seg;
- ssize_t ret;
- struct iovec *iov = fast_pointer;
-
- /*
- * SuS says "The readv() function *may* fail if the iovcnt argument
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
- * traditionally returned zero for zero segments, so...
- */
- if (nr_segs == 0) {
- ret = 0;
- goto out;
- }
-
- /*
- * First get the "struct iovec" from user memory and
- * verify all the pointers
- */
- if (nr_segs > UIO_MAXIOV) {
- ret = -EINVAL;
- goto out;
- }
- if (nr_segs > fast_segs) {
- iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
- if (iov == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- }
- if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
- ret = -EFAULT;
- goto out;
- }
-
- /*
- * According to the Single Unix Specification we should return EINVAL
- * if an element length is < 0 when cast to ssize_t or if the
- * total length would overflow the ssize_t return value of the
- * system call.
- *
- * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
- * overflow case.
- */
- ret = 0;
- for (seg = 0; seg < nr_segs; seg++) {
- void __user *buf = iov[seg].iov_base;
- ssize_t len = (ssize_t)iov[seg].iov_len;
-
- /* see if we we're about to use an invalid len or if
- * it's about to overflow ssize_t */
- if (len < 0) {
- ret = -EINVAL;
- goto out;
- }
- if (type >= 0
- && unlikely(!access_ok(buf, len))) {
- ret = -EFAULT;
- goto out;
- }
- if (len > MAX_RW_COUNT - ret) {
- len = MAX_RW_COUNT - ret;
- iov[seg].iov_len = len;
- }
- ret += len;
- }
-out:
- *ret_pointer = iov;
- return ret;
-}
-
-#ifdef CONFIG_COMPAT
-ssize_t compat_rw_copy_check_uvector(int type,
- const struct compat_iovec __user *uvector, unsigned long nr_segs,
- unsigned long fast_segs, struct iovec *fast_pointer,
- struct iovec **ret_pointer)
-{
- compat_ssize_t tot_len;
- struct iovec *iov = *ret_pointer = fast_pointer;
- ssize_t ret = 0;
- int seg;
-
- /*
- * SuS says "The readv() function *may* fail if the iovcnt argument
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
- * traditionally returned zero for zero segments, so...
- */
- if (nr_segs == 0)
- goto out;
-
- ret = -EINVAL;
- if (nr_segs > UIO_MAXIOV)
- goto out;
- if (nr_segs > fast_segs) {
- ret = -ENOMEM;
- iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
- if (iov == NULL)
- goto out;
- }
- *ret_pointer = iov;
-
- ret = -EFAULT;
- if (!access_ok(uvector, nr_segs*sizeof(*uvector)))
- goto out;
-
- /*
- * Single unix specification:
- * We should -EINVAL if an element length is not >= 0 and fitting an
- * ssize_t.
- *
- * In Linux, the total length is limited to MAX_RW_COUNT, there is
- * no overflow possibility.
- */
- tot_len = 0;
- ret = -EINVAL;
- for (seg = 0; seg < nr_segs; seg++) {
- compat_uptr_t buf;
- compat_ssize_t len;
-
- if (__get_user(len, &uvector->iov_len) ||
- __get_user(buf, &uvector->iov_base)) {
- ret = -EFAULT;
- goto out;
- }
- if (len < 0) /* size_t not fitting in compat_ssize_t .. */
- goto out;
- if (type >= 0 &&
- !access_ok(compat_ptr(buf), len)) {
- ret = -EFAULT;
- goto out;
- }
- if (len > MAX_RW_COUNT - tot_len)
- len = MAX_RW_COUNT - tot_len;
- tot_len += len;
- iov->iov_base = compat_ptr(buf);
- iov->iov_len = (compat_size_t) len;
- uvector++;
- iov++;
- }
- ret = tot_len;
-
-out:
- return ret;
-}
-#endif
-
static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
loff_t *pos, rwf_t flags)
{
@@ -1247,224 +1076,93 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
return do_pwritev(fd, vec, vlen, pos, flags);
}
+/*
+ * Various compat syscalls. Note that they all pretend to take a native
+ * iovec - import_iovec will properly treat those as compat_iovecs based on
+ * in_compat_syscall().
+ */
#ifdef CONFIG_COMPAT
-static size_t compat_readv(struct file *file,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos, rwf_t flags)
-{
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
- ssize_t ret;
-
- ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter);
- if (ret >= 0) {
- ret = do_iter_read(file, &iter, pos, flags);
- kfree(iov);
- }
- if (ret > 0)
- add_rchar(current, ret);
- inc_syscr(current);
- return ret;
-}
-
-static size_t do_compat_readv(compat_ulong_t fd,
- const struct compat_iovec __user *vec,
- compat_ulong_t vlen, rwf_t flags)
-{
- struct fd f = fdget_pos(fd);
- ssize_t ret;
- loff_t pos;
-
- if (!f.file)
- return -EBADF;
- pos = f.file->f_pos;
- ret = compat_readv(f.file, vec, vlen, &pos, flags);
- if (ret >= 0)
- f.file->f_pos = pos;
- fdput_pos(f);
- return ret;
-
-}
-
-COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
- compat_ulong_t, vlen)
-{
- return do_compat_readv(fd, vec, vlen, 0);
-}
-
-static long do_compat_preadv64(unsigned long fd,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos, rwf_t flags)
-{
- struct fd f;
- ssize_t ret;
-
- if (pos < 0)
- return -EINVAL;
- f = fdget(fd);
- if (!f.file)
- return -EBADF;
- ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PREAD)
- ret = compat_readv(f.file, vec, vlen, &pos, flags);
- fdput(f);
- return ret;
-}
-
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos)
{
- return do_compat_preadv64(fd, vec, vlen, pos, 0);
+ return do_preadv(fd, vec, vlen, pos, 0);
}
#endif
COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return do_compat_preadv64(fd, vec, vlen, pos, 0);
+ return do_preadv(fd, vec, vlen, pos, 0);
}
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
if (pos == -1)
- return do_compat_readv(fd, vec, vlen, flags);
-
- return do_compat_preadv64(fd, vec, vlen, pos, flags);
+ return do_readv(fd, vec, vlen, flags);
+ return do_preadv(fd, vec, vlen, pos, flags);
}
#endif
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
rwf_t, flags)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
if (pos == -1)
- return do_compat_readv(fd, vec, vlen, flags);
-
- return do_compat_preadv64(fd, vec, vlen, pos, flags);
-}
-
-static size_t compat_writev(struct file *file,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t *pos, rwf_t flags)
-{
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
- ssize_t ret;
-
- ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter);
- if (ret >= 0) {
- file_start_write(file);
- ret = do_iter_write(file, &iter, pos, flags);
- file_end_write(file);
- kfree(iov);
- }
- if (ret > 0)
- add_wchar(current, ret);
- inc_syscw(current);
- return ret;
-}
-
-static size_t do_compat_writev(compat_ulong_t fd,
- const struct compat_iovec __user* vec,
- compat_ulong_t vlen, rwf_t flags)
-{
- struct fd f = fdget_pos(fd);
- ssize_t ret;
- loff_t pos;
-
- if (!f.file)
- return -EBADF;
- pos = f.file->f_pos;
- ret = compat_writev(f.file, vec, vlen, &pos, flags);
- if (ret >= 0)
- f.file->f_pos = pos;
- fdput_pos(f);
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
- const struct compat_iovec __user *, vec,
- compat_ulong_t, vlen)
-{
- return do_compat_writev(fd, vec, vlen, 0);
-}
-
-static long do_compat_pwritev64(unsigned long fd,
- const struct compat_iovec __user *vec,
- unsigned long vlen, loff_t pos, rwf_t flags)
-{
- struct fd f;
- ssize_t ret;
-
- if (pos < 0)
- return -EINVAL;
- f = fdget(fd);
- if (!f.file)
- return -EBADF;
- ret = -ESPIPE;
- if (f.file->f_mode & FMODE_PWRITE)
- ret = compat_writev(f.file, vec, vlen, &pos, flags);
- fdput(f);
- return ret;
+ return do_readv(fd, vec, vlen, flags);
+ return do_preadv(fd, vec, vlen, pos, flags);
}
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos)
{
- return do_compat_pwritev64(fd, vec, vlen, pos, 0);
+ return do_pwritev(fd, vec, vlen, pos, 0);
}
#endif
COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *,vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return do_compat_pwritev64(fd, vec, vlen, pos, 0);
+ return do_pwritev(fd, vec, vlen, pos, 0);
}
#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *, vec,
unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
if (pos == -1)
- return do_compat_writev(fd, vec, vlen, flags);
-
- return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+ return do_writev(fd, vec, vlen, flags);
+ return do_pwritev(fd, vec, vlen, pos, flags);
}
#endif
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
- const struct compat_iovec __user *,vec,
+ const struct iovec __user *,vec,
compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
if (pos == -1)
- return do_compat_writev(fd, vec, vlen, flags);
-
- return do_compat_pwritev64(fd, vec, vlen, pos, flags);
+ return do_writev(fd, vec, vlen, flags);
+ return do_pwritev(fd, vec, vlen, pos, flags);
}
-
-#endif
+#endif /* CONFIG_COMPAT */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
size_t count, loff_t max)
diff --git a/fs/splice.c b/fs/splice.c
index d7c8a7c4db07..70cc52af780b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -33,7 +33,6 @@
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/socket.h>
-#include <linux/compat.h>
#include <linux/sched/signal.h>
#include "internal.h"
@@ -526,6 +525,22 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
return 1;
}
+/* We know we have a pipe buffer, but maybe it's empty? */
+static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
+{
+ unsigned int tail = pipe->tail;
+ unsigned int mask = pipe->ring_size - 1;
+ struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+
+ if (unlikely(!buf->len)) {
+ pipe_buf_release(pipe, buf);
+ pipe->tail = tail+1;
+ return true;
+ }
+
+ return false;
+}
+
/**
* splice_from_pipe_next - wait for some data to splice from
* @pipe: pipe to splice from
@@ -545,6 +560,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des
if (signal_pending(current))
return -ERESTARTSYS;
+repeat:
while (pipe_empty(pipe->head, pipe->tail)) {
if (!pipe->writers)
return 0;
@@ -563,9 +579,12 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des
sd->need_wakeup = false;
}
- pipe_wait(pipe);
+ pipe_wait_readable(pipe);
}
+ if (eat_empty_buffer(pipe))
+ goto repeat;
+
return 1;
}
@@ -1077,7 +1096,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
return -EAGAIN;
if (signal_pending(current))
return -ERESTARTSYS;
- pipe_wait(pipe);
+ pipe_wait_writable(pipe);
}
}
@@ -1332,20 +1351,6 @@ static int vmsplice_type(struct fd f, int *type)
* Currently we punt and implement it as a normal copy, see pipe_to_user().
*
*/
-static long do_vmsplice(struct file *f, struct iov_iter *iter, unsigned int flags)
-{
- if (unlikely(flags & ~SPLICE_F_ALL))
- return -EINVAL;
-
- if (!iov_iter_count(iter))
- return 0;
-
- if (iov_iter_rw(iter) == WRITE)
- return vmsplice_to_pipe(f, iter, flags);
- else
- return vmsplice_to_user(f, iter, flags);
-}
-
SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
unsigned long, nr_segs, unsigned int, flags)
{
@@ -1356,6 +1361,9 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
struct fd f;
int type;
+ if (unlikely(flags & ~SPLICE_F_ALL))
+ return -EINVAL;
+
f = fdget(fd);
error = vmsplice_type(f, &type);
if (error)
@@ -1363,40 +1371,21 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
error = import_iovec(type, uiov, nr_segs,
ARRAY_SIZE(iovstack), &iov, &iter);
- if (error >= 0) {
- error = do_vmsplice(f.file, &iter, flags);
- kfree(iov);
- }
- fdput(f);
- return error;
-}
-
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
- unsigned int, nr_segs, unsigned int, flags)
-{
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov = iovstack;
- struct iov_iter iter;
- ssize_t error;
- struct fd f;
- int type;
+ if (error < 0)
+ goto out_fdput;
- f = fdget(fd);
- error = vmsplice_type(f, &type);
- if (error)
- return error;
+ if (!iov_iter_count(&iter))
+ error = 0;
+ else if (iov_iter_rw(&iter) == WRITE)
+ error = vmsplice_to_pipe(f.file, &iter, flags);
+ else
+ error = vmsplice_to_user(f.file, &iter, flags);
- error = compat_import_iovec(type, iov32, nr_segs,
- ARRAY_SIZE(iovstack), &iov, &iter);
- if (error >= 0) {
- error = do_vmsplice(f.file, &iter, flags);
- kfree(iov);
- }
+ kfree(iov);
+out_fdput:
fdput(f);
return error;
}
-#endif
SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
int, fd_out, loff_t __user *, off_out,
@@ -1454,7 +1443,7 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
ret = -EAGAIN;
break;
}
- pipe_wait(pipe);
+ pipe_wait_readable(pipe);
}
pipe_unlock(pipe);
@@ -1493,7 +1482,7 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
ret = -ERESTARTSYS;
break;
}
- pipe_wait(pipe);
+ pipe_wait_writable(pipe);
}
pipe_unlock(pipe);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 9d042942d8b2..155521e51ac5 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -81,19 +81,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
struct ubifs_inode *ui;
bool encrypted = false;
- if (IS_ENCRYPTED(dir)) {
- err = fscrypt_get_encryption_info(dir);
- if (err) {
- ubifs_err(c, "fscrypt_get_encryption_info failed: %i", err);
- return ERR_PTR(err);
- }
-
- if (!fscrypt_has_encryption_key(dir))
- return ERR_PTR(-EPERM);
-
- encrypted = true;
- }
-
inode = new_inode(c->vfs_sb);
ui = ubifs_inode(inode);
if (!inode)
@@ -112,6 +99,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
current_time(inode);
inode->i_mapping->nrpages = 0;
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypted);
+ if (err) {
+ ubifs_err(c, "fscrypt_prepare_new_inode failed: %i", err);
+ goto out_iput;
+ }
+
switch (mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &ubifs_file_address_operations;
@@ -131,7 +124,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
case S_IFBLK:
case S_IFCHR:
inode->i_op = &ubifs_file_inode_operations;
- encrypted = false;
break;
default:
BUG();
@@ -151,9 +143,8 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
if (c->highest_inum >= INUM_WATERMARK) {
spin_unlock(&c->cnt_lock);
ubifs_err(c, "out of inode numbers");
- make_bad_inode(inode);
- iput(inode);
- return ERR_PTR(-EINVAL);
+ err = -EINVAL;
+ goto out_iput;
}
ubifs_warn(c, "running out of inode numbers (current %lu, max %u)",
(unsigned long)c->highest_inum, INUM_WATERMARK);
@@ -171,16 +162,19 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir,
spin_unlock(&c->cnt_lock);
if (encrypted) {
- err = fscrypt_inherit_context(dir, inode, &encrypted, true);
+ err = fscrypt_set_context(inode, NULL);
if (err) {
- ubifs_err(c, "fscrypt_inherit_context failed: %i", err);
- make_bad_inode(inode);
- iput(inode);
- return ERR_PTR(err);
+ ubifs_err(c, "fscrypt_set_context failed: %i", err);
+ goto out_iput;
}
}
return inode;
+
+out_iput:
+ make_bad_inode(inode);
+ iput(inode);
+ return ERR_PTR(err);
}
static int dbg_check_name(const struct ubifs_info *c,
@@ -515,7 +509,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
if (err)
return err;
- err = fscrypt_fname_alloc_buffer(dir, UBIFS_MAX_NLEN, &fstr);
+ err = fscrypt_fname_alloc_buffer(UBIFS_MAX_NLEN, &fstr);
if (err)
return err;
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index 8e3792177a85..d7816c01a4f6 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -386,7 +386,7 @@ fail_nomem:
static int vboxsf_parse_monolithic(struct fs_context *fc, void *data)
{
- char *options = data;
+ unsigned char *options = data;
if (options && options[0] == VBSF_MOUNT_SIGNATURE_BYTE_0 &&
options[1] == VBSF_MOUNT_SIGNATURE_BYTE_1 &&
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 8623c815164a..305d4bc07337 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -653,8 +653,8 @@ xfs_attr_shortform_create(
ASSERT(ifp->if_flags & XFS_IFINLINE);
}
xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
- hdr = (xfs_attr_sf_hdr_t *)ifp->if_u1.if_data;
- hdr->count = 0;
+ hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data;
+ memset(hdr, 0, sizeof(*hdr));
hdr->totsize = cpu_to_be16(sizeof(*hdr));
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
@@ -1036,8 +1036,10 @@ xfs_attr_shortform_verify(
* struct xfs_attr_sf_entry has a variable length.
* Check the fixed-offset parts of the structure are
* within the data buffer.
+ * xfs_attr_sf_entry is defined with a 1-byte variable
+ * array at the end, so we must subtract that off.
*/
- if (((char *)sfep + sizeof(*sfep)) >= endp)
+ if (((char *)sfep + sizeof(*sfep) - 1) >= endp)
return __this_address;
/* Don't allow names with known bad length. */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 9c40d5971035..1b0a01b06a05 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -6226,7 +6226,7 @@ xfs_bmap_validate_extent(
isrt = XFS_IS_REALTIME_INODE(ip);
endfsb = irec->br_startblock + irec->br_blockcount - 1;
- if (isrt) {
+ if (isrt && whichfork == XFS_DATA_FORK) {
if (!xfs_verify_rtbno(mp, irec->br_startblock))
return __this_address;
if (!xfs_verify_rtbno(mp, endfsb))
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index f742a96a2fe1..a6b37db55169 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -688,7 +688,7 @@ xfs_ialloc_ag_alloc(
args.minalignslop = igeo->cluster_align - 1;
/* Allow space for the inode btree to split. */
- args.minleft = igeo->inobt_maxlevels - 1;
+ args.minleft = igeo->inobt_maxlevels;
if ((error = xfs_alloc_vextent(&args)))
return error;
@@ -736,7 +736,7 @@ xfs_ialloc_ag_alloc(
/*
* Allow space for the inode btree to split.
*/
- args.minleft = igeo->inobt_maxlevels - 1;
+ args.minleft = igeo->inobt_maxlevels;
if ((error = xfs_alloc_vextent(&args)))
return error;
}
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index c6df01a2a158..7ad3659c5d2a 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -58,7 +58,7 @@
#define XFS_IALLOC_SPACE_RES(mp) \
(M_IGEO(mp)->ialloc_blks + \
((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \
- (M_IGEO(mp)->inobt_maxlevels - 1)))
+ M_IGEO(mp)->inobt_maxlevels))
/*
* Space reservation values for various transactions.
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 73cafc843cd7..5123f82f2477 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1165,7 +1165,7 @@ xfs_insert_file_space(
goto out_trans_cancel;
do {
- error = xfs_trans_roll_inode(&tp, ip);
+ error = xfs_defer_finish(&tp);
if (error)
goto out_trans_cancel;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c31cd3be9fb2..a29f78a663ca 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1223,6 +1223,14 @@ __xfs_filemap_fault(
return ret;
}
+static inline bool
+xfs_is_write_fault(
+ struct vm_fault *vmf)
+{
+ return (vmf->flags & FAULT_FLAG_WRITE) &&
+ (vmf->vma->vm_flags & VM_SHARED);
+}
+
static vm_fault_t
xfs_filemap_fault(
struct vm_fault *vmf)
@@ -1230,7 +1238,7 @@ xfs_filemap_fault(
/* DAX can shortcut the normal fault path on write faults! */
return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
IS_DAX(file_inode(vmf->vma->vm_file)) &&
- (vmf->flags & FAULT_FLAG_WRITE));
+ xfs_is_write_fault(vmf));
}
static vm_fault_t
@@ -1243,7 +1251,7 @@ xfs_filemap_huge_fault(
/* DAX can shortcut the normal fault path on write faults! */
return __xfs_filemap_fault(vmf, pe_size,
- (vmf->flags & FAULT_FLAG_WRITE));
+ xfs_is_write_fault(vmf));
}
static vm_fault_t