summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c1
-rw-r--r--fs/9p/v9fs.c1
-rw-r--r--fs/9p/vfs_addr.c1
-rw-r--r--fs/9p/vfs_dentry.c1
-rw-r--r--fs/9p/vfs_dir.c1
-rw-r--r--fs/9p/vfs_file.c1
-rw-r--r--fs/9p/vfs_inode.c1
-rw-r--r--fs/9p/vfs_inode_dotl.c1
-rw-r--r--fs/9p/vfs_super.c1
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/afs/cmservice.c6
-rw-r--r--fs/afs/dir.c1
-rw-r--r--fs/afs/file.c3
-rw-r--r--fs/afs/fs_probe.c5
-rw-r--r--fs/afs/internal.h8
-rw-r--r--fs/afs/rxrpc.c24
-rw-r--r--fs/afs/volume.c6
-rw-r--r--fs/afs/write.c83
-rw-r--r--fs/binfmt_elf.c4
-rw-r--r--fs/binfmt_elf_fdpic.c4
-rw-r--r--fs/btrfs/backref.c4
-rw-r--r--fs/btrfs/bio.c11
-rw-r--r--fs/btrfs/defrag.c6
-rw-r--r--fs/btrfs/disk-io.c20
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-io-tree.c2
-rw-r--r--fs/btrfs/extent-tree.c7
-rw-r--r--fs/btrfs/extent_io.c30
-rw-r--r--fs/btrfs/file.c15
-rw-r--r--fs/btrfs/fs.h6
-rw-r--r--fs/btrfs/inode.c8
-rw-r--r--fs/btrfs/qgroup.c40
-rw-r--r--fs/btrfs/raid56.c2
-rw-r--r--fs/btrfs/send.c11
-rw-r--r--fs/btrfs/space-info.c3
-rw-r--r--fs/btrfs/super.c3
-rw-r--r--fs/btrfs/tree-log.c52
-rw-r--r--fs/btrfs/volumes.c61
-rw-r--r--fs/btrfs/zoned.c2
-rw-r--r--fs/ceph/addr.c2
-rw-r--r--fs/ceph/caps.c27
-rw-r--r--fs/ceph/file.c26
-rw-r--r--fs/ceph/inode.c6
-rw-r--r--fs/ceph/ioctl.c2
-rw-r--r--fs/ceph/locks.c24
-rw-r--r--fs/ceph/super.h6
-rw-r--r--fs/char_dev.c15
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/cifs_debug.c8
-rw-r--r--fs/cifs/cifs_dfs_ref.c255
-rw-r--r--fs/cifs/cifs_ioctl.h2
-rw-r--r--fs/cifs/cifs_spnego.c2
-rw-r--r--fs/cifs/cifsacl.c2
-rw-r--r--fs/cifs/cifsencrypt.c1
-rw-r--r--fs/cifs/cifsfs.c14
-rw-r--r--fs/cifs/cifsfs.h8
-rw-r--r--fs/cifs/cifsglob.h94
-rw-r--r--fs/cifs/cifspdu.h50
-rw-r--r--fs/cifs/cifsproto.h20
-rw-r--r--fs/cifs/connect.c909
-rw-r--r--fs/cifs/dfs.c543
-rw-r--r--fs/cifs/dfs.h46
-rw-r--r--fs/cifs/dfs_cache.c508
-rw-r--r--fs/cifs/dfs_cache.h5
-rw-r--r--fs/cifs/dir.c21
-rw-r--r--fs/cifs/dns_resolve.c49
-rw-r--r--fs/cifs/dns_resolve.h4
-rw-r--r--fs/cifs/file.c36
-rw-r--r--fs/cifs/fs_context.c24
-rw-r--r--fs/cifs/fs_context.h3
-rw-r--r--fs/cifs/inode.c19
-rw-r--r--fs/cifs/link.c1
-rw-r--r--fs/cifs/misc.c87
-rw-r--r--fs/cifs/sess.c5
-rw-r--r--fs/cifs/smb1ops.c63
-rw-r--r--fs/cifs/smb2file.c4
-rw-r--r--fs/cifs/smb2inode.c112
-rw-r--r--fs/cifs/smb2ops.c211
-rw-r--r--fs/cifs/smb2pdu.c32
-rw-r--r--fs/cifs/smb2proto.h5
-rw-r--r--fs/coredump.c7
-rw-r--r--fs/dlm/lowcomms.c2
-rw-r--r--fs/erofs/super.c13
-rw-r--r--fs/erofs/zdata.c12
-rw-r--r--fs/erofs/zmap.c10
-rw-r--r--fs/exfat/dir.c184
-rw-r--r--fs/exfat/exfat_fs.h56
-rw-r--r--fs/exfat/file.c12
-rw-r--r--fs/exfat/inode.c17
-rw-r--r--fs/exfat/namei.c63
-rw-r--r--fs/ext4/super.c2
-rw-r--r--fs/ext4/xattr.c40
-rw-r--r--fs/f2fs/checkpoint.c9
-rw-r--r--fs/f2fs/compress.c48
-rw-r--r--fs/f2fs/data.c56
-rw-r--r--fs/f2fs/debug.c131
-rw-r--r--fs/f2fs/dir.c36
-rw-r--r--fs/f2fs/extent_cache.c697
-rw-r--r--fs/f2fs/f2fs.h278
-rw-r--r--fs/f2fs/file.c46
-rw-r--r--fs/f2fs/gc.c79
-rw-r--r--fs/f2fs/inode.c20
-rw-r--r--fs/f2fs/namei.c391
-rw-r--r--fs/f2fs/node.c19
-rw-r--r--fs/f2fs/node.h3
-rw-r--r--fs/f2fs/recovery.c4
-rw-r--r--fs/f2fs/segment.c212
-rw-r--r--fs/f2fs/segment.h6
-rw-r--r--fs/f2fs/shrinker.c25
-rw-r--r--fs/f2fs/super.c126
-rw-r--r--fs/f2fs/sysfs.c164
-rw-r--r--fs/fs-writeback.c17
-rw-r--r--fs/fuse/acl.c68
-rw-r--r--fs/fuse/dir.c6
-rw-r--r--fs/fuse/fuse_i.h6
-rw-r--r--fs/fuse/inode.c21
-rw-r--r--fs/fuse/xattr.c51
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/bmap.c3
-rw-r--r--fs/gfs2/file.c3
-rw-r--r--fs/gfs2/glock.c269
-rw-r--r--fs/gfs2/glock.h65
-rw-r--r--fs/gfs2/glops.c44
-rw-r--r--fs/gfs2/incore.h1
-rw-r--r--fs/gfs2/inode.c64
-rw-r--r--fs/gfs2/log.c11
-rw-r--r--fs/gfs2/meta_io.c6
-rw-r--r--fs/gfs2/super.c84
-rw-r--r--fs/gfs2/xattr.c26
-rw-r--r--fs/hfs/inode.c15
-rw-r--r--fs/iomap/buffered-io.c254
-rw-r--r--fs/iomap/iter.c19
-rw-r--r--fs/kernfs/dir.c106
-rw-r--r--fs/kernfs/file.c18
-rw-r--r--fs/kernfs/inode.c12
-rw-r--r--fs/kernfs/kernfs-internal.h2
-rw-r--r--fs/kernfs/mount.c10
-rw-r--r--fs/kernfs/symlink.c2
-rw-r--r--fs/ksmbd/auth.c3
-rw-r--r--fs/ksmbd/connection.c7
-rw-r--r--fs/ksmbd/ksmbd_netlink.h1
-rw-r--r--fs/ksmbd/mgmt/user_session.c8
-rw-r--r--fs/ksmbd/server.c20
-rw-r--r--fs/ksmbd/smb2ops.c10
-rw-r--r--fs/ksmbd/smb2pdu.c27
-rw-r--r--fs/ksmbd/smb2pdu.h2
-rw-r--r--fs/ksmbd/smb_common.c2
-rw-r--r--fs/ksmbd/smb_common.h12
-rw-r--r--fs/ksmbd/transport_tcp.c5
-rw-r--r--fs/nfs/dir.c7
-rw-r--r--fs/nfs/filelayout/filelayout.c8
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c4
-rw-r--r--fs/nfs/nfs4idmap.c2
-rw-r--r--fs/nfs/sysfs.c4
-rw-r--r--fs/nfsd/filecache.c367
-rw-r--r--fs/nfsd/filecache.h5
-rw-r--r--fs/nfsd/netns.h2
-rw-r--r--fs/nfsd/nfs4callback.c6
-rw-r--r--fs/nfsd/nfs4proc.c28
-rw-r--r--fs/nfsd/nfs4state.c46
-rw-r--r--fs/nfsd/nfs4xdr.c13
-rw-r--r--fs/nfsd/nfsctl.c7
-rw-r--r--fs/nfsd/nfsd.h6
-rw-r--r--fs/nfsd/nfsproc.c4
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nfsd/trace.h95
-rw-r--r--fs/nilfs2/btree.c15
-rw-r--r--fs/nilfs2/segment.c2
-rw-r--r--fs/ntfs3/attrib.c392
-rw-r--r--fs/ntfs3/attrlist.c5
-rw-r--r--fs/ntfs3/bitfunc.c4
-rw-r--r--fs/ntfs3/bitmap.c168
-rw-r--r--fs/ntfs3/dir.c4
-rw-r--r--fs/ntfs3/file.c207
-rw-r--r--fs/ntfs3/frecord.c40
-rw-r--r--fs/ntfs3/fslog.c62
-rw-r--r--fs/ntfs3/fsntfs.c190
-rw-r--r--fs/ntfs3/index.c127
-rw-r--r--fs/ntfs3/inode.c203
-rw-r--r--fs/ntfs3/namei.c238
-rw-r--r--fs/ntfs3/ntfs.h6
-rw-r--r--fs/ntfs3/ntfs_fs.h41
-rw-r--r--fs/ntfs3/record.c13
-rw-r--r--fs/ntfs3/run.c28
-rw-r--r--fs/ntfs3/super.c143
-rw-r--r--fs/ntfs3/upcase.c12
-rw-r--r--fs/ntfs3/xattr.c158
-rw-r--r--fs/ocfs2/cluster/tcp.c1
-rw-r--r--fs/orangefs/file.c1
-rw-r--r--fs/orangefs/inode.c2
-rw-r--r--fs/orangefs/orangefs-debugfs.c29
-rw-r--r--fs/orangefs/orangefs-mod.c8
-rw-r--r--fs/orangefs/orangefs-sysfs.c71
-rw-r--r--fs/overlayfs/copy_up.c6
-rw-r--r--fs/pnode.c2
-rw-r--r--fs/proc/page.c3
-rw-r--r--fs/pstore/Kconfig1
-rw-r--r--fs/pstore/pmsg.c9
-rw-r--r--fs/pstore/ram.c2
-rw-r--r--fs/udf/inode.c6
-rw-r--r--fs/userfaultfd.c28
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c8
-rw-r--r--fs/xfs/libxfs/xfs_btree.c7
-rw-r--r--fs/xfs/libxfs/xfs_btree.h1
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h18
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c146
-rw-r--r--fs/xfs/libxfs/xfs_sb.c4
-rw-r--r--fs/xfs/scrub/agheader.c47
-rw-r--r--fs/xfs/scrub/agheader_repair.c81
-rw-r--r--fs/xfs/scrub/attr.c11
-rw-r--r--fs/xfs/scrub/bitmap.c11
-rw-r--r--fs/xfs/scrub/bmap.c147
-rw-r--r--fs/xfs/scrub/btree.c14
-rw-r--r--fs/xfs/scrub/common.c48
-rw-r--r--fs/xfs/scrub/common.h2
-rw-r--r--fs/xfs/scrub/dabtree.c4
-rw-r--r--fs/xfs/scrub/dir.c10
-rw-r--r--fs/xfs/scrub/fscounters.c109
-rw-r--r--fs/xfs/scrub/inode.c2
-rw-r--r--fs/xfs/scrub/quota.c8
-rw-r--r--fs/xfs/scrub/refcount.c12
-rw-r--r--fs/xfs/scrub/repair.c51
-rw-r--r--fs/xfs/scrub/scrub.c6
-rw-r--r--fs/xfs/scrub/scrub.h18
-rw-r--r--fs/xfs/scrub/symlink.c2
-rw-r--r--fs/xfs/xfs_aops.c32
-rw-r--r--fs/xfs/xfs_bmap_util.c10
-rw-r--r--fs/xfs/xfs_bmap_util.h2
-rw-r--r--fs/xfs/xfs_buf.c1
-rw-r--r--fs/xfs/xfs_buf_item.c2
-rw-r--r--fs/xfs/xfs_error.c46
-rw-r--r--fs/xfs/xfs_error.h13
-rw-r--r--fs/xfs/xfs_extent_busy.c1
-rw-r--r--fs/xfs/xfs_file.c2
-rw-r--r--fs/xfs/xfs_fsmap.c4
-rw-r--r--fs/xfs/xfs_icache.c16
-rw-r--r--fs/xfs/xfs_inode.c2
-rw-r--r--fs/xfs/xfs_ioctl.c4
-rw-r--r--fs/xfs/xfs_iomap.c185
-rw-r--r--fs/xfs/xfs_iomap.h6
-rw-r--r--fs/xfs/xfs_log.c46
-rw-r--r--fs/xfs/xfs_mount.c15
-rw-r--r--fs/xfs/xfs_pnfs.c6
-rw-r--r--fs/xfs/xfs_qm.c18
-rw-r--r--fs/xfs/xfs_reflink.c2
-rw-r--r--fs/xfs/xfs_rtalloc.c60
-rw-r--r--fs/xfs/xfs_super.c2
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trace.h86
-rw-r--r--fs/xfs/xfs_trans_ail.c4
-rw-r--r--fs/xfs/xfs_xattr.c2
-rw-r--r--fs/zonefs/super.c22
252 files changed, 7165 insertions, 4769 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 23cf9b2fbfe4..805151114e96 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -11,7 +11,6 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
-#include <linux/idr.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 0129de2ea31a..3a9c4517265f 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -14,7 +14,6 @@
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/parser.h>
-#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <net/9p/9p.h>
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index a19891015f19..97599edbc300 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -14,7 +14,6 @@
#include <linux/string.h>
#include <linux/inet.h>
#include <linux/pagemap.h>
-#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/swap.h>
#include <linux/uio.h>
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index f89f01734587..65fa2df5e49b 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -15,7 +15,6 @@
#include <linux/string.h>
#include <linux/inet.h>
#include <linux/namei.h>
-#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <net/9p/9p.h>
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 3bb95adc9619..59b0e8948f78 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -14,7 +14,6 @@
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/inet.h>
-#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/uio.h>
#include <linux/fscache.h>
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index aec43ba83799..b740017634ef 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -18,7 +18,6 @@
#include <linux/pagemap.h>
#include <linux/utsname.h>
#include <linux/uaccess.h>
-#include <linux/idr.h>
#include <linux/uio.h>
#include <linux/slab.h>
#include <net/9p/9p.h>
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4d1a4a8d9277..27a04a226d97 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -17,7 +17,6 @@
#include <linux/string.h>
#include <linux/inet.h>
#include <linux/namei.h>
-#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/xattr.h>
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 03c1743c4aff..f806b3f11649 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -15,7 +15,6 @@
#include <linux/string.h>
#include <linux/inet.h>
#include <linux/namei.h>
-#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/xattr.h>
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 2d9ee073d12c..266c4693e20c 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -15,7 +15,6 @@
#include <linux/inet.h>
#include <linux/pagemap.h>
#include <linux/mount.h>
-#include <linux/idr.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/statfs.h>
diff --git a/fs/affs/file.c b/fs/affs/file.c
index cefa222f7881..8daeed31e1af 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -880,7 +880,7 @@ affs_truncate(struct inode *inode)
if (inode->i_size > AFFS_I(inode)->mmu_private) {
struct address_space *mapping = inode->i_mapping;
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
loff_t isize = inode->i_size;
int res;
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 7dcd59693a0c..d4ddb20d6732 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -13,6 +13,8 @@
#include "internal.h"
#include "afs_cm.h"
#include "protocol_yfs.h"
+#define RXRPC_TRACE_ONLY_DEFINE_ENUMS
+#include <trace/events/rxrpc.h>
static int afs_deliver_cb_init_call_back_state(struct afs_call *);
static int afs_deliver_cb_init_call_back_state3(struct afs_call *);
@@ -191,7 +193,7 @@ static void afs_cm_destructor(struct afs_call *call)
* Abort a service call from within an action function.
*/
static void afs_abort_service_call(struct afs_call *call, u32 abort_code, int error,
- const char *why)
+ enum rxrpc_abort_reason why)
{
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
abort_code, error, why);
@@ -469,7 +471,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0)
afs_send_empty_reply(call);
else
- afs_abort_service_call(call, 1, 1, "K-1");
+ afs_abort_service_call(call, 1, 1, afs_abort_probeuuid_negative);
afs_put_call(call);
_leave("");
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 104df2964225..b7c1f8c84b38 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -77,6 +77,7 @@ const struct address_space_operations afs_dir_aops = {
.dirty_folio = afs_dir_dirty_folio,
.release_folio = afs_dir_release_folio,
.invalidate_folio = afs_dir_invalidate_folio,
+ .migrate_folio = filemap_migrate_folio,
};
const struct dentry_operations afs_fs_dentry_operations = {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 2eeab57df133..68d6d5dc608d 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -58,14 +58,15 @@ const struct address_space_operations afs_file_aops = {
.invalidate_folio = afs_invalidate_folio,
.write_begin = afs_write_begin,
.write_end = afs_write_end,
- .writepage = afs_writepage,
.writepages = afs_writepages,
+ .migrate_folio = filemap_migrate_folio,
};
const struct address_space_operations afs_symlink_aops = {
.read_folio = afs_symlink_read_folio,
.release_folio = afs_release_folio,
.invalidate_folio = afs_invalidate_folio,
+ .migrate_folio = filemap_migrate_folio,
};
static const struct vm_operations_struct afs_vm_ops = {
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index 3ac5fcf98d0d..daaf3810cc92 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -366,12 +366,15 @@ void afs_fs_probe_dispatcher(struct work_struct *work)
unsigned long nowj, timer_at, poll_at;
bool first_pass = true, set_timer = false;
- if (!net->live)
+ if (!net->live) {
+ afs_dec_servers_outstanding(net);
return;
+ }
_enter("");
if (list_empty(&net->fs_probe_fast) && list_empty(&net->fs_probe_slow)) {
+ afs_dec_servers_outstanding(net);
_leave(" [none]");
return;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 9ba7b68375c9..fd8567b98e2b 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -972,13 +972,6 @@ extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16);
extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16);
/*
- * cache.c
- */
-#ifdef CONFIG_AFS_FSCACHE
-extern struct fscache_netfs afs_cache_netfs;
-#endif
-
-/*
* callback.c
*/
extern void afs_invalidate_mmap_work(struct work_struct *);
@@ -1391,7 +1384,6 @@ extern void afs_put_permits(struct afs_permits *);
extern void afs_clear_permits(struct afs_vnode *);
extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int,
struct afs_status_cb *);
-extern void afs_zap_permits(struct rcu_head *);
extern struct key *afs_request_key(struct afs_cell *);
extern struct key *afs_request_key_rcu(struct afs_cell *);
extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *);
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index c62939e5ea1f..7817e2b860e5 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -13,6 +13,8 @@
#include "internal.h"
#include "afs_cm.h"
#include "protocol_yfs.h"
+#define RXRPC_TRACE_ONLY_DEFINE_ENUMS
+#include <trace/events/rxrpc.h>
struct workqueue_struct *afs_async_calls;
@@ -397,7 +399,8 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
error_do_abort:
if (ret != -ECONNABORTED) {
rxrpc_kernel_abort_call(call->net->socket, rxcall,
- RX_USER_ABORT, ret, "KSD");
+ RX_USER_ABORT, ret,
+ afs_abort_send_data_error);
} else {
len = 0;
iov_iter_kvec(&msg.msg_iter, ITER_DEST, NULL, 0, 0);
@@ -527,7 +530,8 @@ static void afs_deliver_to_call(struct afs_call *call)
case -ENOTSUPP:
abort_code = RXGEN_OPCODE;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- abort_code, ret, "KIV");
+ abort_code, ret,
+ afs_abort_op_not_supported);
goto local_abort;
case -EIO:
pr_err("kAFS: Call %u in bad state %u\n",
@@ -542,12 +546,14 @@ static void afs_deliver_to_call(struct afs_call *call)
if (state != AFS_CALL_CL_AWAIT_REPLY)
abort_code = RXGEN_SS_UNMARSHAL;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- abort_code, ret, "KUM");
+ abort_code, ret,
+ afs_abort_unmarshal_error);
goto local_abort;
default:
abort_code = RX_CALL_DEAD;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- abort_code, ret, "KER");
+ abort_code, ret,
+ afs_abort_general_error);
goto local_abort;
}
}
@@ -619,7 +625,8 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
/* Kill off the call if it's still live. */
_debug("call interrupted");
if (rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- RX_USER_ABORT, -EINTR, "KWI"))
+ RX_USER_ABORT, -EINTR,
+ afs_abort_interrupted))
afs_set_call_complete(call, -EINTR, 0);
}
}
@@ -836,7 +843,8 @@ void afs_send_empty_reply(struct afs_call *call)
case -ENOMEM:
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
- RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
+ RXGEN_SS_MARSHAL, -ENOMEM,
+ afs_abort_oom);
fallthrough;
default:
_leave(" [error]");
@@ -878,7 +886,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
if (n == -ENOMEM) {
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
- RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
+ RXGEN_SS_MARSHAL, -ENOMEM,
+ afs_abort_oom);
}
_leave(" [error]");
}
@@ -900,6 +909,7 @@ int afs_extract_data(struct afs_call *call, bool want_more)
ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter,
&call->iov_len, want_more, &remote_abort,
&call->service_id);
+ trace_afs_receive_data(call, call->iter, want_more, ret);
if (ret == 0 || ret == -EAGAIN)
return ret;
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index f4937029dcd7..29d483c80281 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -70,11 +70,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
{
struct afs_server_list *slist;
struct afs_volume *volume;
- int ret = -ENOMEM, nr_servers = 0, i;
-
- for (i = 0; i < vldb->nr_servers; i++)
- if (vldb->fs_mask[i] & type_mask)
- nr_servers++;
+ int ret = -ENOMEM;
volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL);
if (!volume)
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 08fd456dde67..19df10d63323 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -14,6 +14,11 @@
#include <linux/netfs.h>
#include "internal.h"
+static int afs_writepages_region(struct address_space *mapping,
+ struct writeback_control *wbc,
+ loff_t start, loff_t end, loff_t *_next,
+ bool max_one_loop);
+
static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len,
loff_t i_size, bool caching);
@@ -39,6 +44,25 @@ static void afs_folio_start_fscache(bool caching, struct folio *folio)
#endif
/*
+ * Flush out a conflicting write. This may extend the write to the surrounding
+ * pages if also dirty and contiguous to the conflicting region..
+ */
+static int afs_flush_conflicting_write(struct address_space *mapping,
+ struct folio *folio)
+{
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = LONG_MAX,
+ .range_start = folio_pos(folio),
+ .range_end = LLONG_MAX,
+ };
+ loff_t next;
+
+ return afs_writepages_region(mapping, &wbc, folio_pos(folio), LLONG_MAX,
+ &next, true);
+}
+
+/*
* prepare to perform part of a write to a page
*/
int afs_write_begin(struct file *file, struct address_space *mapping,
@@ -80,7 +104,8 @@ try_again:
if (folio_test_writeback(folio)) {
trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio);
- goto flush_conflicting_write;
+ folio_unlock(folio);
+ goto wait_for_writeback;
}
/* If the file is being filled locally, allow inter-write
* spaces to be merged into writes. If it's not, only write
@@ -99,8 +124,15 @@ try_again:
* flush the page out.
*/
flush_conflicting_write:
- _debug("flush conflict");
- ret = folio_write_one(folio);
+ trace_afs_folio_dirty(vnode, tracepoint_string("confl"), folio);
+ folio_unlock(folio);
+
+ ret = afs_flush_conflicting_write(mapping, folio);
+ if (ret < 0)
+ goto error;
+
+wait_for_writeback:
+ ret = folio_wait_writeback_killable(folio);
if (ret < 0)
goto error;
@@ -664,39 +696,12 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
}
/*
- * write a page back to the server
- * - the caller locked the page for us
- */
-int afs_writepage(struct page *subpage, struct writeback_control *wbc)
-{
- struct folio *folio = page_folio(subpage);
- ssize_t ret;
- loff_t start;
-
- _enter("{%lx},", folio_index(folio));
-
-#ifdef CONFIG_AFS_FSCACHE
- folio_wait_fscache(folio);
-#endif
-
- start = folio_index(folio) * PAGE_SIZE;
- ret = afs_write_back_from_locked_folio(folio_mapping(folio), wbc,
- folio, start, LLONG_MAX - start);
- if (ret < 0) {
- _leave(" = %zd", ret);
- return ret;
- }
-
- _leave(" = 0");
- return 0;
-}
-
-/*
* write a region of pages back to the server
*/
static int afs_writepages_region(struct address_space *mapping,
struct writeback_control *wbc,
- loff_t start, loff_t end, loff_t *_next)
+ loff_t start, loff_t end, loff_t *_next,
+ bool max_one_loop)
{
struct folio *folio;
struct page *head_page;
@@ -775,6 +780,9 @@ static int afs_writepages_region(struct address_space *mapping,
start += ret;
+ if (max_one_loop)
+ break;
+
cond_resched();
} while (wbc->nr_to_write > 0);
@@ -806,24 +814,27 @@ int afs_writepages(struct address_space *mapping,
if (wbc->range_cyclic) {
start = mapping->writeback_index * PAGE_SIZE;
- ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX, &next);
+ ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX,
+ &next, false);
if (ret == 0) {
mapping->writeback_index = next / PAGE_SIZE;
if (start > 0 && wbc->nr_to_write > 0) {
ret = afs_writepages_region(mapping, wbc, 0,
- start, &next);
+ start, &next, false);
if (ret == 0)
mapping->writeback_index =
next / PAGE_SIZE;
}
}
} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next);
+ ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX,
+ &next, false);
if (wbc->nr_to_write > 0 && ret == 0)
mapping->writeback_index = next / PAGE_SIZE;
} else {
ret = afs_writepages_region(mapping, wbc,
- wbc->range_start, wbc->range_end, &next);
+ wbc->range_start, wbc->range_end,
+ &next, false);
}
up_read(&vnode->validate_lock);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index de63572a9404..9a780fafc539 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2034,7 +2034,7 @@ static int elf_core_dump(struct coredump_params *cprm)
* The number of segs are recored into ELF header as 16bit value.
* Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
*/
- segs = cprm->vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs(cprm);
/* for notes section */
segs++;
@@ -2074,7 +2074,7 @@ static int elf_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
offset += cprm->vma_data_size;
- offset += elf_core_extra_data_size();
+ offset += elf_core_extra_data_size(cprm);
e_shoff = offset;
if (e_phnum == PN_XNUM) {
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 096e3520a0b1..a05eafcacfb2 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1509,7 +1509,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
tmp->next = thread_list;
thread_list = tmp;
- segs = cprm->vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs(cprm);
/* for notes section */
segs++;
@@ -1555,7 +1555,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
offset += cprm->vma_data_size;
- offset += elf_core_extra_data_size();
+ offset += elf_core_extra_data_size(cprm);
e_shoff = offset;
if (e_phnum == PN_XNUM) {
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 21c92c74bf71..46851511b661 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -484,6 +484,7 @@ static int add_all_parents(struct btrfs_backref_walk_ctx *ctx,
u64 wanted_disk_byte = ref->wanted_disk_byte;
u64 count = 0;
u64 data_offset;
+ u8 type;
if (level != 0) {
eb = path->nodes[level];
@@ -538,6 +539,9 @@ static int add_all_parents(struct btrfs_backref_walk_ctx *ctx,
continue;
}
fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(eb, fi);
+ if (type == BTRFS_FILE_EXTENT_INLINE)
+ goto next;
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
data_offset = btrfs_file_extent_offset(eb, fi);
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index b8fb7ef6b520..8affc88b0e0a 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -329,7 +329,16 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
&map_length, &bioc, mirror_num);
if (ret)
goto out_counter_dec;
- BUG_ON(mirror_num != bioc->mirror_num);
+ /*
+ * This happens when dev-replace is also running, and the
+ * mirror_num indicates the dev-replace target.
+ *
+ * In this case, we don't need to do anything, as the read
+ * error just means the replace progress hasn't reached our
+ * read range, and later replace routine would handle it well.
+ */
+ if (mirror_num != bioc->mirror_num)
+ goto out_counter_dec;
}
sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 0a3c261b69c9..d81b764a7644 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -358,8 +358,10 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
goto out;
path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
level = btrfs_header_level(root->node);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0888d484df80..3aa04224315e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -367,7 +367,14 @@ error:
btrfs_print_tree(eb, 0);
btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
eb->start);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ /*
+ * Be noisy if this is an extent buffer from a log tree. We don't abort
+ * a transaction in case there's a bad log tree extent buffer, we just
+ * fallback to a transaction commit. Still we want to know when there is
+ * a bad log tree extent buffer, as that may signal a bug somewhere.
+ */
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
+ btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
return ret;
}
@@ -530,6 +537,9 @@ static int validate_extent_buffer(struct extent_buffer *eb,
}
if (found_level != check->level) {
+ btrfs_err(fs_info,
+ "level verify failed on logical %llu mirror %u wanted %u found %u",
+ eb->start, eb->read_mirror, check->level, found_level);
ret = -EIO;
goto out;
}
@@ -3381,6 +3391,8 @@ out:
/*
* Do various sanity and dependency checks of different features.
*
+ * @is_rw_mount: If the mount is read-write.
+ *
* This is the place for less strict checks (like for subpage or artificial
* feature dependencies).
*
@@ -3391,7 +3403,7 @@ out:
* (space cache related) can modify on-disk format like free space tree and
* screw up certain feature dependencies.
*/
-int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb)
+int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
{
struct btrfs_super_block *disk_super = fs_info->super_copy;
u64 incompat = btrfs_super_incompat_flags(disk_super);
@@ -3430,7 +3442,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb)
if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
- if (compat_ro_unsupp && !sb_rdonly(sb)) {
+ if (compat_ro_unsupp && is_rw_mount) {
btrfs_err(fs_info,
"cannot mount read-write because of unknown compat_ro features (0x%llx)",
compat_ro);
@@ -3633,7 +3645,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_alloc;
}
- ret = btrfs_check_features(fs_info, sb);
+ ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
if (ret < 0) {
err = ret;
goto fail_alloc;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 363935cfc084..f2f295eb6103 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -50,7 +50,7 @@ int __cold open_ctree(struct super_block *sb,
void __cold close_ctree(struct btrfs_fs_info *fs_info);
int btrfs_validate_super(struct btrfs_fs_info *fs_info,
struct btrfs_super_block *sb, int mirror_num);
-int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb);
+int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index 9ae9cd1e7035..3c7766dfaa69 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -1551,7 +1551,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
u64 last = 0;
int found = 0;
- if (WARN_ON(search_end <= cur_start))
+ if (WARN_ON(search_end < cur_start))
return 0;
spin_lock(&tree->lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 892d78c1853c..72ba13b027a9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1713,6 +1713,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
BUG();
if (ret && insert_reserved)
btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
+ if (ret < 0)
+ btrfs_err(trans->fs_info,
+"failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d",
+ node->bytenr, node->num_bytes, node->type,
+ node->action, node->ref_mod, ret);
return ret;
}
@@ -1954,8 +1959,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
if (ret) {
unselect_delayed_ref_head(delayed_refs, locked_ref);
btrfs_put_delayed_ref(ref);
- btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
- ret);
return ret;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 83dd3aa59663..9bd32daa9b9a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -104,6 +104,15 @@ struct btrfs_bio_ctrl {
btrfs_bio_end_io_t end_io_func;
/*
+ * This is for metadata read, to provide the extra needed verification
+ * info. This has to be provided for submit_one_bio(), as
+ * submit_one_bio() can submit a bio if it ends at stripe boundary. If
+ * no such parent_check is provided, the metadata can hit false alert at
+ * endio time.
+ */
+ struct btrfs_tree_parent_check *parent_check;
+
+ /*
* Tell writepage not to lock the state bits for this range, it still
* does the unlocking.
*/
@@ -133,13 +142,24 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
- if (!is_data_inode(&inode->vfs_inode))
+ if (!is_data_inode(&inode->vfs_inode)) {
+ if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
+ /*
+ * For metadata read, we should have the parent_check,
+ * and copy it to bbio for metadata verification.
+ */
+ ASSERT(bio_ctrl->parent_check);
+ memcpy(&btrfs_bio(bio)->parent_check,
+ bio_ctrl->parent_check,
+ sizeof(struct btrfs_tree_parent_check));
+ }
btrfs_submit_metadata_bio(inode, bio, mirror_num);
- else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
+ } else if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
btrfs_submit_data_write_bio(inode, bio, mirror_num);
- else
+ } else {
btrfs_submit_data_read_bio(inode, bio, mirror_num,
bio_ctrl->compress_type);
+ }
/* The bio is owned by the end_io handler now */
bio_ctrl->bio = NULL;
@@ -4829,6 +4849,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
struct extent_state *cached_state = NULL;
struct btrfs_bio_ctrl bio_ctrl = {
.mirror_num = mirror_num,
+ .parent_check = check,
};
int ret = 0;
@@ -4878,7 +4899,6 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
*/
atomic_dec(&eb->io_pages);
}
- memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check));
submit_one_bio(&bio_ctrl);
if (ret || wait != WAIT_COMPLETE) {
free_extent_state(cached_state);
@@ -4905,6 +4925,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
unsigned long num_reads = 0;
struct btrfs_bio_ctrl bio_ctrl = {
.mirror_num = mirror_num,
+ .parent_check = check,
};
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
@@ -4996,7 +5017,6 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
}
}
- memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check));
submit_one_bio(&bio_ctrl);
if (ret || wait != WAIT_COMPLETE)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 91b00eb2440e..af046d22300e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3354,7 +3354,7 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
bool search_io_tree = true;
bool ret = false;
- while (cur_offset < end) {
+ while (cur_offset <= end) {
u64 delalloc_start;
u64 delalloc_end;
bool delalloc;
@@ -3541,6 +3541,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_file_extent_item *extent;
u64 extent_end;
+ u8 type;
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
@@ -3596,10 +3597,16 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(leaf, extent);
- if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 ||
- btrfs_file_extent_type(leaf, extent) ==
- BTRFS_FILE_EXTENT_PREALLOC) {
+ /*
+ * Can't access the extent's disk_bytenr field if this is an
+ * inline extent, since at that offset, it's where the extent
+ * data starts.
+ */
+ if (type == BTRFS_FILE_EXTENT_PREALLOC ||
+ (type == BTRFS_FILE_EXTENT_REG &&
+ btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
/*
* Explicit hole or prealloc extent, search for delalloc.
* A prealloc extent is treated like a hole.
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index a749367e5ae2..37b86acfcbcf 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -119,6 +119,12 @@ enum {
/* Indicate that we want to commit the transaction. */
BTRFS_FS_NEED_TRANS_COMMIT,
+ /*
+ * Indicate metadata over-commit is disabled. This is set when active
+ * zone tracking is needed.
+ */
+ BTRFS_FS_NO_OVERCOMMIT,
+
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8bcad9940154..98a800b8bd43 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7092,7 +7092,7 @@ next:
* Other members are not utilized for inline extents.
*/
ASSERT(em->block_start == EXTENT_MAP_INLINE);
- ASSERT(em->len = fs_info->sectorsize);
+ ASSERT(em->len == fs_info->sectorsize);
ret = read_inline_extent(inode, path, page);
if (ret < 0)
@@ -9377,8 +9377,10 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
if (flags & RENAME_WHITEOUT) {
whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir);
- if (!whiteout_args.inode)
- return -ENOMEM;
+ if (!whiteout_args.inode) {
+ ret = -ENOMEM;
+ goto out_fscrypt_names;
+ }
ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
if (ret)
goto out_whiteout_inode;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5c636e00d77d..af97413abcf4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2765,9 +2765,19 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
/*
* Old roots should be searched when inserting qgroup
- * extent record
+ * extent record.
+ *
+ * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case,
+ * we may have some record inserted during
+ * NO_ACCOUNTING (thus no old_roots populated), but
+ * later we start rescan, which clears NO_ACCOUNTING,
+ * leaving some inserted records without old_roots
+ * populated.
+ *
+ * Those cases are rare and should not cause too much
+ * time spent during commit_transaction().
*/
- if (WARN_ON(!record->old_roots)) {
+ if (!record->old_roots) {
/* Search commit root to find old_roots */
ret = btrfs_find_all_roots(&ctx, false);
if (ret < 0)
@@ -2787,6 +2797,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
* current root. It's safe inside commit_transaction().
*/
ctx.trans = trans;
+ ctx.time_seq = BTRFS_SEQ_LAST;
ret = btrfs_find_all_roots(&ctx, false);
if (ret < 0)
goto cleanup;
@@ -3356,6 +3367,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
int err = -ENOMEM;
int ret = 0;
bool stopped = false;
+ bool did_leaf_rescans = false;
path = btrfs_alloc_path();
if (!path)
@@ -3376,6 +3388,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
}
err = qgroup_rescan_leaf(trans, path);
+ did_leaf_rescans = true;
if (err > 0)
btrfs_commit_transaction(trans);
@@ -3396,16 +3409,23 @@ out:
mutex_unlock(&fs_info->qgroup_rescan_lock);
/*
- * only update status, since the previous part has already updated the
- * qgroup info.
+ * Only update status, since the previous part has already updated the
+ * qgroup info, and only if we did any actual work. This also prevents
+ * race with a concurrent quota disable, which has already set
+ * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
+ * btrfs_quota_disable().
*/
- trans = btrfs_start_transaction(fs_info->quota_root, 1);
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ if (did_leaf_rescans) {
+ trans = btrfs_start_transaction(fs_info->quota_root, 1);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ trans = NULL;
+ btrfs_err(fs_info,
+ "fail to start transaction for status update: %d",
+ err);
+ }
+ } else {
trans = NULL;
- btrfs_err(fs_info,
- "fail to start transaction for status update: %d",
- err);
}
mutex_lock(&fs_info->qgroup_rescan_lock);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 2d90a6b5eb00..6a2cf754912d 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -2646,7 +2646,7 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
void **pointers = NULL;
void **unmap_array = NULL;
int sector_nr;
- int ret;
+ int ret = 0;
/*
* @pointers array stores the pointer for each sector.
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 67f7c698ade3..e65e6b6600a7 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -486,6 +486,11 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
old_buf_len = p->buf_len;
/*
+ * Allocate to the next largest kmalloc bucket size, to let
+ * the fast path happen most of the time.
+ */
+ len = kmalloc_size_roundup(len);
+ /*
* First time the inline_buf does not suffice
*/
if (p->buf == p->inline_buf) {
@@ -498,11 +503,7 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
if (!tmp_buf)
return -ENOMEM;
p->buf = tmp_buf;
- /*
- * The real size of the buffer is bigger, this will let the fast path
- * happen most of the time
- */
- p->buf_len = ksize(p->buf);
+ p->buf_len = len;
if (p->reversed) {
tmp_buf = p->buf + old_buf_len - path_len - 1;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d28ee4e36f3d..69c09508afb5 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -407,7 +407,8 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
return 0;
used = btrfs_space_info_used(space_info, true);
- if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA))
+ if (test_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags) &&
+ (space_info->flags & BTRFS_BLOCK_GROUP_METADATA))
avail = 0;
else
avail = calc_available_free_space(fs_info, space_info, flush);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 93f52ee85f6f..433ce221dc5c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1705,7 +1705,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (ret)
goto restore;
- ret = btrfs_check_features(fs_info, sb);
+ ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY));
if (ret < 0)
goto restore;
@@ -2514,6 +2514,7 @@ static __always_inline void btrfs_exit_btrfs_fs(void)
static void __exit exit_btrfs_fs(void)
{
btrfs_exit_btrfs_fs();
+ btrfs_cleanup_fs_uuids();
}
static int __init init_btrfs_fs(void)
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index a3c43f0b1c95..d43261545264 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2980,7 +2980,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = 0;
if (ret) {
blk_finish_plug(&plug);
- btrfs_abort_transaction(trans, ret);
btrfs_set_log_full_commit(trans);
mutex_unlock(&root->log_mutex);
goto out;
@@ -3045,15 +3044,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
blk_finish_plug(&plug);
btrfs_set_log_full_commit(trans);
-
- if (ret != -ENOSPC) {
- btrfs_abort_transaction(trans, ret);
- mutex_unlock(&log_root_tree->log_mutex);
- goto out;
- }
+ if (ret != -ENOSPC)
+ btrfs_err(fs_info,
+ "failed to update log for root %llu ret %d",
+ root->root_key.objectid, ret);
btrfs_wait_tree_log_extents(log, mark);
mutex_unlock(&log_root_tree->log_mutex);
- ret = BTRFS_LOG_FORCE_COMMIT;
goto out;
}
@@ -3112,7 +3108,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out_wake_log_root;
} else if (ret) {
btrfs_set_log_full_commit(trans);
- btrfs_abort_transaction(trans, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
}
@@ -3826,7 +3821,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
path->slots[0]);
if (tmp.type == BTRFS_DIR_INDEX_KEY)
last_old_dentry_offset = tmp.offset;
+ } else if (ret < 0) {
+ err = ret;
}
+
goto done;
}
@@ -3846,19 +3844,34 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
*/
if (tmp.type == BTRFS_DIR_INDEX_KEY)
last_old_dentry_offset = tmp.offset;
+ } else if (ret < 0) {
+ err = ret;
+ goto done;
}
+
btrfs_release_path(path);
/*
- * Find the first key from this transaction again. See the note for
- * log_new_dir_dentries, if we're logging a directory recursively we
- * won't be holding its i_mutex, which means we can modify the directory
- * while we're logging it. If we remove an entry between our first
- * search and this search we'll not find the key again and can just
- * bail.
+ * Find the first key from this transaction again or the one we were at
+ * in the loop below in case we had to reschedule. We may be logging the
+ * directory without holding its VFS lock, which happen when logging new
+ * dentries (through log_new_dir_dentries()) or in some cases when we
+ * need to log the parent directory of an inode. This means a dir index
+ * key might be deleted from the inode's root, and therefore we may not
+ * find it anymore. If we can't find it, just move to the next key. We
+ * can not bail out and ignore, because if we do that we will simply
+ * not log dir index keys that come after the one that was just deleted
+ * and we can end up logging a dir index range that ends at (u64)-1
+ * (@last_offset is initialized to that), resulting in removing dir
+ * entries we should not remove at log replay time.
*/
search:
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+ if (ret > 0)
+ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ err = ret;
+ /* If ret is 1, there are no more keys in the inode's root. */
if (ret != 0)
goto done;
@@ -5580,8 +5593,10 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
* LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
* commits.
*/
- if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
+ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) {
+ btrfs_set_log_full_commit(trans);
return BTRFS_LOG_FORCE_COMMIT;
+ }
inode = btrfs_iget(root->fs_info->sb, ino, root);
/*
@@ -7459,8 +7474,11 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* not fail, but if it does, it's not serious, just bail out and
* mark the log for a full commit.
*/
- if (WARN_ON_ONCE(ret < 0))
+ if (WARN_ON_ONCE(ret < 0)) {
+ fscrypt_free_filename(&fname);
goto out;
+ }
+
log_pinned = true;
path = btrfs_alloc_path();
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index aa25fa335d3e..bcfef75b97da 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -768,8 +768,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
error = lookup_bdev(path, &path_devt);
- if (error)
+ if (error) {
+ btrfs_err(NULL, "failed to lookup block device for path %s: %d",
+ path, error);
return ERR_PTR(error);
+ }
if (fsid_change_in_progress) {
if (!has_metadata_uuid)
@@ -836,6 +839,9 @@ static noinline struct btrfs_device *device_list_add(const char *path,
unsigned int nofs_flag;
if (fs_devices->opened) {
+ btrfs_err(NULL,
+ "device %s belongs to fsid %pU, and the fs is already mounted",
+ path, fs_devices->fsid);
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-EBUSY);
}
@@ -905,6 +911,9 @@ static noinline struct btrfs_device *device_list_add(const char *path,
* generation are equal.
*/
mutex_unlock(&fs_devices->device_list_mutex);
+ btrfs_err(NULL,
+"device %s already registered with a higher generation, found %llu expect %llu",
+ path, found_transid, device->generation);
return ERR_PTR(-EEXIST);
}
@@ -2005,42 +2014,42 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
return num_devices;
}
+static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
+ struct block_device *bdev, int copy_num)
+{
+ struct btrfs_super_block *disk_super;
+ const size_t len = sizeof(disk_super->magic);
+ const u64 bytenr = btrfs_sb_offset(copy_num);
+ int ret;
+
+ disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr);
+ if (IS_ERR(disk_super))
+ return;
+
+ memset(&disk_super->magic, 0, len);
+ folio_mark_dirty(virt_to_folio(disk_super));
+ btrfs_release_disk_super(disk_super);
+
+ ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1);
+ if (ret)
+ btrfs_warn(fs_info, "error clearing superblock number %d (%d)",
+ copy_num, ret);
+}
+
void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
struct block_device *bdev,
const char *device_path)
{
- struct btrfs_super_block *disk_super;
int copy_num;
if (!bdev)
return;
for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
- struct page *page;
- int ret;
-
- disk_super = btrfs_read_dev_one_super(bdev, copy_num, false);
- if (IS_ERR(disk_super))
- continue;
-
- if (bdev_is_zoned(bdev)) {
+ if (bdev_is_zoned(bdev))
btrfs_reset_sb_log_zones(bdev, copy_num);
- continue;
- }
-
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
-
- page = virt_to_page(disk_super);
- set_page_dirty(page);
- lock_page(page);
- /* write_on_page() unlocks the page */
- ret = write_one_page(page);
- if (ret)
- btrfs_warn(fs_info,
- "error clearing superblock number %d (%d)",
- copy_num, ret);
- btrfs_release_disk_super(disk_super);
-
+ else
+ btrfs_scratch_superblock(fs_info, bdev, copy_num);
}
/* Notify udev that device has changed */
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index a759668477bb..1f503e8e42d4 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -539,6 +539,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
}
atomic_set(&zone_info->active_zones_left,
max_active_zones - nactive);
+ /* Overcommit does not work well with active zone tacking. */
+ set_bit(BTRFS_FS_NO_OVERCOMMIT, &fs_info->flags);
}
/* Validate superblock log */
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 61f47debec5a..8c74871e37c9 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1367,7 +1367,7 @@ out:
folio_put(folio);
if (check_cap)
- ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY);
return copied;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index e54814d0c2f7..f75ad432f375 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1898,8 +1898,7 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci)
* CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
* further delay.
*/
-void ceph_check_caps(struct ceph_inode_info *ci, int flags,
- struct ceph_mds_session *session)
+void ceph_check_caps(struct ceph_inode_info *ci, int flags)
{
struct inode *inode = &ci->netfs.inode;
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
@@ -1913,15 +1912,14 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
bool queue_invalidate = false;
bool tried_invalidate = false;
bool queue_writeback = false;
-
- if (session)
- ceph_get_mds_session(session);
+ struct ceph_mds_session *session = NULL;
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
+ ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS;
+
/* Don't send messages until we get async create reply */
spin_unlock(&ci->i_ceph_lock);
- ceph_put_mds_session(session);
return;
}
@@ -2851,7 +2849,7 @@ static void check_max_size(struct inode *inode, loff_t endoff)
check = 1;
spin_unlock(&ci->i_ceph_lock);
if (check)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY);
}
static inline int get_used_fmode(int caps)
@@ -2915,7 +2913,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got
while (true) {
flags &= CEPH_FILE_MODE_MASK;
- if (atomic_read(&fi->num_locks))
+ if (vfs_inode_has_locks(inode))
flags |= CHECK_FILELOCK;
_got = 0;
ret = try_get_cap_refs(inode, need, want, endoff,
@@ -3140,7 +3138,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
switch (mode) {
case PUT_CAP_REFS_SYNC:
if (last)
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
else if (flushsnaps)
ceph_flush_snaps(ci, NULL);
break;
@@ -3255,7 +3253,7 @@ unlock:
spin_unlock(&ci->i_ceph_lock);
if (last) {
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
} else if (flush_snaps) {
ceph_flush_snaps(ci, NULL);
}
@@ -3604,10 +3602,9 @@ static void handle_cap_grant(struct inode *inode,
mutex_unlock(&session->s_mutex);
if (check_caps == 1)
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
- session);
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL);
else if (check_caps == 2)
- ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
+ ceph_check_caps(ci, CHECK_CAPS_NOINVAL);
}
/*
@@ -4333,7 +4330,7 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
if (inode) {
spin_unlock(&mdsc->cap_delay_lock);
dout("check_delayed_caps on %p\n", inode);
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
iput(inode);
spin_lock(&mdsc->cap_delay_lock);
}
@@ -4362,7 +4359,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode));
spin_unlock(&mdsc->cap_dirty_lock);
ceph_wait_on_async_create(inode);
- ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
iput(inode);
spin_lock(&mdsc->cap_dirty_lock);
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6f9580defb2b..764598e1efd9 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -313,7 +313,7 @@ int ceph_renew_caps(struct inode *inode, int fmode)
spin_unlock(&ci->i_ceph_lock);
dout("renew caps %p want %s issued %s updating mds_wanted\n",
inode, ceph_cap_string(wanted), ceph_cap_string(issued));
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
return 0;
}
spin_unlock(&ci->i_ceph_lock);
@@ -408,7 +408,7 @@ int ceph_open(struct inode *inode, struct file *file)
if ((issued & wanted) != wanted &&
(mds_wanted & wanted) != wanted &&
ceph_snap(inode) != CEPH_SNAPDIR)
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
return ceph_init_file(inode, file, fmode);
} else if (ceph_snap(inode) != CEPH_NOSNAP &&
@@ -534,14 +534,23 @@ static void wake_async_create_waiters(struct inode *inode,
struct ceph_mds_session *session)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ bool check_cap = false;
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
+
+ if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) {
+ ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS;
+ check_cap = true;
+ }
}
ceph_kick_flushing_inode_caps(session, ci);
spin_unlock(&ci->i_ceph_lock);
+
+ if (check_cap)
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
}
static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
@@ -1092,7 +1101,7 @@ static void ceph_aio_complete(struct inode *inode,
loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
if (endoff > i_size_read(inode)) {
if (ceph_inode_set_size(inode, endoff))
- ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY);
}
spin_lock(&ci->i_ceph_lock);
@@ -1421,8 +1430,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
if (write && pos > size) {
if (ceph_inode_set_size(inode, pos))
ceph_check_caps(ceph_inode(inode),
- CHECK_CAPS_AUTHONLY,
- NULL);
+ CHECK_CAPS_AUTHONLY);
}
}
@@ -1577,8 +1585,7 @@ out:
check_caps = ceph_inode_set_size(inode, pos);
if (check_caps)
ceph_check_caps(ceph_inode(inode),
- CHECK_CAPS_AUTHONLY,
- NULL);
+ CHECK_CAPS_AUTHONLY);
}
}
@@ -1906,7 +1913,7 @@ retry_snap:
if (dirty)
__mark_inode_dirty(inode, dirty);
if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos))
- ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
@@ -2521,8 +2528,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
/* Let the MDS know about dst file size change */
if (ceph_inode_set_size(dst_inode, dst_off) ||
ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
- ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH,
- NULL);
+ ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH);
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f23c5a6edc6f..23d05ec87fcc 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1909,7 +1909,7 @@ static void ceph_do_invalidate_pages(struct inode *inode)
mutex_unlock(&ci->i_truncate_mutex);
out:
if (check)
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
}
/*
@@ -1969,7 +1969,7 @@ retry:
mutex_unlock(&ci->i_truncate_mutex);
if (wrbuffer_refs == 0)
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
wake_up_all(&ci->i_cap_wq);
}
@@ -1991,7 +1991,7 @@ static void ceph_inode_work(struct work_struct *work)
__ceph_do_pending_vmtruncate(inode);
if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask))
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask))
ceph_flush_snaps(ci, NULL);
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 6e061bf62ad4..deac817647eb 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -253,7 +253,7 @@ static long ceph_ioctl_lazyio(struct file *file)
spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
- ceph_check_caps(ci, 0, NULL);
+ ceph_check_caps(ci, 0);
} else {
dout("ioctl_layzio: file %p already lazy\n", file);
}
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index f3b461c708a8..9c8dc8a55e7e 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -32,24 +32,36 @@ void __init ceph_flock_init(void)
static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
{
- struct ceph_file_info *fi = dst->fl_file->private_data;
struct inode *inode = file_inode(dst->fl_file);
atomic_inc(&ceph_inode(inode)->i_filelock_ref);
- atomic_inc(&fi->num_locks);
+ dst->fl_u.ceph.inode = igrab(inode);
}
+/*
+ * Do not use the 'fl->fl_file' in release function, which
+ * is possibly already released by another thread.
+ */
static void ceph_fl_release_lock(struct file_lock *fl)
{
- struct ceph_file_info *fi = fl->fl_file->private_data;
- struct inode *inode = file_inode(fl->fl_file);
- struct ceph_inode_info *ci = ceph_inode(inode);
- atomic_dec(&fi->num_locks);
+ struct inode *inode = fl->fl_u.ceph.inode;
+ struct ceph_inode_info *ci;
+
+ /*
+ * If inode is NULL it should be a request file_lock,
+ * nothing we can do.
+ */
+ if (!inode)
+ return;
+
+ ci = ceph_inode(inode);
if (atomic_dec_and_test(&ci->i_filelock_ref)) {
/* clear error when all locks are released */
spin_lock(&ci->i_ceph_lock);
ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK;
spin_unlock(&ci->i_ceph_lock);
}
+ fl->fl_u.ceph.inode = NULL;
+ iput(inode);
}
static const struct file_lock_operations ceph_fl_lock_ops = {
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 50e57a1fa32f..0ed3be75bb9a 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -593,6 +593,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */
#define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT)
#define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */
+#define CEPH_I_ASYNC_CHECK_CAPS (1 << 14) /* check caps immediately after async
+ creating finishes */
/*
* Masks of ceph inode work.
@@ -788,7 +790,6 @@ struct ceph_file_info {
struct list_head rw_contexts;
u32 filp_gen;
- atomic_t num_locks;
};
struct ceph_dir_file_info {
@@ -1200,8 +1201,7 @@ extern void ceph_remove_capsnap(struct inode *inode,
extern void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession);
extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
-extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
- struct ceph_mds_session *session);
+extern void ceph_check_caps(struct ceph_inode_info *ci, int flags);
extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
extern int ceph_drop_caps_for_unlink(struct inode *inode);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index ba0ded7842a7..13deb45f1ec6 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -483,17 +483,24 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count)
p->dev = dev;
p->count = count;
- if (WARN_ON(dev == WHITEOUT_DEV))
- return -EBUSY;
+ if (WARN_ON(dev == WHITEOUT_DEV)) {
+ error = -EBUSY;
+ goto err;
+ }
error = kobj_map(cdev_map, dev, count, NULL,
exact_match, exact_lock, p);
if (error)
- return error;
+ goto err;
kobject_get(p->kobj.parent);
return 0;
+
+err:
+ kfree_const(p->kobj.name);
+ p->kobj.name = NULL;
+ return error;
}
/**
@@ -547,7 +554,7 @@ int cdev_device_add(struct cdev *cdev, struct device *dev)
}
rc = device_add(dev);
- if (rc)
+ if (rc && dev->devt)
cdev_del(cdev);
return rc;
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 7c9785973f49..304a7f6cc13a 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -21,7 +21,7 @@ cifs-$(CONFIG_CIFS_XATTR) += xattr.o
cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
-cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o
+cifs-$(CONFIG_CIFS_DFS_UPCALL) += cifs_dfs_ref.o dfs_cache.o dfs.o
cifs-$(CONFIG_CIFS_SWN_UPCALL) += netlink.o cifs_swn.o
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 90850da390ae..56b23def4c95 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -372,6 +372,14 @@ skip_rdma:
seq_printf(m, "\nIn Send: %d In MaxReq Wait: %d",
atomic_read(&server->in_send),
atomic_read(&server->num_waiters));
+ if (IS_ENABLED(CONFIG_CIFS_DFS_UPCALL)) {
+ if (server->origin_fullpath)
+ seq_printf(m, "\nDFS origin full path: %s",
+ server->origin_fullpath);
+ if (server->leaf_fullpath)
+ seq_printf(m, "\nDFS leaf full path: %s",
+ server->leaf_fullpath);
+ }
seq_printf(m, "\n\n\tSessions: ");
i = 0;
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index b0864da9ef43..2b1a8d55b4ec 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -21,8 +21,7 @@
#include "cifsfs.h"
#include "dns_resolve.h"
#include "cifs_debug.h"
-#include "cifs_unicode.h"
-#include "dfs_cache.h"
+#include "dfs.h"
#include "fs_context.h"
static LIST_HEAD(cifs_dfs_automount_list);
@@ -60,7 +59,7 @@ void cifs_dfs_release_automount_timer(void)
* Returns pointer to the built string, or a ERR_PTR. Caller is responsible
* for freeing the returned string.
*/
-static char *
+char *
cifs_build_devname(char *nodename, const char *prepath)
{
size_t pplen;
@@ -119,200 +118,34 @@ cifs_build_devname(char *nodename, const char *prepath)
return dev;
}
-
-/**
- * cifs_compose_mount_options - creates mount options for referral
- * @sb_mountdata: parent/root DFS mount options (template)
- * @fullpath: full path in UNC format
- * @ref: optional server's referral
- * @devname: return the built cifs device name if passed pointer not NULL
- * creates mount options for submount based on template options sb_mountdata
- * and replacing unc,ip,prefixpath options with ones we've got form ref_unc.
- *
- * Returns: pointer to new mount options or ERR_PTR.
- * Caller is responsible for freeing returned value if it is not error.
- */
-char *cifs_compose_mount_options(const char *sb_mountdata,
- const char *fullpath,
- const struct dfs_info3_param *ref,
- char **devname)
+static int set_dest_addr(struct smb3_fs_context *ctx, const char *full_path)
{
+ struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
int rc;
- char *name;
- char *mountdata = NULL;
- const char *prepath = NULL;
- int md_len;
- char *tkn_e;
- char *srvIP = NULL;
- char sep = ',';
- int off, noff;
-
- if (sb_mountdata == NULL)
- return ERR_PTR(-EINVAL);
-
- if (ref) {
- if (WARN_ON_ONCE(!ref->node_name || ref->path_consumed < 0))
- return ERR_PTR(-EINVAL);
-
- if (strlen(fullpath) - ref->path_consumed) {
- prepath = fullpath + ref->path_consumed;
- /* skip initial delimiter */
- if (*prepath == '/' || *prepath == '\\')
- prepath++;
- }
-
- name = cifs_build_devname(ref->node_name, prepath);
- if (IS_ERR(name)) {
- rc = PTR_ERR(name);
- name = NULL;
- goto compose_mount_options_err;
- }
- } else {
- name = cifs_build_devname((char *)fullpath, NULL);
- if (IS_ERR(name)) {
- rc = PTR_ERR(name);
- name = NULL;
- goto compose_mount_options_err;
- }
- }
-
- rc = dns_resolve_server_name_to_ip(name, &srvIP, NULL);
- if (rc < 0) {
- cifs_dbg(FYI, "%s: Failed to resolve server part of %s to IP: %d\n",
- __func__, name, rc);
- goto compose_mount_options_err;
- }
-
- /*
- * In most cases, we'll be building a shorter string than the original,
- * but we do have to assume that the address in the ip= option may be
- * much longer than the original. Add the max length of an address
- * string to the length of the original string to allow for worst case.
- */
- md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN;
- mountdata = kzalloc(md_len + sizeof("ip=") + 1, GFP_KERNEL);
- if (mountdata == NULL) {
- rc = -ENOMEM;
- goto compose_mount_options_err;
- }
-
- /* copy all options except of unc,ip,prefixpath */
- off = 0;
- if (strncmp(sb_mountdata, "sep=", 4) == 0) {
- sep = sb_mountdata[4];
- strncpy(mountdata, sb_mountdata, 5);
- off += 5;
- }
-
- do {
- tkn_e = strchr(sb_mountdata + off, sep);
- if (tkn_e == NULL)
- noff = strlen(sb_mountdata + off);
- else
- noff = tkn_e - (sb_mountdata + off) + 1;
-
- if (strncasecmp(sb_mountdata + off, "cruid=", 6) == 0) {
- off += noff;
- continue;
- }
- if (strncasecmp(sb_mountdata + off, "unc=", 4) == 0) {
- off += noff;
- continue;
- }
- if (strncasecmp(sb_mountdata + off, "ip=", 3) == 0) {
- off += noff;
- continue;
- }
- if (strncasecmp(sb_mountdata + off, "prefixpath=", 11) == 0) {
- off += noff;
- continue;
- }
- strncat(mountdata, sb_mountdata + off, noff);
- off += noff;
- } while (tkn_e);
- strcat(mountdata, sb_mountdata + off);
- mountdata[md_len] = '\0';
-
- /* copy new IP and ref share name */
- if (mountdata[strlen(mountdata) - 1] != sep)
- strncat(mountdata, &sep, 1);
- strcat(mountdata, "ip=");
- strcat(mountdata, srvIP);
-
- if (devname)
- *devname = name;
- else
- kfree(name);
-
- /*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/
- /*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/
-compose_mount_options_out:
- kfree(srvIP);
- return mountdata;
-
-compose_mount_options_err:
- kfree(mountdata);
- mountdata = ERR_PTR(rc);
- kfree(name);
- goto compose_mount_options_out;
-}
-
-/**
- * cifs_dfs_do_mount - mounts specified path using DFS full path
- *
- * Always pass down @fullpath to smb3_do_mount() so we can use the root server
- * to perform failover in case we failed to connect to the first target in the
- * referral.
- *
- * @mntpt: directory entry for the path we are trying to automount
- * @cifs_sb: parent/root superblock
- * @fullpath: full path in UNC format
- */
-static struct vfsmount *cifs_dfs_do_mount(struct dentry *mntpt,
- struct cifs_sb_info *cifs_sb,
- const char *fullpath)
-{
- struct vfsmount *mnt;
- char *mountdata;
- char *devname;
-
- devname = kstrdup(fullpath, GFP_KERNEL);
- if (!devname)
- return ERR_PTR(-ENOMEM);
-
- convert_delimiter(devname, '/');
-
- /* TODO: change to call fs_context_for_mount(), fill in context directly, call fc_mount */
-
- /* See afs_mntpt_do_automount in fs/afs/mntpt.c for an example */
-
- /* strip first '\' from fullpath */
- mountdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options,
- fullpath + 1, NULL, NULL);
- if (IS_ERR(mountdata)) {
- kfree(devname);
- return (struct vfsmount *)mountdata;
- }
-
- mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata);
- kfree(mountdata);
- kfree(devname);
- return mnt;
+ rc = dns_resolve_server_name_to_ip(full_path, addr, NULL);
+ if (!rc)
+ cifs_set_port(addr, ctx->port);
+ return rc;
}
/*
* Create a vfsmount that we can automount
*/
-static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
+static struct vfsmount *cifs_dfs_do_automount(struct path *path)
{
+ int rc;
+ struct dentry *mntpt = path->dentry;
+ struct fs_context *fc;
struct cifs_sb_info *cifs_sb;
- void *page;
+ void *page = NULL;
+ struct smb3_fs_context *ctx, *cur_ctx;
+ struct smb3_fs_context tmp;
char *full_path;
struct vfsmount *mnt;
- cifs_dbg(FYI, "in %s\n", __func__);
- BUG_ON(IS_ROOT(mntpt));
+ if (IS_ROOT(mntpt))
+ return ERR_PTR(-ESTALE);
/*
* The MSDFS spec states that paths in DFS referral requests and
@@ -321,29 +154,53 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
* gives us the latter, so we must adjust the result.
*/
cifs_sb = CIFS_SB(mntpt->d_sb);
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) {
- mnt = ERR_PTR(-EREMOTE);
- goto cdda_exit;
- }
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)
+ return ERR_PTR(-EREMOTE);
+
+ cur_ctx = cifs_sb->ctx;
+
+ fc = fs_context_for_submount(path->mnt->mnt_sb->s_type, mntpt);
+ if (IS_ERR(fc))
+ return ERR_CAST(fc);
+
+ ctx = smb3_fc2context(fc);
page = alloc_dentry_path();
- /* always use tree name prefix */
- full_path = build_path_from_dentry_optional_prefix(mntpt, page, true);
+ full_path = dfs_get_automount_devname(mntpt, page);
if (IS_ERR(full_path)) {
mnt = ERR_CAST(full_path);
- goto free_full_path;
+ goto out;
}
- convert_delimiter(full_path, '\\');
+ convert_delimiter(full_path, '/');
cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path);
- mnt = cifs_dfs_do_mount(mntpt, cifs_sb, full_path);
- cifs_dbg(FYI, "%s: cifs_dfs_do_mount:%s , mnt:%p\n", __func__, full_path + 1, mnt);
+ tmp = *cur_ctx;
+ tmp.source = full_path;
+ tmp.leaf_fullpath = NULL;
+ tmp.UNC = tmp.prepath = NULL;
+
+ rc = smb3_fs_context_dup(ctx, &tmp);
+ if (rc) {
+ mnt = ERR_PTR(rc);
+ goto out;
+ }
+
+ rc = set_dest_addr(ctx, full_path);
+ if (rc) {
+ mnt = ERR_PTR(rc);
+ goto out;
+ }
+
+ rc = smb3_parse_devname(full_path, ctx);
+ if (!rc)
+ mnt = fc_mount(fc);
+ else
+ mnt = ERR_PTR(rc);
-free_full_path:
+out:
+ put_fs_context(fc);
free_dentry_path(page);
-cdda_exit:
- cifs_dbg(FYI, "leaving %s\n" , __func__);
return mnt;
}
@@ -354,9 +211,9 @@ struct vfsmount *cifs_dfs_d_automount(struct path *path)
{
struct vfsmount *newmnt;
- cifs_dbg(FYI, "in %s\n", __func__);
+ cifs_dbg(FYI, "%s: %pd\n", __func__, path->dentry);
- newmnt = cifs_dfs_do_automount(path->dentry);
+ newmnt = cifs_dfs_do_automount(path);
if (IS_ERR(newmnt)) {
cifs_dbg(FYI, "leaving %s [automount failed]\n" , __func__);
return newmnt;
diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h
index d86d78d5bfdc..332588e77c31 100644
--- a/fs/cifs/cifs_ioctl.h
+++ b/fs/cifs/cifs_ioctl.h
@@ -108,7 +108,7 @@ struct smb3_notify_info {
#define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify)
#define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info)
#define CIFS_IOC_NOTIFY_INFO _IOWR(CIFS_IOCTL_MAGIC, 11, struct smb3_notify_info)
-#define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32)
+#define CIFS_IOC_SHUTDOWN _IOR('X', 125, __u32)
/*
* Flags for going down operation
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 342717bf1dc2..6f3285f1dfee 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -189,7 +189,7 @@ init_cifs_spnego(void)
* spnego upcalls.
*/
- cred = prepare_kernel_cred(NULL);
+ cred = prepare_kernel_cred(&init_task);
if (!cred)
return -ENOMEM;
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index c647f0d56518..bbf58c2439da 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -470,7 +470,7 @@ init_cifs_idmap(void)
* this is used to prevent malicious redirections from being installed
* with add_key().
*/
- cred = prepare_kernel_cred(NULL);
+ cred = prepare_kernel_cred(&init_task);
if (!cred)
return -ENOMEM;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 5db73c0f792a..cbc18b4a9cb2 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -278,6 +278,7 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
* ( for NTLMSSP_AV_NB_DOMAIN_NAME followed by NTLMSSP_AV_EOL ) +
* unicode length of a netbios domain name
*/
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.len = size + 2 * dlen;
ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
if (!ses->auth_key.response) {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 040267ed8a64..10e00c624922 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -678,9 +678,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",echo_interval=%lu",
tcon->ses->server->echo_interval / HZ);
- /* Only display max_credits if it was overridden on mount */
+ /* Only display the following if overridden on mount */
if (tcon->ses->server->max_credits != SMB2_MAX_CREDITS_AVAILABLE)
seq_printf(s, ",max_credits=%u", tcon->ses->server->max_credits);
+ if (tcon->ses->server->tcp_nodelay)
+ seq_puts(s, ",tcpnodelay");
+ if (tcon->ses->server->noautotune)
+ seq_puts(s, ",noautotune");
+ if (tcon->ses->server->noblocksnd)
+ seq_puts(s, ",noblocksend");
if (tcon->snapshot_time)
seq_printf(s, ",snapshot=%llu", tcon->snapshot_time);
@@ -890,12 +896,6 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
goto out;
}
- rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, NULL);
- if (rc) {
- root = ERR_PTR(rc);
- goto out;
- }
-
rc = cifs_setup_cifs_sb(cifs_sb);
if (rc) {
root = ERR_PTR(rc);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 388b745a978e..63a0ac2b9355 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -105,8 +105,8 @@ extern int cifs_lock(struct file *, int, struct file_lock *);
extern int cifs_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_flush(struct file *, fl_owner_t id);
-extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
-extern int cifs_file_strict_mmap(struct file * , struct vm_area_struct *);
+extern int cifs_file_mmap(struct file *file, struct vm_area_struct *vma);
+extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma);
extern const struct file_operations cifs_dir_ops;
extern int cifs_dir_open(struct inode *inode, struct file *file);
extern int cifs_readdir(struct file *file, struct dir_context *ctx);
@@ -153,6 +153,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 40
-#define CIFS_VERSION "2.40"
+#define SMB3_PRODUCT_BUILD 41
+#define CIFS_VERSION "2.41"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 1420acf987f0..cfdd5bf701a1 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -13,6 +13,8 @@
#include <linux/in6.h>
#include <linux/inet.h>
#include <linux/slab.h>
+#include <linux/scatterlist.h>
+#include <linux/mm.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/utsname.h>
@@ -21,7 +23,6 @@
#include "cifs_fs_sb.h"
#include "cifsacl.h"
#include <crypto/internal/hash.h>
-#include <linux/scatterlist.h>
#include <uapi/linux/cifs/cifs_mount.h>
#include "../smbfs_common/smb2pdu.h"
#include "smb2pdu.h"
@@ -106,6 +107,8 @@
#define CIFS_MAX_WORKSTATION_LEN (__NEW_UTS_LEN + 1) /* reasonable max for client */
+#define CIFS_DFS_ROOT_SES(ses) ((ses)->dfs_root_ses ?: (ses))
+
/*
* CIFS vfs client Status information (based on what we know.)
*/
@@ -737,8 +740,6 @@ struct TCP_Server_Info {
bool use_swn_dstaddr;
struct sockaddr_storage swn_dstaddr;
#endif
-#ifdef CONFIG_CIFS_DFS_UPCALL
- bool is_dfs_conn; /* if a dfs connection */
struct mutex refpath_lock; /* protects leaf_fullpath */
/*
* Canonical DFS full paths that were used to chase referrals in mount and reconnect.
@@ -752,7 +753,6 @@ struct TCP_Server_Info {
* format: \\HOST\SHARE\[OPTIONAL PATH]
*/
char *origin_fullpath, *leaf_fullpath, *current_fullpath;
-#endif
};
static inline bool is_smb1(struct TCP_Server_Info *server)
@@ -785,6 +785,7 @@ static inline unsigned int
in_flight(struct TCP_Server_Info *server)
{
unsigned int num;
+
spin_lock(&server->req_lock);
num = server->in_flight;
spin_unlock(&server->req_lock);
@@ -795,6 +796,7 @@ static inline bool
has_credits(struct TCP_Server_Info *server, int *credits, int num_credits)
{
int num;
+
spin_lock(&server->req_lock);
num = *credits;
spin_unlock(&server->req_lock);
@@ -1025,7 +1027,7 @@ struct cifs_ses {
struct TCP_Server_Info *server; /* pointer to server info */
int ses_count; /* reference counter */
enum ses_status_enum ses_status; /* updates protected by cifs_tcp_ses_lock */
- unsigned overrideSecFlg; /* if non-zero override global sec flags */
+ unsigned int overrideSecFlg; /* if non-zero override global sec flags */
char *serverOS; /* name of operating system underlying server */
char *serverNOS; /* name of network operating system of server */
char *serverDomain; /* security realm of server */
@@ -1099,6 +1101,7 @@ struct cifs_ses {
*/
unsigned long chans_need_reconnect;
/* ========= end: protected by chan_lock ======== */
+ struct cifs_ses *dfs_root_ses;
};
static inline bool
@@ -1381,7 +1384,7 @@ struct cifsFileInfo {
__u32 pid; /* process id who opened file */
struct cifs_fid fid; /* file id from remote */
struct list_head rlist; /* reconnect list */
- /* BB add lock scope info here if needed */ ;
+ /* BB add lock scope info here if needed */
/* lock scope id (0 if none) */
struct dentry *dentry;
struct tcon_link *tlink;
@@ -1757,6 +1760,18 @@ struct file_list {
struct cifsFileInfo *cfile;
};
+struct cifs_mount_ctx {
+ struct cifs_sb_info *cifs_sb;
+ struct smb3_fs_context *fs_ctx;
+ unsigned int xid;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+ struct cifs_ses *root_ses;
+ uuid_t mount_id;
+ char *origin_fullpath, *leaf_fullpath;
+};
+
static inline void free_dfs_info_param(struct dfs_info3_param *param)
{
if (param) {
@@ -1769,6 +1784,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
int number_of_items)
{
int i;
+
if ((number_of_items == 0) || (param == NULL))
return;
for (i = 0; i < number_of_items; i++) {
@@ -2137,4 +2153,70 @@ static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const
dst->FileNameLength = src->FileNameLength;
}
+static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst,
+ int num_rqst,
+ const u8 *sig)
+{
+ unsigned int len, skip;
+ unsigned int nents = 0;
+ unsigned long addr;
+ int i, j;
+
+ /* Assumes the first rqst has a transform header as the first iov.
+ * I.e.
+ * rqst[0].rq_iov[0] is transform header
+ * rqst[0].rq_iov[1+] data to be encrypted/decrypted
+ * rqst[1+].rq_iov[0+] data to be encrypted/decrypted
+ */
+ for (i = 0; i < num_rqst; i++) {
+ /*
+ * The first rqst has a transform header where the
+ * first 20 bytes are not part of the encrypted blob.
+ */
+ for (j = 0; j < rqst[i].rq_nvec; j++) {
+ struct kvec *iov = &rqst[i].rq_iov[j];
+
+ skip = (i == 0) && (j == 0) ? 20 : 0;
+ addr = (unsigned long)iov->iov_base + skip;
+ if (unlikely(is_vmalloc_addr((void *)addr))) {
+ len = iov->iov_len - skip;
+ nents += DIV_ROUND_UP(offset_in_page(addr) + len,
+ PAGE_SIZE);
+ } else {
+ nents++;
+ }
+ }
+ nents += rqst[i].rq_npages;
+ }
+ nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE);
+ return nents;
+}
+
+/* We can not use the normal sg_set_buf() as we will sometimes pass a
+ * stack object as buf.
+ */
+static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg,
+ const void *buf,
+ unsigned int buflen)
+{
+ unsigned long addr = (unsigned long)buf;
+ unsigned int off = offset_in_page(addr);
+
+ addr &= PAGE_MASK;
+ if (unlikely(is_vmalloc_addr((void *)addr))) {
+ do {
+ unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off);
+
+ sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off);
+
+ off = 0;
+ addr += PAGE_SIZE;
+ buflen -= len;
+ } while (buflen);
+ } else {
+ sg_set_page(sg++, virt_to_page(addr), buflen, off);
+ }
+ return sg;
+}
+
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index d1abaeea974a..623caece2b10 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1429,7 +1429,7 @@ typedef struct smb_com_transaction_change_notify_req {
__u8 WatchTree; /* 1 = Monitor subdirectories */
__u8 Reserved2;
__le16 ByteCount;
-/* __u8 Pad[3];*/
+/* __u8 Pad[3];*/
/* __u8 Data[1];*/
} __attribute__((packed)) TRANSACT_CHANGE_NOTIFY_REQ;
@@ -1752,8 +1752,7 @@ struct smb_com_transaction2_sfi_rsp {
struct smb_hdr hdr; /* wct = 10 + SetupCount */
struct trans2_resp t2;
__u16 ByteCount;
- __u16 Reserved2; /* parameter word reserved -
- present for infolevels > 100 */
+ __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */
} __attribute__((packed));
struct smb_t2_qfi_req {
@@ -1768,8 +1767,7 @@ struct smb_t2_qfi_rsp {
struct smb_hdr hdr; /* wct = 10 + SetupCount */
struct trans2_resp t2;
__u16 ByteCount;
- __u16 Reserved2; /* parameter word reserved -
- present for infolevels > 100 */
+ __u16 Reserved2; /* parameter word reserved - present for infolevels > 100 */
} __attribute__((packed));
/*
@@ -2146,13 +2144,11 @@ typedef struct {
#define CIFS_UNIX_POSIX_PATH_OPS_CAP 0x00000020 /* Allow new POSIX path based
calls including posix open
and posix unlink */
-#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up
- to 0xFFFF00 */
+#define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up to 0xFFFF00 */
#define CIFS_UNIX_LARGE_WRITE_CAP 0x00000080
#define CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP 0x00000100 /* can do SPNEGO crypt */
#define CIFS_UNIX_TRANSPORT_ENCRYPTION_MANDATORY_CAP 0x00000200 /* must do */
-#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and
- QFS PROXY call */
+#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and QFS PROXY call */
#ifdef CONFIG_CIFS_POSIX
/* presumably don't need the 0x20 POSIX_PATH_OPS_CAP since we never send
LockingX instead of posix locking call on unix sess (and we do not expect
@@ -2368,8 +2364,7 @@ typedef struct {
struct file_allocation_info {
__le64 AllocationSize; /* Note old Samba srvr rounds this up too much */
-} __attribute__((packed)); /* size used on disk, for level 0x103 for set,
- 0x105 for query */
+} __packed; /* size used on disk, for level 0x103 for set, 0x105 for query */
struct file_end_of_file_info {
__le64 FileSize; /* offset to end of file */
@@ -2409,8 +2404,7 @@ struct cifs_posix_acl { /* access conrol list (ACL) */
__le16 access_entry_count; /* access ACL - count of entries */
__le16 default_entry_count; /* default ACL - count of entries */
struct cifs_posix_ace ace_array[];
- /* followed by
- struct cifs_posix_ace default_ace_arraay[] */
+ /* followed by struct cifs_posix_ace default_ace_array[] */
} __attribute__((packed)); /* level 0x204 */
/* types of access control entries already defined in posix_acl.h */
@@ -2429,17 +2423,17 @@ struct cifs_posix_acl { /* access conrol list (ACL) */
/* end of POSIX ACL definitions */
/* POSIX Open Flags */
-#define SMB_O_RDONLY 0x1
-#define SMB_O_WRONLY 0x2
-#define SMB_O_RDWR 0x4
-#define SMB_O_CREAT 0x10
-#define SMB_O_EXCL 0x20
-#define SMB_O_TRUNC 0x40
-#define SMB_O_APPEND 0x80
-#define SMB_O_SYNC 0x100
-#define SMB_O_DIRECTORY 0x200
-#define SMB_O_NOFOLLOW 0x400
-#define SMB_O_DIRECT 0x800
+#define SMB_O_RDONLY 0x1
+#define SMB_O_WRONLY 0x2
+#define SMB_O_RDWR 0x4
+#define SMB_O_CREAT 0x10
+#define SMB_O_EXCL 0x20
+#define SMB_O_TRUNC 0x40
+#define SMB_O_APPEND 0x80
+#define SMB_O_SYNC 0x100
+#define SMB_O_DIRECTORY 0x200
+#define SMB_O_NOFOLLOW 0x400
+#define SMB_O_DIRECT 0x800
typedef struct {
__le32 OpenFlags; /* same as NT CreateX */
@@ -2716,15 +2710,13 @@ typedef struct file_xattr_info {
__u32 xattr_value_len;
char xattr_name[];
/* followed by xattr_value[xattr_value_len], no pad */
-} __attribute__((packed)) FILE_XATTR_INFO; /* extended attribute info
- level 0x205 */
+} __packed FILE_XATTR_INFO; /* extended attribute info level 0x205 */
/* flags for lsattr and chflags commands removed arein uapi/linux/fs.h */
typedef struct file_chattr_info {
__le64 mask; /* list of all possible attribute bits */
__le64 mode; /* list of actual attribute bits on this inode */
-} __attribute__((packed)) FILE_CHATTR_INFO; /* ext attributes
- (chattr, chflags) level 0x206 */
-#endif /* POSIX */
+} __packed FILE_CHATTR_INFO; /* ext attributes (chattr, chflags) level 0x206 */
+#endif /* POSIX */
#endif /* _CIFSPDU_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index f50f96e4ec30..1207b39686fb 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -57,6 +57,9 @@ extern void exit_cifs_idmap(void);
extern int init_cifs_spnego(void);
extern void exit_cifs_spnego(void);
extern const char *build_path_from_dentry(struct dentry *, void *);
+char *__build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
+ const char *tree, int tree_len,
+ bool prefix);
extern char *build_path_from_dentry_optional_prefix(struct dentry *direntry,
void *page, bool prefix);
static inline void *alloc_dentry_path(void)
@@ -75,9 +78,7 @@ extern char *cifs_build_path_to_root(struct smb3_fs_context *ctx,
struct cifs_tcon *tcon,
int add_treename);
extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
-extern char *cifs_compose_mount_options(const char *sb_mountdata,
- const char *fullpath, const struct dfs_info3_param *ref,
- char **devname);
+char *cifs_build_devname(char *nodename, const char *prepath);
extern void delete_mid(struct mid_q_entry *mid);
extern void release_mid(struct mid_q_entry *mid);
extern void cifs_wake_up_task(struct mid_q_entry *mid);
@@ -124,7 +125,7 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
struct kvec * /* resp vec */);
extern int SendReceiveBlockingLock(const unsigned int xid,
struct cifs_tcon *ptcon,
- struct smb_hdr *in_buf ,
+ struct smb_hdr *in_buf,
struct smb_hdr *out_buf,
int *bytes_returned);
void
@@ -244,6 +245,10 @@ extern int cifs_read_page_from_socket(struct TCP_Server_Info *server,
unsigned int page_offset,
unsigned int to_read);
extern int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb);
+void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx);
+int cifs_mount_get_session(struct cifs_mount_ctx *mnt_ctx);
+int cifs_is_path_remote(struct cifs_mount_ctx *mnt_ctx);
+int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx);
extern int cifs_match_super(struct super_block *, void *);
extern int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx);
extern void cifs_umount(struct cifs_sb_info *);
@@ -561,9 +566,6 @@ extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
const struct nls_table *codepage);
-extern int
-cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname);
-
extern struct TCP_Server_Info *
cifs_find_tcp_session(struct smb3_fs_context *ctx);
@@ -604,8 +606,8 @@ int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw);
int cifs_alloc_hash(const char *name, struct shash_desc **sdesc);
void cifs_free_hash(struct shash_desc **sdesc);
-extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
- unsigned int *len, unsigned int *offset);
+void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page,
+ unsigned int *len, unsigned int *offset);
struct cifs_chan *
cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server);
int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e80252a83225..b2a04b4e89a5 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -46,6 +46,7 @@
#include "smbdirect.h"
#include "dns_resolve.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
+#include "dfs.h"
#include "dfs_cache.h"
#endif
#include "fs_context.h"
@@ -61,20 +62,6 @@ extern bool disable_legacy_dialects;
/* Drop the connection to not overload the server */
#define NUM_STATUS_IO_TIMEOUT 5
-struct mount_ctx {
- struct cifs_sb_info *cifs_sb;
- struct smb3_fs_context *fs_ctx;
- unsigned int xid;
- struct TCP_Server_Info *server;
- struct cifs_ses *ses;
- struct cifs_tcon *tcon;
-#ifdef CONFIG_CIFS_DFS_UPCALL
- struct cifs_ses *root_ses;
- uuid_t mount_id;
- char *origin_fullpath, *leaf_fullpath;
-#endif
-};
-
static int ip_connect(struct TCP_Server_Info *server);
static int generic_ip_connect(struct TCP_Server_Info *server);
static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink);
@@ -90,7 +77,8 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
{
int rc;
int len;
- char *unc, *ipaddr = NULL;
+ char *unc;
+ struct sockaddr_storage ss;
time64_t expiry, now;
unsigned long ttl = SMB_DNS_RESOLVE_INTERVAL_DEFAULT;
@@ -110,7 +98,11 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
}
scnprintf(unc, len, "\\\\%s", server->hostname);
- rc = dns_resolve_server_name_to_ip(unc, &ipaddr, &expiry);
+ spin_lock(&server->srv_lock);
+ ss = server->dstaddr;
+ spin_unlock(&server->srv_lock);
+
+ rc = dns_resolve_server_name_to_ip(unc, (struct sockaddr *)&ss, &expiry);
kfree(unc);
if (rc < 0) {
@@ -120,22 +112,13 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
}
spin_lock(&server->srv_lock);
- rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr,
- strlen(ipaddr));
+ memcpy(&server->dstaddr, &ss, sizeof(server->dstaddr));
spin_unlock(&server->srv_lock);
- kfree(ipaddr);
- /* rc == 1 means success here */
- if (rc) {
- now = ktime_get_real_seconds();
- if (expiry && expiry > now)
- /*
- * To make sure we don't use the cached entry, retry 1s
- * after expiry.
- */
- ttl = max_t(unsigned long, expiry - now, SMB_DNS_RESOLVE_INTERVAL_MIN) + 1;
- }
- rc = !rc ? -1 : 0;
+ now = ktime_get_real_seconds();
+ if (expiry && expiry > now)
+ /* To make sure we don't use the cached entry, retry 1s */
+ ttl = max_t(unsigned long, expiry - now, SMB_DNS_RESOLVE_INTERVAL_MIN) + 1;
requeue_resolve:
cifs_dbg(FYI, "%s: next dns resolution scheduled for %lu seconds in the future\n",
@@ -279,8 +262,10 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
tcon->need_reconnect = true;
tcon->status = TID_NEED_RECON;
}
- if (ses->tcon_ipc)
+ if (ses->tcon_ipc) {
ses->tcon_ipc->need_reconnect = true;
+ ses->tcon_ipc->status = TID_NEED_RECON;
+ }
next_session:
spin_unlock(&ses->chan_lock);
@@ -546,9 +531,7 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server)
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
} while (server->tcpStatus == CifsNeedReconnect);
- if (target_hint)
- dfs_cache_noreq_update_tgthint(refpath, target_hint);
-
+ dfs_cache_noreq_update_tgthint(refpath, target_hint);
dfs_cache_free_tgts(&tl);
/* Need to set up echo worker again once connection has been established */
@@ -563,16 +546,8 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server)
int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
{
- /* If tcp session is not an dfs connection, then reconnect to last target server */
- spin_lock(&server->srv_lock);
- if (!server->is_dfs_conn) {
- spin_unlock(&server->srv_lock);
- return __cifs_reconnect(server, mark_smb_session);
- }
- spin_unlock(&server->srv_lock);
-
mutex_lock(&server->refpath_lock);
- if (!server->origin_fullpath || !server->leaf_fullpath) {
+ if (!server->leaf_fullpath) {
mutex_unlock(&server->refpath_lock);
return __cifs_reconnect(server, mark_smb_session);
}
@@ -1384,9 +1359,7 @@ match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
return port == *sport;
}
-static bool
-match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
- struct sockaddr *srcaddr)
+static bool match_server_address(struct TCP_Server_Info *server, struct sockaddr *addr)
{
switch (addr->sa_family) {
case AF_INET: {
@@ -1415,9 +1388,6 @@ match_address(struct TCP_Server_Info *server, struct sockaddr *addr,
return false; /* don't expect to be here */
}
- if (!cifs_match_ipaddr(srcaddr, (struct sockaddr *)&server->srcaddr))
- return false;
-
return true;
}
@@ -1444,8 +1414,23 @@ match_security(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
return true;
}
+static bool dfs_src_pathname_equal(const char *s1, const char *s2)
+{
+ if (strlen(s1) != strlen(s2))
+ return false;
+ for (; *s1; s1++, s2++) {
+ if (*s1 == '/' || *s1 == '\\') {
+ if (*s2 != '/' && *s2 != '\\')
+ return false;
+ } else if (tolower(*s1) != tolower(*s2))
+ return false;
+ }
+ return true;
+}
+
/* this function must be called with srv_lock held */
-static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
+static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *ctx,
+ bool dfs_super_cmp)
{
struct sockaddr *addr = (struct sockaddr *)&ctx->dstaddr;
@@ -1470,15 +1455,30 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context *
if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns))
return 0;
- if (strcasecmp(server->hostname, ctx->server_hostname))
- return 0;
-
- if (!match_address(server, addr,
- (struct sockaddr *)&ctx->srcaddr))
- return 0;
-
- if (!match_port(server, addr))
+ if (!cifs_match_ipaddr((struct sockaddr *)&ctx->srcaddr,
+ (struct sockaddr *)&server->srcaddr))
return 0;
+ /*
+ * When matching DFS superblocks, we only check for original source pathname as the
+ * currently connected target might be different than the one parsed earlier in i.e.
+ * mount.cifs(8).
+ */
+ if (dfs_super_cmp) {
+ if (!ctx->source || !server->origin_fullpath ||
+ !dfs_src_pathname_equal(server->origin_fullpath, ctx->source))
+ return 0;
+ } else {
+ /* Skip addr, hostname and port matching for DFS connections */
+ if (server->leaf_fullpath) {
+ if (!ctx->leaf_fullpath ||
+ strcasecmp(server->leaf_fullpath, ctx->leaf_fullpath))
+ return 0;
+ } else if (strcasecmp(server->hostname, ctx->server_hostname) ||
+ !match_server_address(server, addr) ||
+ !match_port(server, addr)) {
+ return 0;
+ }
+ }
if (!match_security(server, ctx))
return 0;
@@ -1506,23 +1506,11 @@ cifs_find_tcp_session(struct smb3_fs_context *ctx)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
spin_lock(&server->srv_lock);
-#ifdef CONFIG_CIFS_DFS_UPCALL
- /*
- * DFS failover implementation in cifs_reconnect() requires unique tcp sessions for
- * DFS connections to do failover properly, so avoid sharing them with regular
- * shares or even links that may connect to same server but having completely
- * different failover targets.
- */
- if (server->is_dfs_conn) {
- spin_unlock(&server->srv_lock);
- continue;
- }
-#endif
/*
* Skip ses channels since they're only handled in lower layers
* (e.g. cifs_send_recv).
*/
- if (CIFS_SERVER_IS_CHAN(server) || !match_server(server, ctx)) {
+ if (CIFS_SERVER_IS_CHAN(server) || !match_server(server, ctx, false)) {
spin_unlock(&server->srv_lock);
continue;
}
@@ -1617,6 +1605,15 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
goto out_err;
}
+ if (ctx->leaf_fullpath) {
+ tcp_ses->leaf_fullpath = kstrdup(ctx->leaf_fullpath, GFP_KERNEL);
+ if (!tcp_ses->leaf_fullpath) {
+ rc = -ENOMEM;
+ goto out_err;
+ }
+ tcp_ses->current_fullpath = tcp_ses->leaf_fullpath;
+ }
+
if (ctx->nosharesock)
tcp_ses->nosharesock = true;
@@ -1765,6 +1762,7 @@ out_err:
if (CIFS_SERVER_IS_CHAN(tcp_ses))
cifs_put_tcp_session(tcp_ses->primary_server, false);
kfree(tcp_ses->hostname);
+ kfree(tcp_ses->leaf_fullpath);
if (tcp_ses->ssocket)
sock_release(tcp_ses->ssocket);
kfree(tcp_ses);
@@ -1871,6 +1869,9 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx)
cifs_dbg(FYI, "IPC tcon rc=%d ipc tid=0x%x\n", rc, tcon->tid);
+ spin_lock(&tcon->tc_lock);
+ tcon->status = TID_GOOD;
+ spin_unlock(&tcon->tc_lock);
ses->tcon_ipc = tcon;
out:
return rc;
@@ -2157,7 +2158,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx __attribute__((unused)),
struct cifs_ses *
cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
{
- int rc = -ENOMEM;
+ int rc = 0;
unsigned int xid;
struct cifs_ses *ses;
struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
@@ -2206,6 +2207,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
return ses;
}
+ rc = -ENOMEM;
+
cifs_dbg(FYI, "Existing smb sess not found\n");
ses = sesInfoAlloc();
if (ses == NULL)
@@ -2278,10 +2281,10 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
list_add(&ses->smb_ses_list, &server->smb_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
- free_xid(xid);
-
cifs_setup_ipc(ses, ctx);
+ free_xid(xid);
+
return ses;
get_ses_fail:
@@ -2291,11 +2294,12 @@ get_ses_fail:
}
/* this function must be called with tc_lock held */
-static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
+static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx, bool dfs_super_cmp)
{
if (tcon->status == TID_EXITING)
return 0;
- if (strncmp(tcon->tree_name, ctx->UNC, MAX_TREE_SIZE))
+ /* Skip UNC validation when matching DFS superblocks */
+ if (!dfs_super_cmp && strncmp(tcon->tree_name, ctx->UNC, MAX_TREE_SIZE))
return 0;
if (tcon->seal != ctx->seal)
return 0;
@@ -2318,7 +2322,7 @@ cifs_find_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
spin_lock(&tcon->tc_lock);
- if (!match_tcon(tcon, ctx)) {
+ if (!match_tcon(tcon, ctx, false)) {
spin_unlock(&tcon->tc_lock);
continue;
}
@@ -2600,12 +2604,16 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
tcon->nodelete = ctx->nodelete;
tcon->local_lease = ctx->local_lease;
INIT_LIST_HEAD(&tcon->pending_opens);
+ tcon->status = TID_GOOD;
- /* schedule query interfaces poll */
INIT_DELAYED_WORK(&tcon->query_interfaces,
smb2_query_server_interfaces);
- queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
- (SMB_INTERFACE_POLL_INTERVAL * HZ));
+ if (ses->server->dialect >= SMB30_PROT_ID &&
+ (ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
+ /* schedule query interfaces poll */
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+ }
spin_lock(&cifs_tcp_ses_lock);
list_add(&tcon->tcon_list, &ses->tcon_list);
@@ -2712,6 +2720,7 @@ cifs_match_super(struct super_block *sb, void *data)
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct tcon_link *tlink;
+ bool dfs_super_cmp;
int rc = 0;
spin_lock(&cifs_tcp_ses_lock);
@@ -2726,14 +2735,16 @@ cifs_match_super(struct super_block *sb, void *data)
ses = tcon->ses;
tcp_srv = ses->server;
+ dfs_super_cmp = IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && tcp_srv->origin_fullpath;
+
ctx = mnt_data->ctx;
spin_lock(&tcp_srv->srv_lock);
spin_lock(&ses->ses_lock);
spin_lock(&tcon->tc_lock);
- if (!match_server(tcp_srv, ctx) ||
+ if (!match_server(tcp_srv, ctx, dfs_super_cmp) ||
!match_session(ses, ctx) ||
- !match_tcon(tcon, ctx) ||
+ !match_tcon(tcon, ctx, dfs_super_cmp) ||
!match_prepath(sb, mnt_data)) {
rc = 0;
goto out;
@@ -2944,6 +2955,7 @@ generic_ip_connect(struct TCP_Server_Info *server)
cifs_dbg(FYI, "Socket created\n");
server->ssocket = socket;
socket->sk->sk_allocation = GFP_NOFS;
+ socket->sk->sk_use_task_frag = false;
if (sfamily == AF_INET6)
cifs_reclassify_socket6(socket);
else
@@ -3190,7 +3202,7 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb)
}
/* Release all succeed connections */
-static inline void mount_put_conns(struct mount_ctx *mnt_ctx)
+void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx)
{
int rc = 0;
@@ -3204,19 +3216,22 @@ static inline void mount_put_conns(struct mount_ctx *mnt_ctx)
free_xid(mnt_ctx->xid);
}
-/* Get connections for tcp, ses and tcon */
-static int mount_get_conns(struct mount_ctx *mnt_ctx)
+int cifs_mount_get_session(struct cifs_mount_ctx *mnt_ctx)
{
- int rc = 0;
struct TCP_Server_Info *server = NULL;
+ struct smb3_fs_context *ctx;
struct cifs_ses *ses = NULL;
- struct cifs_tcon *tcon = NULL;
- struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
unsigned int xid;
+ int rc = 0;
xid = get_xid();
+ if (WARN_ON_ONCE(!mnt_ctx || !mnt_ctx->fs_ctx)) {
+ rc = -EINVAL;
+ goto out;
+ }
+ ctx = mnt_ctx->fs_ctx;
+
/* get a reference to a tcp session */
server = cifs_get_tcp_session(ctx, NULL);
if (IS_ERR(server)) {
@@ -3237,11 +3252,36 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx)
SMB2_GLOBAL_CAP_PERSISTENT_HANDLES))) {
cifs_server_dbg(VFS, "persistent handles not supported by server\n");
rc = -EOPNOTSUPP;
+ }
+
+out:
+ mnt_ctx->xid = xid;
+ mnt_ctx->server = server;
+ mnt_ctx->ses = ses;
+ mnt_ctx->tcon = NULL;
+
+ return rc;
+}
+
+int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx)
+{
+ struct TCP_Server_Info *server;
+ struct cifs_sb_info *cifs_sb;
+ struct smb3_fs_context *ctx;
+ struct cifs_tcon *tcon = NULL;
+ int rc = 0;
+
+ if (WARN_ON_ONCE(!mnt_ctx || !mnt_ctx->server || !mnt_ctx->ses || !mnt_ctx->fs_ctx ||
+ !mnt_ctx->cifs_sb)) {
+ rc = -EINVAL;
goto out;
}
+ server = mnt_ctx->server;
+ ctx = mnt_ctx->fs_ctx;
+ cifs_sb = mnt_ctx->cifs_sb;
/* search for existing tcon to this server share */
- tcon = cifs_get_tcon(ses, ctx);
+ tcon = cifs_get_tcon(mnt_ctx->ses, ctx);
if (IS_ERR(tcon)) {
rc = PTR_ERR(tcon);
tcon = NULL;
@@ -3259,7 +3299,7 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx)
* reset of caps checks mount to see if unix extensions disabled
* for just this mount.
*/
- reset_cifs_unix_caps(xid, tcon, cifs_sb, ctx);
+ reset_cifs_unix_caps(mnt_ctx->xid, tcon, cifs_sb, ctx);
spin_lock(&tcon->ses->server->srv_lock);
if ((tcon->ses->server->tcpStatus == CifsNeedReconnect) &&
(le64_to_cpu(tcon->fsUnixInfo.Capability) &
@@ -3275,7 +3315,7 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx)
/* do not care if a following call succeed - informational */
if (!tcon->pipe && server->ops->qfs_tcon) {
- server->ops->qfs_tcon(xid, tcon, cifs_sb);
+ server->ops->qfs_tcon(mnt_ctx->xid, tcon, cifs_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) {
if (tcon->fsDevInfo.DeviceCharacteristics &
cpu_to_le32(FILE_READ_ONLY_DEVICE))
@@ -3308,11 +3348,7 @@ static int mount_get_conns(struct mount_ctx *mnt_ctx)
cifs_fscache_get_super_cookie(tcon);
out:
- mnt_ctx->server = server;
- mnt_ctx->ses = ses;
mnt_ctx->tcon = tcon;
- mnt_ctx->xid = xid;
-
return rc;
}
@@ -3342,146 +3378,6 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
return 0;
}
-#ifdef CONFIG_CIFS_DFS_UPCALL
-/* Get unique dfs connections */
-static int mount_get_dfs_conns(struct mount_ctx *mnt_ctx)
-{
- int rc;
-
- mnt_ctx->fs_ctx->nosharesock = true;
- rc = mount_get_conns(mnt_ctx);
- if (mnt_ctx->server) {
- cifs_dbg(FYI, "%s: marking tcp session as a dfs connection\n", __func__);
- spin_lock(&mnt_ctx->server->srv_lock);
- mnt_ctx->server->is_dfs_conn = true;
- spin_unlock(&mnt_ctx->server->srv_lock);
- }
- return rc;
-}
-
-/*
- * cifs_build_path_to_root returns full path to root when we do not have an
- * existing connection (tcon)
- */
-static char *
-build_unc_path_to_root(const struct smb3_fs_context *ctx,
- const struct cifs_sb_info *cifs_sb, bool useppath)
-{
- char *full_path, *pos;
- unsigned int pplen = useppath && ctx->prepath ?
- strlen(ctx->prepath) + 1 : 0;
- unsigned int unc_len = strnlen(ctx->UNC, MAX_TREE_SIZE + 1);
-
- if (unc_len > MAX_TREE_SIZE)
- return ERR_PTR(-EINVAL);
-
- full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL);
- if (full_path == NULL)
- return ERR_PTR(-ENOMEM);
-
- memcpy(full_path, ctx->UNC, unc_len);
- pos = full_path + unc_len;
-
- if (pplen) {
- *pos = CIFS_DIR_SEP(cifs_sb);
- memcpy(pos + 1, ctx->prepath, pplen);
- pos += pplen;
- }
-
- *pos = '\0'; /* add trailing null */
- convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
- cifs_dbg(FYI, "%s: full_path=%s\n", __func__, full_path);
- return full_path;
-}
-
-/*
- * expand_dfs_referral - Update cifs_sb from dfs referral path
- *
- * cifs_sb->ctx->mount_options will be (re-)allocated to a string containing updated options for the
- * submount. Otherwise it will be left untouched.
- */
-static int expand_dfs_referral(struct mount_ctx *mnt_ctx, const char *full_path,
- struct dfs_info3_param *referral)
-{
- int rc;
- struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
- struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- char *fake_devname = NULL, *mdata = NULL;
-
- mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options, full_path + 1, referral,
- &fake_devname);
- if (IS_ERR(mdata)) {
- rc = PTR_ERR(mdata);
- mdata = NULL;
- } else {
- /*
- * We can not clear out the whole structure since we no longer have an explicit
- * function to parse a mount-string. Instead we need to clear out the individual
- * fields that are no longer valid.
- */
- kfree(ctx->prepath);
- ctx->prepath = NULL;
- rc = cifs_setup_volume_info(ctx, mdata, fake_devname);
- }
- kfree(fake_devname);
- kfree(cifs_sb->ctx->mount_options);
- cifs_sb->ctx->mount_options = mdata;
-
- return rc;
-}
-#endif
-
-/* TODO: all callers to this are broken. We are not parsing mount_options here
- * we should pass a clone of the original context?
- */
-int
-cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname)
-{
- int rc;
-
- if (devname) {
- cifs_dbg(FYI, "%s: devname=%s\n", __func__, devname);
- rc = smb3_parse_devname(devname, ctx);
- if (rc) {
- cifs_dbg(VFS, "%s: failed to parse %s: %d\n", __func__, devname, rc);
- return rc;
- }
- }
-
- if (mntopts) {
- char *ip;
-
- rc = smb3_parse_opt(mntopts, "ip", &ip);
- if (rc) {
- cifs_dbg(VFS, "%s: failed to parse ip options: %d\n", __func__, rc);
- return rc;
- }
-
- rc = cifs_convert_address((struct sockaddr *)&ctx->dstaddr, ip, strlen(ip));
- kfree(ip);
- if (!rc) {
- cifs_dbg(VFS, "%s: failed to convert ip address\n", __func__);
- return -EINVAL;
- }
- }
-
- if (ctx->nullauth) {
- cifs_dbg(FYI, "Anonymous login\n");
- kfree(ctx->username);
- ctx->username = NULL;
- } else if (ctx->username) {
- /* BB fixme parse for domain name here */
- cifs_dbg(FYI, "Username: %s\n", ctx->username);
- } else {
- cifs_dbg(VFS, "No username specified\n");
- /* In userspace mount helper we can get user name from alternate
- locations such as env variables and files on disk */
- return -EINVAL;
- }
-
- return 0;
-}
-
static int
cifs_are_all_path_components_accessible(struct TCP_Server_Info *server,
unsigned int xid,
@@ -3534,7 +3430,7 @@ cifs_are_all_path_components_accessible(struct TCP_Server_Info *server,
*
* Return -EREMOTE if it is, otherwise 0 or -errno.
*/
-static int is_path_remote(struct mount_ctx *mnt_ctx)
+int cifs_is_path_remote(struct cifs_mount_ctx *mnt_ctx)
{
int rc;
struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
@@ -3543,9 +3439,6 @@ static int is_path_remote(struct mount_ctx *mnt_ctx)
struct cifs_tcon *tcon = mnt_ctx->tcon;
struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
char *full_path;
-#ifdef CONFIG_CIFS_DFS_UPCALL
- bool nodfs = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS;
-#endif
if (!server->ops->is_path_accessible)
return -EOPNOTSUPP;
@@ -3562,19 +3455,6 @@ static int is_path_remote(struct mount_ctx *mnt_ctx)
rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
full_path);
-#ifdef CONFIG_CIFS_DFS_UPCALL
- if (nodfs) {
- if (rc == -EREMOTE)
- rc = -EOPNOTSUPP;
- goto out;
- }
-
- /* path *might* exist with non-ASCII characters in DFS root
- * try again with full path (only if nodfs is not set) */
- if (rc == -ENOENT && is_tcon_dfs(tcon))
- rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon, cifs_sb,
- full_path);
-#endif
if (rc != 0 && rc != -EREMOTE)
goto out;
@@ -3594,251 +3474,19 @@ out:
}
#ifdef CONFIG_CIFS_DFS_UPCALL
-static void set_root_ses(struct mount_ctx *mnt_ctx)
-{
- if (mnt_ctx->ses) {
- spin_lock(&cifs_tcp_ses_lock);
- mnt_ctx->ses->ses_count++;
- spin_unlock(&cifs_tcp_ses_lock);
- dfs_cache_add_refsrv_session(&mnt_ctx->mount_id, mnt_ctx->ses);
- }
- mnt_ctx->root_ses = mnt_ctx->ses;
-}
-
-static int is_dfs_mount(struct mount_ctx *mnt_ctx, bool *isdfs, struct dfs_cache_tgt_list *root_tl)
-{
- int rc;
- struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
- struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
-
- *isdfs = true;
-
- rc = mount_get_conns(mnt_ctx);
- /*
- * If called with 'nodfs' mount option, then skip DFS resolving. Otherwise unconditionally
- * try to get an DFS referral (even cached) to determine whether it is an DFS mount.
- *
- * Skip prefix path to provide support for DFS referrals from w2k8 servers which don't seem
- * to respond with PATH_NOT_COVERED to requests that include the prefix.
- */
- if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) ||
- dfs_cache_find(mnt_ctx->xid, mnt_ctx->ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
- ctx->UNC + 1, NULL, root_tl)) {
- if (rc)
- return rc;
- /* Check if it is fully accessible and then mount it */
- rc = is_path_remote(mnt_ctx);
- if (!rc)
- *isdfs = false;
- else if (rc != -EREMOTE)
- return rc;
- }
- return 0;
-}
-
-static int connect_dfs_target(struct mount_ctx *mnt_ctx, const char *full_path,
- const char *ref_path, struct dfs_cache_tgt_iterator *tit)
-{
- int rc;
- struct dfs_info3_param ref = {};
- struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
- char *oldmnt = cifs_sb->ctx->mount_options;
-
- cifs_dbg(FYI, "%s: full_path=%s ref_path=%s target=%s\n", __func__, full_path, ref_path,
- dfs_cache_get_tgt_name(tit));
-
- rc = dfs_cache_get_tgt_referral(ref_path, tit, &ref);
- if (rc)
- goto out;
-
- rc = expand_dfs_referral(mnt_ctx, full_path, &ref);
- if (rc)
- goto out;
-
- /* Connect to new target only if we were redirected (e.g. mount options changed) */
- if (oldmnt != cifs_sb->ctx->mount_options) {
- mount_put_conns(mnt_ctx);
- rc = mount_get_dfs_conns(mnt_ctx);
- }
- if (!rc) {
- if (cifs_is_referral_server(mnt_ctx->tcon, &ref))
- set_root_ses(mnt_ctx);
- rc = dfs_cache_update_tgthint(mnt_ctx->xid, mnt_ctx->root_ses, cifs_sb->local_nls,
- cifs_remap(cifs_sb), ref_path, tit);
- }
-
-out:
- free_dfs_info_param(&ref);
- return rc;
-}
-
-static int connect_dfs_root(struct mount_ctx *mnt_ctx, struct dfs_cache_tgt_list *root_tl)
-{
- int rc;
- char *full_path;
- struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
- struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- struct dfs_cache_tgt_iterator *tit;
-
- /* Put initial connections as they might be shared with other mounts. We need unique dfs
- * connections per mount to properly failover, so mount_get_dfs_conns() must be used from
- * now on.
- */
- mount_put_conns(mnt_ctx);
- mount_get_dfs_conns(mnt_ctx);
- set_root_ses(mnt_ctx);
-
- full_path = build_unc_path_to_root(ctx, cifs_sb, true);
- if (IS_ERR(full_path))
- return PTR_ERR(full_path);
-
- mnt_ctx->origin_fullpath = dfs_cache_canonical_path(ctx->UNC, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- if (IS_ERR(mnt_ctx->origin_fullpath)) {
- rc = PTR_ERR(mnt_ctx->origin_fullpath);
- mnt_ctx->origin_fullpath = NULL;
- goto out;
- }
-
- /* Try all dfs root targets */
- for (rc = -ENOENT, tit = dfs_cache_get_tgt_iterator(root_tl);
- tit; tit = dfs_cache_get_next_tgt(root_tl, tit)) {
- rc = connect_dfs_target(mnt_ctx, full_path, mnt_ctx->origin_fullpath + 1, tit);
- if (!rc) {
- mnt_ctx->leaf_fullpath = kstrdup(mnt_ctx->origin_fullpath, GFP_KERNEL);
- if (!mnt_ctx->leaf_fullpath)
- rc = -ENOMEM;
- break;
- }
- }
-
-out:
- kfree(full_path);
- return rc;
-}
-
-static int __follow_dfs_link(struct mount_ctx *mnt_ctx)
-{
- int rc;
- struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
- struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- char *full_path;
- struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
- struct dfs_cache_tgt_iterator *tit;
-
- full_path = build_unc_path_to_root(ctx, cifs_sb, true);
- if (IS_ERR(full_path))
- return PTR_ERR(full_path);
-
- kfree(mnt_ctx->leaf_fullpath);
- mnt_ctx->leaf_fullpath = dfs_cache_canonical_path(full_path, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- if (IS_ERR(mnt_ctx->leaf_fullpath)) {
- rc = PTR_ERR(mnt_ctx->leaf_fullpath);
- mnt_ctx->leaf_fullpath = NULL;
- goto out;
- }
-
- /* Get referral from dfs link */
- rc = dfs_cache_find(mnt_ctx->xid, mnt_ctx->root_ses, cifs_sb->local_nls,
- cifs_remap(cifs_sb), mnt_ctx->leaf_fullpath + 1, NULL, &tl);
- if (rc)
- goto out;
-
- /* Try all dfs link targets. If an I/O fails from currently connected DFS target with an
- * error other than STATUS_PATH_NOT_COVERED (-EREMOTE), then retry it from other targets as
- * specified in MS-DFSC "3.1.5.2 I/O Operation to Target Fails with an Error Other Than
- * STATUS_PATH_NOT_COVERED."
- */
- for (rc = -ENOENT, tit = dfs_cache_get_tgt_iterator(&tl);
- tit; tit = dfs_cache_get_next_tgt(&tl, tit)) {
- rc = connect_dfs_target(mnt_ctx, full_path, mnt_ctx->leaf_fullpath + 1, tit);
- if (!rc) {
- rc = is_path_remote(mnt_ctx);
- if (!rc || rc == -EREMOTE)
- break;
- }
- }
-
-out:
- kfree(full_path);
- dfs_cache_free_tgts(&tl);
- return rc;
-}
-
-static int follow_dfs_link(struct mount_ctx *mnt_ctx)
-{
- int rc;
- struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
- struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
- char *full_path;
- int num_links = 0;
-
- full_path = build_unc_path_to_root(ctx, cifs_sb, true);
- if (IS_ERR(full_path))
- return PTR_ERR(full_path);
-
- kfree(mnt_ctx->origin_fullpath);
- mnt_ctx->origin_fullpath = dfs_cache_canonical_path(full_path, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- kfree(full_path);
-
- if (IS_ERR(mnt_ctx->origin_fullpath)) {
- rc = PTR_ERR(mnt_ctx->origin_fullpath);
- mnt_ctx->origin_fullpath = NULL;
- return rc;
- }
-
- do {
- rc = __follow_dfs_link(mnt_ctx);
- if (!rc || rc != -EREMOTE)
- break;
- } while (rc = -ELOOP, ++num_links < MAX_NESTED_LINKS);
-
- return rc;
-}
-
-/* Set up DFS referral paths for failover */
-static void setup_server_referral_paths(struct mount_ctx *mnt_ctx)
-{
- struct TCP_Server_Info *server = mnt_ctx->server;
-
- mutex_lock(&server->refpath_lock);
- server->origin_fullpath = mnt_ctx->origin_fullpath;
- server->leaf_fullpath = mnt_ctx->leaf_fullpath;
- server->current_fullpath = mnt_ctx->leaf_fullpath;
- mutex_unlock(&server->refpath_lock);
- mnt_ctx->origin_fullpath = mnt_ctx->leaf_fullpath = NULL;
-}
-
int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
{
- int rc;
- struct mount_ctx mnt_ctx = { .cifs_sb = cifs_sb, .fs_ctx = ctx, };
- struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
+ struct cifs_mount_ctx mnt_ctx = { .cifs_sb = cifs_sb, .fs_ctx = ctx, };
bool isdfs;
+ int rc;
- rc = is_dfs_mount(&mnt_ctx, &isdfs, &tl);
+ uuid_gen(&mnt_ctx.mount_id);
+ rc = dfs_mount_share(&mnt_ctx, &isdfs);
if (rc)
goto error;
if (!isdfs)
goto out;
- /* proceed as DFS mount */
- uuid_gen(&mnt_ctx.mount_id);
- rc = connect_dfs_root(&mnt_ctx, &tl);
- dfs_cache_free_tgts(&tl);
-
- if (rc)
- goto error;
-
- rc = is_path_remote(&mnt_ctx);
- if (rc)
- rc = follow_dfs_link(&mnt_ctx);
- if (rc)
- goto error;
-
- setup_server_referral_paths(&mnt_ctx);
/*
* After reconnecting to a different server, unique ids won't match anymore, so we disable
* serverino. This prevents dentry revalidation to think the dentry are stale (ESTALE).
@@ -3867,26 +3515,28 @@ error:
dfs_cache_put_refsrv_sessions(&mnt_ctx.mount_id);
kfree(mnt_ctx.origin_fullpath);
kfree(mnt_ctx.leaf_fullpath);
- mount_put_conns(&mnt_ctx);
+ cifs_mount_put_conns(&mnt_ctx);
return rc;
}
#else
int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
{
int rc = 0;
- struct mount_ctx mnt_ctx = { .cifs_sb = cifs_sb, .fs_ctx = ctx, };
+ struct cifs_mount_ctx mnt_ctx = { .cifs_sb = cifs_sb, .fs_ctx = ctx, };
- rc = mount_get_conns(&mnt_ctx);
+ rc = cifs_mount_get_session(&mnt_ctx);
if (rc)
goto error;
- if (mnt_ctx.tcon) {
- rc = is_path_remote(&mnt_ctx);
- if (rc == -EREMOTE)
- rc = -EOPNOTSUPP;
- if (rc)
- goto error;
- }
+ rc = cifs_mount_get_tcon(&mnt_ctx);
+ if (rc)
+ goto error;
+
+ rc = cifs_is_path_remote(&mnt_ctx);
+ if (rc == -EREMOTE)
+ rc = -EOPNOTSUPP;
+ if (rc)
+ goto error;
rc = mount_setup_tlink(cifs_sb, mnt_ctx.ses, mnt_ctx.tcon);
if (rc)
@@ -3896,7 +3546,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
return rc;
error:
- mount_put_conns(&mnt_ctx);
+ cifs_mount_put_conns(&mnt_ctx);
return rc;
}
#endif
@@ -4449,264 +4099,7 @@ cifs_prune_tlinks(struct work_struct *work)
TLINK_IDLE_EXPIRE);
}
-#ifdef CONFIG_CIFS_DFS_UPCALL
-/* Update dfs referral path of superblock */
-static int update_server_fullpath(struct TCP_Server_Info *server, struct cifs_sb_info *cifs_sb,
- const char *target)
-{
- int rc = 0;
- size_t len = strlen(target);
- char *refpath, *npath;
-
- if (unlikely(len < 2 || *target != '\\'))
- return -EINVAL;
-
- if (target[1] == '\\') {
- len += 1;
- refpath = kmalloc(len, GFP_KERNEL);
- if (!refpath)
- return -ENOMEM;
-
- scnprintf(refpath, len, "%s", target);
- } else {
- len += sizeof("\\");
- refpath = kmalloc(len, GFP_KERNEL);
- if (!refpath)
- return -ENOMEM;
-
- scnprintf(refpath, len, "\\%s", target);
- }
-
- npath = dfs_cache_canonical_path(refpath, cifs_sb->local_nls, cifs_remap(cifs_sb));
- kfree(refpath);
-
- if (IS_ERR(npath)) {
- rc = PTR_ERR(npath);
- } else {
- mutex_lock(&server->refpath_lock);
- kfree(server->leaf_fullpath);
- server->leaf_fullpath = npath;
- mutex_unlock(&server->refpath_lock);
- server->current_fullpath = server->leaf_fullpath;
- }
- return rc;
-}
-
-static int target_share_matches_server(struct TCP_Server_Info *server, const char *tcp_host,
- size_t tcp_host_len, char *share, bool *target_match)
-{
- int rc = 0;
- const char *dfs_host;
- size_t dfs_host_len;
-
- *target_match = true;
- extract_unc_hostname(share, &dfs_host, &dfs_host_len);
-
- /* Check if hostnames or addresses match */
- if (dfs_host_len != tcp_host_len || strncasecmp(dfs_host, tcp_host, dfs_host_len) != 0) {
- cifs_dbg(FYI, "%s: %.*s doesn't match %.*s\n", __func__, (int)dfs_host_len,
- dfs_host, (int)tcp_host_len, tcp_host);
- rc = match_target_ip(server, dfs_host, dfs_host_len, target_match);
- if (rc)
- cifs_dbg(VFS, "%s: failed to match target ip: %d\n", __func__, rc);
- }
- return rc;
-}
-
-static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, char *tree, bool islink,
- struct dfs_cache_tgt_list *tl)
-{
- int rc;
- struct TCP_Server_Info *server = tcon->ses->server;
- const struct smb_version_operations *ops = server->ops;
- struct cifs_tcon *ipc = tcon->ses->tcon_ipc;
- char *share = NULL, *prefix = NULL;
- const char *tcp_host;
- size_t tcp_host_len;
- struct dfs_cache_tgt_iterator *tit;
- bool target_match;
-
- extract_unc_hostname(server->hostname, &tcp_host, &tcp_host_len);
-
- tit = dfs_cache_get_tgt_iterator(tl);
- if (!tit) {
- rc = -ENOENT;
- goto out;
- }
-
- /* Try to tree connect to all dfs targets */
- for (; tit; tit = dfs_cache_get_next_tgt(tl, tit)) {
- const char *target = dfs_cache_get_tgt_name(tit);
- struct dfs_cache_tgt_list ntl = DFS_CACHE_TGT_LIST_INIT(ntl);
-
- kfree(share);
- kfree(prefix);
- share = prefix = NULL;
-
- /* Check if share matches with tcp ses */
- rc = dfs_cache_get_tgt_share(server->current_fullpath + 1, tit, &share, &prefix);
- if (rc) {
- cifs_dbg(VFS, "%s: failed to parse target share: %d\n", __func__, rc);
- break;
- }
-
- rc = target_share_matches_server(server, tcp_host, tcp_host_len, share,
- &target_match);
- if (rc)
- break;
- if (!target_match) {
- rc = -EHOSTUNREACH;
- continue;
- }
-
- if (ipc->need_reconnect) {
- scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname);
- rc = ops->tree_connect(xid, ipc->ses, tree, ipc, cifs_sb->local_nls);
- if (rc)
- break;
- }
-
- scnprintf(tree, MAX_TREE_SIZE, "\\%s", share);
- if (!islink) {
- rc = ops->tree_connect(xid, tcon->ses, tree, tcon, cifs_sb->local_nls);
- break;
- }
- /*
- * If no dfs referrals were returned from link target, then just do a TREE_CONNECT
- * to it. Otherwise, cache the dfs referral and then mark current tcp ses for
- * reconnect so either the demultiplex thread or the echo worker will reconnect to
- * newly resolved target.
- */
- if (dfs_cache_find(xid, tcon->ses, cifs_sb->local_nls, cifs_remap(cifs_sb), target,
- NULL, &ntl)) {
- rc = ops->tree_connect(xid, tcon->ses, tree, tcon, cifs_sb->local_nls);
- if (rc)
- continue;
- rc = dfs_cache_noreq_update_tgthint(server->current_fullpath + 1, tit);
- if (!rc)
- rc = cifs_update_super_prepath(cifs_sb, prefix);
- } else {
- /* Target is another dfs share */
- rc = update_server_fullpath(server, cifs_sb, target);
- dfs_cache_free_tgts(tl);
-
- if (!rc) {
- rc = -EREMOTE;
- list_replace_init(&ntl.tl_list, &tl->tl_list);
- } else
- dfs_cache_free_tgts(&ntl);
- }
- break;
- }
-
-out:
- kfree(share);
- kfree(prefix);
-
- return rc;
-}
-
-static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb, char *tree, bool islink,
- struct dfs_cache_tgt_list *tl)
-{
- int rc;
- int num_links = 0;
- struct TCP_Server_Info *server = tcon->ses->server;
-
- do {
- rc = __tree_connect_dfs_target(xid, tcon, cifs_sb, tree, islink, tl);
- if (!rc || rc != -EREMOTE)
- break;
- } while (rc = -ELOOP, ++num_links < MAX_NESTED_LINKS);
- /*
- * If we couldn't tree connect to any targets from last referral path, then retry from
- * original referral path.
- */
- if (rc && server->current_fullpath != server->origin_fullpath) {
- server->current_fullpath = server->origin_fullpath;
- cifs_signal_cifsd_for_reconnect(server, true);
- }
-
- dfs_cache_free_tgts(tl);
- return rc;
-}
-
-int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc)
-{
- int rc;
- struct TCP_Server_Info *server = tcon->ses->server;
- const struct smb_version_operations *ops = server->ops;
- struct super_block *sb = NULL;
- struct cifs_sb_info *cifs_sb;
- struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
- char *tree;
- struct dfs_info3_param ref = {0};
-
- /* only send once per connect */
- spin_lock(&tcon->tc_lock);
- if (tcon->ses->ses_status != SES_GOOD ||
- (tcon->status != TID_NEW &&
- tcon->status != TID_NEED_TCON)) {
- spin_unlock(&tcon->tc_lock);
- return 0;
- }
- tcon->status = TID_IN_TCON;
- spin_unlock(&tcon->tc_lock);
-
- tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
- if (!tree) {
- rc = -ENOMEM;
- goto out;
- }
-
- if (tcon->ipc) {
- scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname);
- rc = ops->tree_connect(xid, tcon->ses, tree, tcon, nlsc);
- goto out;
- }
-
- sb = cifs_get_tcp_super(server);
- if (IS_ERR(sb)) {
- rc = PTR_ERR(sb);
- cifs_dbg(VFS, "%s: could not find superblock: %d\n", __func__, rc);
- goto out;
- }
-
- cifs_sb = CIFS_SB(sb);
-
- /* If it is not dfs or there was no cached dfs referral, then reconnect to same share */
- if (!server->current_fullpath ||
- dfs_cache_noreq_find(server->current_fullpath + 1, &ref, &tl)) {
- rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, cifs_sb->local_nls);
- goto out;
- }
-
- rc = tree_connect_dfs_target(xid, tcon, cifs_sb, tree, ref.server_type == DFS_TYPE_LINK,
- &tl);
- free_dfs_info_param(&ref);
-
-out:
- kfree(tree);
- cifs_put_tcp_super(sb);
-
- if (rc) {
- spin_lock(&tcon->tc_lock);
- if (tcon->status == TID_IN_TCON)
- tcon->status = TID_NEED_TCON;
- spin_unlock(&tcon->tc_lock);
- } else {
- spin_lock(&tcon->tc_lock);
- if (tcon->status == TID_IN_TCON)
- tcon->status = TID_GOOD;
- spin_unlock(&tcon->tc_lock);
- tcon->need_reconnect = false;
- }
-
- return rc;
-}
-#else
+#ifndef CONFIG_CIFS_DFS_UPCALL
int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc)
{
int rc;
diff --git a/fs/cifs/dfs.c b/fs/cifs/dfs.c
new file mode 100644
index 000000000000..b64d20374b9c
--- /dev/null
+++ b/fs/cifs/dfs.c
@@ -0,0 +1,543 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Paulo Alcantara <palcantara@suse.de>
+ */
+
+#include <linux/namei.h>
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "dns_resolve.h"
+#include "fs_context.h"
+#include "dfs.h"
+
+/**
+ * dfs_parse_target_referral - set fs context for dfs target referral
+ *
+ * @full_path: full path in UNC format.
+ * @ref: dfs referral pointer.
+ * @ctx: smb3 fs context pointer.
+ *
+ * Return zero if dfs referral was parsed correctly, otherwise non-zero.
+ */
+int dfs_parse_target_referral(const char *full_path, const struct dfs_info3_param *ref,
+ struct smb3_fs_context *ctx)
+{
+ int rc;
+ const char *prepath = NULL;
+ char *path;
+
+ if (!full_path || !*full_path || !ref || !ctx)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(!ref->node_name || ref->path_consumed < 0))
+ return -EINVAL;
+
+ if (strlen(full_path) - ref->path_consumed) {
+ prepath = full_path + ref->path_consumed;
+ /* skip initial delimiter */
+ if (*prepath == '/' || *prepath == '\\')
+ prepath++;
+ }
+
+ path = cifs_build_devname(ref->node_name, prepath);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ rc = smb3_parse_devname(path, ctx);
+ if (rc)
+ goto out;
+
+ rc = dns_resolve_server_name_to_ip(path, (struct sockaddr *)&ctx->dstaddr, NULL);
+
+out:
+ kfree(path);
+ return rc;
+}
+
+/*
+ * cifs_build_path_to_root returns full path to root when we do not have an
+ * existing connection (tcon)
+ */
+static char *build_unc_path_to_root(const struct smb3_fs_context *ctx,
+ const struct cifs_sb_info *cifs_sb, bool useppath)
+{
+ char *full_path, *pos;
+ unsigned int pplen = useppath && ctx->prepath ? strlen(ctx->prepath) + 1 : 0;
+ unsigned int unc_len = strnlen(ctx->UNC, MAX_TREE_SIZE + 1);
+
+ if (unc_len > MAX_TREE_SIZE)
+ return ERR_PTR(-EINVAL);
+
+ full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL);
+ if (full_path == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(full_path, ctx->UNC, unc_len);
+ pos = full_path + unc_len;
+
+ if (pplen) {
+ *pos = CIFS_DIR_SEP(cifs_sb);
+ memcpy(pos + 1, ctx->prepath, pplen);
+ pos += pplen;
+ }
+
+ *pos = '\0'; /* add trailing null */
+ convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
+ cifs_dbg(FYI, "%s: full_path=%s\n", __func__, full_path);
+ return full_path;
+}
+
+static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path)
+{
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ int rc;
+
+ ctx->leaf_fullpath = (char *)full_path;
+ rc = cifs_mount_get_session(mnt_ctx);
+ ctx->leaf_fullpath = NULL;
+ if (!rc) {
+ struct cifs_ses *ses = mnt_ctx->ses;
+
+ mutex_lock(&ses->session_mutex);
+ ses->dfs_root_ses = mnt_ctx->root_ses;
+ mutex_unlock(&ses->session_mutex);
+ }
+ return rc;
+}
+
+static void set_root_ses(struct cifs_mount_ctx *mnt_ctx)
+{
+ if (mnt_ctx->ses) {
+ spin_lock(&cifs_tcp_ses_lock);
+ mnt_ctx->ses->ses_count++;
+ spin_unlock(&cifs_tcp_ses_lock);
+ dfs_cache_add_refsrv_session(&mnt_ctx->mount_id, mnt_ctx->ses);
+ }
+ mnt_ctx->root_ses = mnt_ctx->ses;
+}
+
+static int get_dfs_conn(struct cifs_mount_ctx *mnt_ctx, const char *ref_path, const char *full_path,
+ const struct dfs_cache_tgt_iterator *tit)
+{
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ struct dfs_info3_param ref = {};
+ int rc;
+
+ rc = dfs_cache_get_tgt_referral(ref_path + 1, tit, &ref);
+ if (rc)
+ return rc;
+
+ rc = dfs_parse_target_referral(full_path + 1, &ref, ctx);
+ if (rc)
+ goto out;
+
+ cifs_mount_put_conns(mnt_ctx);
+ rc = get_session(mnt_ctx, ref_path);
+ if (rc)
+ goto out;
+
+ if (ref.flags & DFSREF_REFERRAL_SERVER)
+ set_root_ses(mnt_ctx);
+
+ rc = -EREMOTE;
+ if (ref.flags & DFSREF_STORAGE_SERVER) {
+ rc = cifs_mount_get_tcon(mnt_ctx);
+ if (rc)
+ goto out;
+
+ /* some servers may not advertise referral capability under ref.flags */
+ if (!(ref.flags & DFSREF_REFERRAL_SERVER) &&
+ is_tcon_dfs(mnt_ctx->tcon))
+ set_root_ses(mnt_ctx);
+
+ rc = cifs_is_path_remote(mnt_ctx);
+ }
+
+out:
+ free_dfs_info_param(&ref);
+ return rc;
+}
+
+static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx)
+{
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ char *ref_path = NULL, *full_path = NULL;
+ struct dfs_cache_tgt_iterator *tit;
+ struct TCP_Server_Info *server;
+ char *origin_fullpath = NULL;
+ int num_links = 0;
+ int rc;
+
+ ref_path = dfs_get_path(cifs_sb, ctx->UNC);
+ if (IS_ERR(ref_path))
+ return PTR_ERR(ref_path);
+
+ full_path = build_unc_path_to_root(ctx, cifs_sb, true);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
+ full_path = NULL;
+ goto out;
+ }
+
+ origin_fullpath = kstrdup(full_path, GFP_KERNEL);
+ if (!origin_fullpath) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ do {
+ struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
+
+ rc = dfs_get_referral(mnt_ctx, ref_path + 1, NULL, &tl);
+ if (rc)
+ break;
+
+ tit = dfs_cache_get_tgt_iterator(&tl);
+ if (!tit) {
+ cifs_dbg(VFS, "%s: dfs referral (%s) with no targets\n", __func__,
+ ref_path + 1);
+ rc = -ENOENT;
+ dfs_cache_free_tgts(&tl);
+ break;
+ }
+
+ do {
+ rc = get_dfs_conn(mnt_ctx, ref_path, full_path, tit);
+ if (!rc)
+ break;
+ if (rc == -EREMOTE) {
+ if (++num_links > MAX_NESTED_LINKS) {
+ rc = -ELOOP;
+ break;
+ }
+ kfree(ref_path);
+ kfree(full_path);
+ ref_path = full_path = NULL;
+
+ full_path = build_unc_path_to_root(ctx, cifs_sb, true);
+ if (IS_ERR(full_path)) {
+ rc = PTR_ERR(full_path);
+ full_path = NULL;
+ } else {
+ ref_path = dfs_get_path(cifs_sb, full_path);
+ if (IS_ERR(ref_path)) {
+ rc = PTR_ERR(ref_path);
+ ref_path = NULL;
+ }
+ }
+ break;
+ }
+ } while ((tit = dfs_cache_get_next_tgt(&tl, tit)));
+ dfs_cache_free_tgts(&tl);
+ } while (rc == -EREMOTE);
+
+ if (!rc) {
+ server = mnt_ctx->server;
+
+ mutex_lock(&server->refpath_lock);
+ server->origin_fullpath = origin_fullpath;
+ server->current_fullpath = server->leaf_fullpath;
+ mutex_unlock(&server->refpath_lock);
+ origin_fullpath = NULL;
+ }
+
+out:
+ kfree(origin_fullpath);
+ kfree(ref_path);
+ kfree(full_path);
+ return rc;
+}
+
+int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs)
+{
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+ struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+ int rc;
+
+ *isdfs = false;
+
+ rc = get_session(mnt_ctx, NULL);
+ if (rc)
+ return rc;
+ mnt_ctx->root_ses = mnt_ctx->ses;
+ /*
+ * If called with 'nodfs' mount option, then skip DFS resolving. Otherwise unconditionally
+ * try to get an DFS referral (even cached) to determine whether it is an DFS mount.
+ *
+ * Skip prefix path to provide support for DFS referrals from w2k8 servers which don't seem
+ * to respond with PATH_NOT_COVERED to requests that include the prefix.
+ */
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) ||
+ dfs_get_referral(mnt_ctx, ctx->UNC + 1, NULL, NULL)) {
+ rc = cifs_mount_get_tcon(mnt_ctx);
+ if (rc)
+ return rc;
+
+ rc = cifs_is_path_remote(mnt_ctx);
+ if (!rc || rc != -EREMOTE)
+ return rc;
+ }
+
+ *isdfs = true;
+ set_root_ses(mnt_ctx);
+
+ return __dfs_mount_share(mnt_ctx);
+}
+
+/* Update dfs referral path of superblock */
+static int update_server_fullpath(struct TCP_Server_Info *server, struct cifs_sb_info *cifs_sb,
+ const char *target)
+{
+ int rc = 0;
+ size_t len = strlen(target);
+ char *refpath, *npath;
+
+ if (unlikely(len < 2 || *target != '\\'))
+ return -EINVAL;
+
+ if (target[1] == '\\') {
+ len += 1;
+ refpath = kmalloc(len, GFP_KERNEL);
+ if (!refpath)
+ return -ENOMEM;
+
+ scnprintf(refpath, len, "%s", target);
+ } else {
+ len += sizeof("\\");
+ refpath = kmalloc(len, GFP_KERNEL);
+ if (!refpath)
+ return -ENOMEM;
+
+ scnprintf(refpath, len, "\\%s", target);
+ }
+
+ npath = dfs_cache_canonical_path(refpath, cifs_sb->local_nls, cifs_remap(cifs_sb));
+ kfree(refpath);
+
+ if (IS_ERR(npath)) {
+ rc = PTR_ERR(npath);
+ } else {
+ mutex_lock(&server->refpath_lock);
+ kfree(server->leaf_fullpath);
+ server->leaf_fullpath = npath;
+ mutex_unlock(&server->refpath_lock);
+ server->current_fullpath = server->leaf_fullpath;
+ }
+ return rc;
+}
+
+static int target_share_matches_server(struct TCP_Server_Info *server, char *share,
+ bool *target_match)
+{
+ int rc = 0;
+ const char *dfs_host;
+ size_t dfs_host_len;
+
+ *target_match = true;
+ extract_unc_hostname(share, &dfs_host, &dfs_host_len);
+
+ /* Check if hostnames or addresses match */
+ cifs_server_lock(server);
+ if (dfs_host_len != strlen(server->hostname) ||
+ strncasecmp(dfs_host, server->hostname, dfs_host_len)) {
+ cifs_dbg(FYI, "%s: %.*s doesn't match %s\n", __func__,
+ (int)dfs_host_len, dfs_host, server->hostname);
+ rc = match_target_ip(server, dfs_host, dfs_host_len, target_match);
+ if (rc)
+ cifs_dbg(VFS, "%s: failed to match target ip: %d\n", __func__, rc);
+ }
+ cifs_server_unlock(server);
+ return rc;
+}
+
+static int __tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, char *tree, bool islink,
+ struct dfs_cache_tgt_list *tl)
+{
+ int rc;
+ struct TCP_Server_Info *server = tcon->ses->server;
+ const struct smb_version_operations *ops = server->ops;
+ struct cifs_ses *root_ses = CIFS_DFS_ROOT_SES(tcon->ses);
+ struct cifs_tcon *ipc = root_ses->tcon_ipc;
+ char *share = NULL, *prefix = NULL;
+ struct dfs_cache_tgt_iterator *tit;
+ bool target_match;
+
+ tit = dfs_cache_get_tgt_iterator(tl);
+ if (!tit) {
+ rc = -ENOENT;
+ goto out;
+ }
+
+ /* Try to tree connect to all dfs targets */
+ for (; tit; tit = dfs_cache_get_next_tgt(tl, tit)) {
+ const char *target = dfs_cache_get_tgt_name(tit);
+ struct dfs_cache_tgt_list ntl = DFS_CACHE_TGT_LIST_INIT(ntl);
+
+ kfree(share);
+ kfree(prefix);
+ share = prefix = NULL;
+
+ /* Check if share matches with tcp ses */
+ rc = dfs_cache_get_tgt_share(server->current_fullpath + 1, tit, &share, &prefix);
+ if (rc) {
+ cifs_dbg(VFS, "%s: failed to parse target share: %d\n", __func__, rc);
+ break;
+ }
+
+ rc = target_share_matches_server(server, share, &target_match);
+ if (rc)
+ break;
+ if (!target_match) {
+ rc = -EHOSTUNREACH;
+ continue;
+ }
+
+ dfs_cache_noreq_update_tgthint(server->current_fullpath + 1, tit);
+
+ if (ipc->need_reconnect) {
+ scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname);
+ rc = ops->tree_connect(xid, ipc->ses, tree, ipc, cifs_sb->local_nls);
+ cifs_dbg(FYI, "%s: reconnect ipc: %d\n", __func__, rc);
+ }
+
+ scnprintf(tree, MAX_TREE_SIZE, "\\%s", share);
+ if (!islink) {
+ rc = ops->tree_connect(xid, tcon->ses, tree, tcon, cifs_sb->local_nls);
+ break;
+ }
+ /*
+ * If no dfs referrals were returned from link target, then just do a TREE_CONNECT
+ * to it. Otherwise, cache the dfs referral and then mark current tcp ses for
+ * reconnect so either the demultiplex thread or the echo worker will reconnect to
+ * newly resolved target.
+ */
+ if (dfs_cache_find(xid, root_ses, cifs_sb->local_nls, cifs_remap(cifs_sb), target,
+ NULL, &ntl)) {
+ rc = ops->tree_connect(xid, tcon->ses, tree, tcon, cifs_sb->local_nls);
+ if (rc)
+ continue;
+
+ rc = cifs_update_super_prepath(cifs_sb, prefix);
+ } else {
+ /* Target is another dfs share */
+ rc = update_server_fullpath(server, cifs_sb, target);
+ dfs_cache_free_tgts(tl);
+
+ if (!rc) {
+ rc = -EREMOTE;
+ list_replace_init(&ntl.tl_list, &tl->tl_list);
+ } else
+ dfs_cache_free_tgts(&ntl);
+ }
+ break;
+ }
+
+out:
+ kfree(share);
+ kfree(prefix);
+
+ return rc;
+}
+
+static int tree_connect_dfs_target(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, char *tree, bool islink,
+ struct dfs_cache_tgt_list *tl)
+{
+ int rc;
+ int num_links = 0;
+ struct TCP_Server_Info *server = tcon->ses->server;
+ char *old_fullpath = server->leaf_fullpath;
+
+ do {
+ rc = __tree_connect_dfs_target(xid, tcon, cifs_sb, tree, islink, tl);
+ if (!rc || rc != -EREMOTE)
+ break;
+ } while (rc = -ELOOP, ++num_links < MAX_NESTED_LINKS);
+ /*
+ * If we couldn't tree connect to any targets from last referral path, then
+ * retry it from newly resolved dfs referral.
+ */
+ if (rc && server->leaf_fullpath != old_fullpath)
+ cifs_signal_cifsd_for_reconnect(server, true);
+
+ dfs_cache_free_tgts(tl);
+ return rc;
+}
+
+int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const struct nls_table *nlsc)
+{
+ int rc;
+ struct TCP_Server_Info *server = tcon->ses->server;
+ const struct smb_version_operations *ops = server->ops;
+ struct super_block *sb = NULL;
+ struct cifs_sb_info *cifs_sb;
+ struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
+ char *tree;
+ struct dfs_info3_param ref = {0};
+
+ /* only send once per connect */
+ spin_lock(&tcon->tc_lock);
+ if (tcon->ses->ses_status != SES_GOOD ||
+ (tcon->status != TID_NEW &&
+ tcon->status != TID_NEED_TCON)) {
+ spin_unlock(&tcon->tc_lock);
+ return 0;
+ }
+ tcon->status = TID_IN_TCON;
+ spin_unlock(&tcon->tc_lock);
+
+ tree = kzalloc(MAX_TREE_SIZE, GFP_KERNEL);
+ if (!tree) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if (tcon->ipc) {
+ cifs_server_lock(server);
+ scnprintf(tree, MAX_TREE_SIZE, "\\\\%s\\IPC$", server->hostname);
+ cifs_server_unlock(server);
+ rc = ops->tree_connect(xid, tcon->ses, tree, tcon, nlsc);
+ goto out;
+ }
+
+ sb = cifs_get_tcp_super(server);
+ if (IS_ERR(sb)) {
+ rc = PTR_ERR(sb);
+ cifs_dbg(VFS, "%s: could not find superblock: %d\n", __func__, rc);
+ goto out;
+ }
+
+ cifs_sb = CIFS_SB(sb);
+
+ /* If it is not dfs or there was no cached dfs referral, then reconnect to same share */
+ if (!server->current_fullpath ||
+ dfs_cache_noreq_find(server->current_fullpath + 1, &ref, &tl)) {
+ rc = ops->tree_connect(xid, tcon->ses, tcon->tree_name, tcon, cifs_sb->local_nls);
+ goto out;
+ }
+
+ rc = tree_connect_dfs_target(xid, tcon, cifs_sb, tree, ref.server_type == DFS_TYPE_LINK,
+ &tl);
+ free_dfs_info_param(&ref);
+
+out:
+ kfree(tree);
+ cifs_put_tcp_super(sb);
+
+ if (rc) {
+ spin_lock(&tcon->tc_lock);
+ if (tcon->status == TID_IN_TCON)
+ tcon->status = TID_NEED_TCON;
+ spin_unlock(&tcon->tc_lock);
+ } else {
+ spin_lock(&tcon->tc_lock);
+ if (tcon->status == TID_IN_TCON)
+ tcon->status = TID_GOOD;
+ spin_unlock(&tcon->tc_lock);
+ tcon->need_reconnect = false;
+ }
+
+ return rc;
+}
diff --git a/fs/cifs/dfs.h b/fs/cifs/dfs.h
new file mode 100644
index 000000000000..344bea6d8bab
--- /dev/null
+++ b/fs/cifs/dfs.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Paulo Alcantara <palcantara@suse.de>
+ */
+
+#ifndef _CIFS_DFS_H
+#define _CIFS_DFS_H
+
+#include "cifsglob.h"
+#include "fs_context.h"
+#include "cifs_unicode.h"
+
+int dfs_parse_target_referral(const char *full_path, const struct dfs_info3_param *ref,
+ struct smb3_fs_context *ctx);
+int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs);
+
+static inline char *dfs_get_path(struct cifs_sb_info *cifs_sb, const char *path)
+{
+ return dfs_cache_canonical_path(path, cifs_sb->local_nls, cifs_remap(cifs_sb));
+}
+
+static inline int dfs_get_referral(struct cifs_mount_ctx *mnt_ctx, const char *path,
+ struct dfs_info3_param *ref, struct dfs_cache_tgt_list *tl)
+{
+ struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+
+ return dfs_cache_find(mnt_ctx->xid, mnt_ctx->root_ses, cifs_sb->local_nls,
+ cifs_remap(cifs_sb), path, ref, tl);
+}
+
+static inline char *dfs_get_automount_devname(struct dentry *dentry, void *page)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ struct TCP_Server_Info *server = tcon->ses->server;
+
+ if (unlikely(!server->origin_fullpath))
+ return ERR_PTR(-EREMOTE);
+
+ return __build_path_from_dentry_optional_prefix(dentry, page,
+ server->origin_fullpath,
+ strlen(server->origin_fullpath),
+ true);
+}
+
+#endif /* _CIFS_DFS_H */
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index e70915ad7541..ac86bd0ebd63 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -83,27 +83,6 @@ static void refresh_cache_worker(struct work_struct *work);
static DECLARE_DELAYED_WORK(refresh_task, refresh_cache_worker);
-static void get_ipc_unc(const char *ref_path, char *ipc, size_t ipclen)
-{
- const char *host;
- size_t len;
-
- extract_unc_hostname(ref_path, &host, &len);
- scnprintf(ipc, ipclen, "\\\\%.*s\\IPC$", (int)len, host);
-}
-
-static struct cifs_ses *find_ipc_from_server_path(struct cifs_ses **ses, const char *path)
-{
- char unc[SERVER_NAME_LENGTH + sizeof("//x/IPC$")] = {0};
-
- get_ipc_unc(path, unc, sizeof(unc));
- for (; *ses; ses++) {
- if (!strcasecmp(unc, (*ses)->tcon_ipc->tree_name))
- return *ses;
- }
- return ERR_PTR(-ENOENT);
-}
-
static void __mount_group_release(struct mount_group *mg)
{
int i;
@@ -290,7 +269,7 @@ static int dfscache_proc_show(struct seq_file *m, void *v)
list_for_each_entry(t, &ce->tlist, list) {
seq_printf(m, " %s%s\n",
t->name,
- ce->tgthint == t ? " (target hint)" : "");
+ READ_ONCE(ce->tgthint) == t ? " (target hint)" : "");
}
}
}
@@ -342,7 +321,7 @@ static inline void dump_tgts(const struct cache_entry *ce)
cifs_dbg(FYI, "target list:\n");
list_for_each_entry(t, &ce->tlist, list) {
cifs_dbg(FYI, " %s%s\n", t->name,
- ce->tgthint == t ? " (target hint)" : "");
+ READ_ONCE(ce->tgthint) == t ? " (target hint)" : "");
}
}
@@ -448,7 +427,7 @@ static int cache_entry_hash(const void *data, int size, unsigned int *hash)
/* Return target hint of a DFS cache entry */
static inline char *get_tgt_name(const struct cache_entry *ce)
{
- struct cache_dfs_tgt *t = ce->tgthint;
+ struct cache_dfs_tgt *t = READ_ONCE(ce->tgthint);
return t ? t->name : ERR_PTR(-ENOENT);
}
@@ -491,6 +470,7 @@ static struct cache_dfs_tgt *alloc_target(const char *name, int path_consumed)
static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs,
struct cache_entry *ce, const char *tgthint)
{
+ struct cache_dfs_tgt *target;
int i;
ce->ttl = max_t(int, refs[0].ttl, CACHE_MIN_TTL);
@@ -517,8 +497,9 @@ static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs,
ce->numtgts++;
}
- ce->tgthint = list_first_entry_or_null(&ce->tlist,
- struct cache_dfs_tgt, list);
+ target = list_first_entry_or_null(&ce->tlist, struct cache_dfs_tgt,
+ list);
+ WRITE_ONCE(ce->tgthint, target);
return 0;
}
@@ -579,7 +560,8 @@ static void remove_oldest_entry_locked(void)
}
/* Add a new DFS cache entry */
-static int add_cache_entry_locked(struct dfs_info3_param *refs, int numrefs)
+static struct cache_entry *add_cache_entry_locked(struct dfs_info3_param *refs,
+ int numrefs)
{
int rc;
struct cache_entry *ce;
@@ -594,11 +576,11 @@ static int add_cache_entry_locked(struct dfs_info3_param *refs, int numrefs)
rc = cache_entry_hash(refs[0].path_name, strlen(refs[0].path_name), &hash);
if (rc)
- return rc;
+ return ERR_PTR(rc);
ce = alloc_cache_entry(refs, numrefs);
if (IS_ERR(ce))
- return PTR_ERR(ce);
+ return ce;
spin_lock(&cache_ttl_lock);
if (!cache_ttl) {
@@ -615,7 +597,7 @@ static int add_cache_entry_locked(struct dfs_info3_param *refs, int numrefs)
atomic_inc(&cache_count);
- return 0;
+ return ce;
}
/* Check if two DFS paths are equal. @s1 and @s2 are expected to be in @cache_cp's charset */
@@ -662,7 +644,9 @@ static struct cache_entry *__lookup_cache_entry(const char *path, unsigned int h
*
* Use whole path components in the match. Must be called with htable_rw_lock held.
*
+ * Return cached entry if successful.
* Return ERR_PTR(-ENOENT) if the entry is not found.
+ * Return error ptr otherwise.
*/
static struct cache_entry *lookup_cache_entry(const char *path)
{
@@ -732,14 +716,15 @@ void dfs_cache_destroy(void)
static int update_cache_entry_locked(struct cache_entry *ce, const struct dfs_info3_param *refs,
int numrefs)
{
+ struct cache_dfs_tgt *target;
+ char *th = NULL;
int rc;
- char *s, *th = NULL;
WARN_ON(!rwsem_is_locked(&htable_rw_lock));
- if (ce->tgthint) {
- s = ce->tgthint->name;
- th = kstrdup(s, GFP_ATOMIC);
+ target = READ_ONCE(ce->tgthint);
+ if (target) {
+ th = kstrdup(target->name, GFP_ATOMIC);
if (!th)
return -ENOMEM;
}
@@ -760,8 +745,6 @@ static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses, const
int rc;
int i;
- cifs_dbg(FYI, "%s: get an DFS referral for %s\n", __func__, path);
-
*refs = NULL;
*numrefs = 0;
@@ -770,6 +753,7 @@ static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses, const
if (unlikely(!cache_cp))
return -EINVAL;
+ cifs_dbg(FYI, "%s: ipc=%s referral=%s\n", __func__, ses->tcon_ipc->tree_name, path);
rc = ses->server->ops->get_dfs_refer(xid, ses, path, refs, numrefs, cache_cp,
NO_MAP_UNI_RSVD);
if (!rc) {
@@ -789,51 +773,75 @@ static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses, const
*
* For interlinks, cifs_mount() and expand_dfs_referral() are supposed to
* handle them properly.
+ *
+ * On success, return entry with acquired lock for reading, otherwise error ptr.
*/
-static int cache_refresh_path(const unsigned int xid, struct cifs_ses *ses, const char *path)
+static struct cache_entry *cache_refresh_path(const unsigned int xid,
+ struct cifs_ses *ses,
+ const char *path,
+ bool force_refresh)
{
- int rc;
- struct cache_entry *ce;
struct dfs_info3_param *refs = NULL;
+ struct cache_entry *ce;
int numrefs = 0;
- bool newent = false;
+ int rc;
cifs_dbg(FYI, "%s: search path: %s\n", __func__, path);
- down_write(&htable_rw_lock);
+ down_read(&htable_rw_lock);
ce = lookup_cache_entry(path);
if (!IS_ERR(ce)) {
- if (!cache_entry_expired(ce)) {
- dump_ce(ce);
- up_write(&htable_rw_lock);
- return 0;
- }
- } else {
- newent = true;
+ if (!force_refresh && !cache_entry_expired(ce))
+ return ce;
+ } else if (PTR_ERR(ce) != -ENOENT) {
+ up_read(&htable_rw_lock);
+ return ce;
}
/*
- * Either the entry was not found, or it is expired.
+ * Unlock shared access as we don't want to hold any locks while getting
+ * a new referral. The @ses used for performing the I/O could be
+ * reconnecting and it acquires @htable_rw_lock to look up the dfs cache
+ * in order to failover -- if necessary.
+ */
+ up_read(&htable_rw_lock);
+
+ /*
+ * Either the entry was not found, or it is expired, or it is a forced
+ * refresh.
* Request a new DFS referral in order to create or update a cache entry.
*/
rc = get_dfs_referral(xid, ses, path, &refs, &numrefs);
- if (rc)
- goto out_unlock;
+ if (rc) {
+ ce = ERR_PTR(rc);
+ goto out;
+ }
dump_refs(refs, numrefs);
- if (!newent) {
- rc = update_cache_entry_locked(ce, refs, numrefs);
- goto out_unlock;
+ down_write(&htable_rw_lock);
+ /* Re-check as another task might have it added or refreshed already */
+ ce = lookup_cache_entry(path);
+ if (!IS_ERR(ce)) {
+ if (force_refresh || cache_entry_expired(ce)) {
+ rc = update_cache_entry_locked(ce, refs, numrefs);
+ if (rc)
+ ce = ERR_PTR(rc);
+ }
+ } else if (PTR_ERR(ce) == -ENOENT) {
+ ce = add_cache_entry_locked(refs, numrefs);
}
- rc = add_cache_entry_locked(refs, numrefs);
+ if (IS_ERR(ce)) {
+ up_write(&htable_rw_lock);
+ goto out;
+ }
-out_unlock:
- up_write(&htable_rw_lock);
+ downgrade_write(&htable_rw_lock);
+out:
free_dfs_info_array(refs, numrefs);
- return rc;
+ return ce;
}
/*
@@ -900,7 +908,7 @@ static int get_targets(struct cache_entry *ce, struct dfs_cache_tgt_list *tl)
}
it->it_path_consumed = t->path_consumed;
- if (ce->tgthint == t)
+ if (READ_ONCE(ce->tgthint) == t)
list_add(&it->it_list, head);
else
list_add_tail(&it->it_list, head);
@@ -953,15 +961,8 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, const struct nl
if (IS_ERR(npath))
return PTR_ERR(npath);
- rc = cache_refresh_path(xid, ses, npath);
- if (rc)
- goto out_free_path;
-
- down_read(&htable_rw_lock);
-
- ce = lookup_cache_entry(npath);
+ ce = cache_refresh_path(xid, ses, npath, false);
if (IS_ERR(ce)) {
- up_read(&htable_rw_lock);
rc = PTR_ERR(ce);
goto out_free_path;
}
@@ -1025,72 +1026,6 @@ out_unlock:
}
/**
- * dfs_cache_update_tgthint - update target hint of a DFS cache entry
- *
- * If it doesn't find the cache entry, then it will get a DFS referral for @path
- * and create a new entry.
- *
- * In case the cache entry exists but expired, it will get a DFS referral
- * for @path and then update the respective cache entry.
- *
- * @xid: syscall id
- * @ses: smb session
- * @cp: codepage
- * @remap: type of character remapping for paths
- * @path: path to lookup in DFS referral cache
- * @it: DFS target iterator
- *
- * Return zero if the target hint was updated successfully, otherwise non-zero.
- */
-int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
- const struct nls_table *cp, int remap, const char *path,
- const struct dfs_cache_tgt_iterator *it)
-{
- int rc;
- const char *npath;
- struct cache_entry *ce;
- struct cache_dfs_tgt *t;
-
- npath = dfs_cache_canonical_path(path, cp, remap);
- if (IS_ERR(npath))
- return PTR_ERR(npath);
-
- cifs_dbg(FYI, "%s: update target hint - path: %s\n", __func__, npath);
-
- rc = cache_refresh_path(xid, ses, npath);
- if (rc)
- goto out_free_path;
-
- down_write(&htable_rw_lock);
-
- ce = lookup_cache_entry(npath);
- if (IS_ERR(ce)) {
- rc = PTR_ERR(ce);
- goto out_unlock;
- }
-
- t = ce->tgthint;
-
- if (likely(!strcasecmp(it->it_name, t->name)))
- goto out_unlock;
-
- list_for_each_entry(t, &ce->tlist, list) {
- if (!strcasecmp(t->name, it->it_name)) {
- ce->tgthint = t;
- cifs_dbg(FYI, "%s: new target hint: %s\n", __func__,
- it->it_name);
- break;
- }
- }
-
-out_unlock:
- up_write(&htable_rw_lock);
-out_free_path:
- kfree(npath);
- return rc;
-}
-
-/**
* dfs_cache_noreq_update_tgthint - update target hint of a DFS cache entry
* without sending any requests to the currently connected server.
*
@@ -1104,34 +1039,30 @@ out_free_path:
*
* Return zero if the target hint was updated successfully, otherwise non-zero.
*/
-int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it)
+void dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it)
{
- int rc;
- struct cache_entry *ce;
struct cache_dfs_tgt *t;
+ struct cache_entry *ce;
- if (!it)
- return -EINVAL;
+ if (!path || !it)
+ return;
cifs_dbg(FYI, "%s: path: %s\n", __func__, path);
- down_write(&htable_rw_lock);
+ down_read(&htable_rw_lock);
ce = lookup_cache_entry(path);
- if (IS_ERR(ce)) {
- rc = PTR_ERR(ce);
+ if (IS_ERR(ce))
goto out_unlock;
- }
- rc = 0;
- t = ce->tgthint;
+ t = READ_ONCE(ce->tgthint);
if (unlikely(!strcasecmp(it->it_name, t->name)))
goto out_unlock;
list_for_each_entry(t, &ce->tlist, list) {
if (!strcasecmp(t->name, it->it_name)) {
- ce->tgthint = t;
+ WRITE_ONCE(ce->tgthint, t);
cifs_dbg(FYI, "%s: new target hint: %s\n", __func__,
it->it_name);
break;
@@ -1139,8 +1070,7 @@ int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_
}
out_unlock:
- up_write(&htable_rw_lock);
- return rc;
+ up_read(&htable_rw_lock);
}
/**
@@ -1314,8 +1244,7 @@ static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, c
char unc[sizeof("\\\\") + SERVER_NAME_LENGTH] = {0};
const char *host;
size_t hostlen;
- char *ip = NULL;
- struct sockaddr sa;
+ struct sockaddr_storage ss;
bool match;
int rc;
@@ -1326,27 +1255,20 @@ static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, c
* Resolve share's hostname and check if server address matches. Otherwise just ignore it
* as we could not have upcall to resolve hostname or failed to convert ip address.
*/
- match = true;
extract_unc_hostname(s1, &host, &hostlen);
scnprintf(unc, sizeof(unc), "\\\\%.*s", (int)hostlen, host);
- rc = dns_resolve_server_name_to_ip(unc, &ip, NULL);
+ rc = dns_resolve_server_name_to_ip(unc, (struct sockaddr *)&ss, NULL);
if (rc < 0) {
cifs_dbg(FYI, "%s: could not resolve %.*s. assuming server address matches.\n",
__func__, (int)hostlen, host);
return true;
}
- if (!cifs_convert_address(&sa, ip, strlen(ip))) {
- cifs_dbg(VFS, "%s: failed to convert address \'%s\'. skip address matching.\n",
- __func__, ip);
- } else {
- cifs_server_lock(server);
- match = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, &sa);
- cifs_server_unlock(server);
- }
+ cifs_server_lock(server);
+ match = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, (struct sockaddr *)&ss);
+ cifs_server_unlock(server);
- kfree(ip);
return match;
}
@@ -1354,50 +1276,47 @@ static bool target_share_equal(struct TCP_Server_Info *server, const char *s1, c
* Mark dfs tcon for reconnecting when the currently connected tcon does not match any of the new
* target shares in @refs.
*/
-static void mark_for_reconnect_if_needed(struct cifs_tcon *tcon, struct dfs_cache_tgt_list *tl,
- const struct dfs_info3_param *refs, int numrefs)
+static void mark_for_reconnect_if_needed(struct TCP_Server_Info *server,
+ struct dfs_cache_tgt_list *old_tl,
+ struct dfs_cache_tgt_list *new_tl)
{
- struct dfs_cache_tgt_iterator *it;
- int i;
-
- for (it = dfs_cache_get_tgt_iterator(tl); it; it = dfs_cache_get_next_tgt(tl, it)) {
- for (i = 0; i < numrefs; i++) {
- if (target_share_equal(tcon->ses->server, dfs_cache_get_tgt_name(it),
- refs[i].node_name))
+ struct dfs_cache_tgt_iterator *oit, *nit;
+
+ for (oit = dfs_cache_get_tgt_iterator(old_tl); oit;
+ oit = dfs_cache_get_next_tgt(old_tl, oit)) {
+ for (nit = dfs_cache_get_tgt_iterator(new_tl); nit;
+ nit = dfs_cache_get_next_tgt(new_tl, nit)) {
+ if (target_share_equal(server,
+ dfs_cache_get_tgt_name(oit),
+ dfs_cache_get_tgt_name(nit)))
return;
}
}
cifs_dbg(FYI, "%s: no cached or matched targets. mark dfs share for reconnect.\n", __func__);
- cifs_signal_cifsd_for_reconnect(tcon->ses->server, true);
+ cifs_signal_cifsd_for_reconnect(server, true);
}
/* Refresh dfs referral of tcon and mark it for reconnect if needed */
-static int __refresh_tcon(const char *path, struct cifs_ses **sessions, struct cifs_tcon *tcon,
- bool force_refresh)
+static int __refresh_tcon(const char *path, struct cifs_tcon *tcon, bool force_refresh)
{
- struct cifs_ses *ses;
- struct cache_entry *ce;
- struct dfs_info3_param *refs = NULL;
- int numrefs = 0;
+ struct dfs_cache_tgt_list old_tl = DFS_CACHE_TGT_LIST_INIT(old_tl);
+ struct dfs_cache_tgt_list new_tl = DFS_CACHE_TGT_LIST_INIT(new_tl);
+ struct cifs_ses *ses = CIFS_DFS_ROOT_SES(tcon->ses);
+ struct cifs_tcon *ipc = ses->tcon_ipc;
bool needs_refresh = false;
- struct dfs_cache_tgt_list tl = DFS_CACHE_TGT_LIST_INIT(tl);
- int rc = 0;
+ struct cache_entry *ce;
unsigned int xid;
+ int rc = 0;
- ses = find_ipc_from_server_path(sessions, path);
- if (IS_ERR(ses)) {
- cifs_dbg(FYI, "%s: could not find ipc session\n", __func__);
- return PTR_ERR(ses);
- }
+ xid = get_xid();
down_read(&htable_rw_lock);
ce = lookup_cache_entry(path);
needs_refresh = force_refresh || IS_ERR(ce) || cache_entry_expired(ce);
if (!IS_ERR(ce)) {
- rc = get_targets(ce, &tl);
- if (rc)
- cifs_dbg(FYI, "%s: could not get dfs targets: %d\n", __func__, rc);
+ rc = get_targets(ce, &old_tl);
+ cifs_dbg(FYI, "%s: get_targets: %d\n", __func__, rc);
}
up_read(&htable_rw_lock);
@@ -1406,44 +1325,37 @@ static int __refresh_tcon(const char *path, struct cifs_ses **sessions, struct c
goto out;
}
- xid = get_xid();
- rc = get_dfs_referral(xid, ses, path, &refs, &numrefs);
- free_xid(xid);
-
- /* Create or update a cache entry with the new referral */
- if (!rc) {
- dump_refs(refs, numrefs);
-
- down_write(&htable_rw_lock);
- ce = lookup_cache_entry(path);
- if (IS_ERR(ce))
- add_cache_entry_locked(refs, numrefs);
- else if (force_refresh || cache_entry_expired(ce))
- update_cache_entry_locked(ce, refs, numrefs);
- up_write(&htable_rw_lock);
+ spin_lock(&ipc->tc_lock);
+ if (ses->ses_status != SES_GOOD || ipc->status != TID_GOOD) {
+ spin_unlock(&ipc->tc_lock);
+ cifs_dbg(FYI, "%s: skip cache refresh due to disconnected ipc\n", __func__);
+ goto out;
+ }
+ spin_unlock(&ipc->tc_lock);
- mark_for_reconnect_if_needed(tcon, &tl, refs, numrefs);
+ ce = cache_refresh_path(xid, ses, path, true);
+ if (!IS_ERR(ce)) {
+ rc = get_targets(ce, &new_tl);
+ up_read(&htable_rw_lock);
+ cifs_dbg(FYI, "%s: get_targets: %d\n", __func__, rc);
+ mark_for_reconnect_if_needed(tcon->ses->server, &old_tl, &new_tl);
}
out:
- dfs_cache_free_tgts(&tl);
- free_dfs_info_array(refs, numrefs);
+ free_xid(xid);
+ dfs_cache_free_tgts(&old_tl);
+ dfs_cache_free_tgts(&new_tl);
return rc;
}
-static int refresh_tcon(struct cifs_ses **sessions, struct cifs_tcon *tcon, bool force_refresh)
+static int refresh_tcon(struct cifs_tcon *tcon, bool force_refresh)
{
struct TCP_Server_Info *server = tcon->ses->server;
mutex_lock(&server->refpath_lock);
- if (server->origin_fullpath) {
- if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath,
- server->origin_fullpath))
- __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, force_refresh);
- __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, force_refresh);
- }
+ if (server->leaf_fullpath)
+ __refresh_tcon(server->leaf_fullpath + 1, tcon, force_refresh);
mutex_unlock(&server->refpath_lock);
-
return 0;
}
@@ -1461,9 +1373,6 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb)
{
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
- struct mount_group *mg;
- struct cifs_ses *sessions[CACHE_MAX_ENTRIES + 1] = {NULL};
- int rc;
if (!cifs_sb || !cifs_sb->master_tlink)
return -EINVAL;
@@ -1480,21 +1389,6 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb)
cifs_dbg(FYI, "%s: no dfs mount group id\n", __func__);
return -EINVAL;
}
-
- mutex_lock(&mount_group_list_lock);
- mg = find_mount_group_locked(&cifs_sb->dfs_mount_id);
- if (IS_ERR(mg)) {
- mutex_unlock(&mount_group_list_lock);
- cifs_dbg(FYI, "%s: no ipc session for refreshing referral\n", __func__);
- return PTR_ERR(mg);
- }
- kref_get(&mg->refcount);
- mutex_unlock(&mount_group_list_lock);
-
- spin_lock(&mg->lock);
- memcpy(&sessions, mg->sessions, mg->num_sessions * sizeof(mg->sessions[0]));
- spin_unlock(&mg->lock);
-
/*
* After reconnecting to a different server, unique ids won't match anymore, so we disable
* serverino. This prevents dentry revalidation to think the dentry are stale (ESTALE).
@@ -1505,42 +1399,38 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb)
* that have different prefix paths.
*/
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
- rc = refresh_tcon(sessions, tcon, true);
- kref_put(&mg->refcount, mount_group_release);
- return rc;
+ return refresh_tcon(tcon, true);
}
/*
- * Refresh all active dfs mounts regardless of whether they are in cache or not.
- * (cache can be cleared)
+ * Worker that will refresh DFS cache from all active mounts based on lowest TTL value
+ * from a DFS referral.
*/
-static void refresh_mounts(struct cifs_ses **sessions)
+static void refresh_cache_worker(struct work_struct *work)
{
struct TCP_Server_Info *server;
- struct cifs_ses *ses;
struct cifs_tcon *tcon, *ntcon;
struct list_head tcons;
+ struct cifs_ses *ses;
INIT_LIST_HEAD(&tcons);
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
- spin_lock(&server->srv_lock);
- if (!server->is_dfs_conn) {
- spin_unlock(&server->srv_lock);
+ if (!server->leaf_fullpath)
continue;
- }
- spin_unlock(&server->srv_lock);
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ if (ses->tcon_ipc) {
+ ses->ses_count++;
+ list_add_tail(&ses->tcon_ipc->ulist, &tcons);
+ }
list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
- spin_lock(&tcon->tc_lock);
- if (!tcon->ipc && !tcon->need_reconnect) {
+ if (!tcon->ipc) {
tcon->tc_count++;
list_add_tail(&tcon->ulist, &tcons);
}
- spin_unlock(&tcon->tc_lock);
}
}
}
@@ -1552,132 +1442,14 @@ static void refresh_mounts(struct cifs_ses **sessions)
list_del_init(&tcon->ulist);
mutex_lock(&server->refpath_lock);
- if (server->origin_fullpath) {
- if (server->leaf_fullpath && strcasecmp(server->leaf_fullpath,
- server->origin_fullpath))
- __refresh_tcon(server->leaf_fullpath + 1, sessions, tcon, false);
- __refresh_tcon(server->origin_fullpath + 1, sessions, tcon, false);
- }
+ if (server->leaf_fullpath)
+ __refresh_tcon(server->leaf_fullpath + 1, tcon, false);
mutex_unlock(&server->refpath_lock);
- cifs_put_tcon(tcon);
- }
-}
-
-static void refresh_cache(struct cifs_ses **sessions)
-{
- int i;
- struct cifs_ses *ses;
- unsigned int xid;
- char *ref_paths[CACHE_MAX_ENTRIES];
- int count = 0;
- struct cache_entry *ce;
-
- /*
- * Refresh all cached entries. Get all new referrals outside critical section to avoid
- * starvation while performing SMB2 IOCTL on broken or slow connections.
-
- * The cache entries may cover more paths than the active mounts
- * (e.g. domain-based DFS referrals or multi tier DFS setups).
- */
- down_read(&htable_rw_lock);
- for (i = 0; i < CACHE_HTABLE_SIZE; i++) {
- struct hlist_head *l = &cache_htable[i];
-
- hlist_for_each_entry(ce, l, hlist) {
- if (count == ARRAY_SIZE(ref_paths))
- goto out_unlock;
- if (hlist_unhashed(&ce->hlist) || !cache_entry_expired(ce) ||
- IS_ERR(find_ipc_from_server_path(sessions, ce->path)))
- continue;
- ref_paths[count++] = kstrdup(ce->path, GFP_ATOMIC);
- }
- }
-
-out_unlock:
- up_read(&htable_rw_lock);
-
- for (i = 0; i < count; i++) {
- char *path = ref_paths[i];
- struct dfs_info3_param *refs = NULL;
- int numrefs = 0;
- int rc = 0;
-
- if (!path)
- continue;
-
- ses = find_ipc_from_server_path(sessions, path);
- if (IS_ERR(ses))
- goto next_referral;
-
- xid = get_xid();
- rc = get_dfs_referral(xid, ses, path, &refs, &numrefs);
- free_xid(xid);
-
- if (!rc) {
- down_write(&htable_rw_lock);
- ce = lookup_cache_entry(path);
- /*
- * We need to re-check it because other tasks might have it deleted or
- * updated.
- */
- if (!IS_ERR(ce) && cache_entry_expired(ce))
- update_cache_entry_locked(ce, refs, numrefs);
- up_write(&htable_rw_lock);
- }
-
-next_referral:
- kfree(path);
- free_dfs_info_array(refs, numrefs);
- }
-}
-
-/*
- * Worker that will refresh DFS cache and active mounts based on lowest TTL value from a DFS
- * referral.
- */
-static void refresh_cache_worker(struct work_struct *work)
-{
- struct list_head mglist;
- struct mount_group *mg, *tmp_mg;
- struct cifs_ses *sessions[CACHE_MAX_ENTRIES + 1] = {NULL};
- int max_sessions = ARRAY_SIZE(sessions) - 1;
- int i = 0, count;
-
- INIT_LIST_HEAD(&mglist);
-
- /* Get refereces of mount groups */
- mutex_lock(&mount_group_list_lock);
- list_for_each_entry(mg, &mount_group_list, list) {
- kref_get(&mg->refcount);
- list_add(&mg->refresh_list, &mglist);
- }
- mutex_unlock(&mount_group_list_lock);
-
- /* Fill in local array with an NULL-terminated list of all referral server sessions */
- list_for_each_entry(mg, &mglist, refresh_list) {
- if (i >= max_sessions)
- break;
-
- spin_lock(&mg->lock);
- if (i + mg->num_sessions > max_sessions)
- count = max_sessions - i;
+ if (tcon->ipc)
+ cifs_put_smb_ses(tcon->ses);
else
- count = mg->num_sessions;
- memcpy(&sessions[i], mg->sessions, count * sizeof(mg->sessions[0]));
- spin_unlock(&mg->lock);
- i += count;
- }
-
- if (sessions[0]) {
- /* Refresh all active mounts and cached entries */
- refresh_mounts(sessions);
- refresh_cache(sessions);
- }
-
- list_for_each_entry_safe(mg, tmp_mg, &mglist, refresh_list) {
- list_del_init(&mg->refresh_list);
- kref_put(&mg->refcount, mount_group_release);
+ cifs_put_tcon(tcon);
}
spin_lock(&cache_ttl_lock);
diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h
index 52070d1df189..be3b5a44cf82 100644
--- a/fs/cifs/dfs_cache.h
+++ b/fs/cifs/dfs_cache.h
@@ -35,10 +35,7 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, const struct nl
struct dfs_cache_tgt_list *tgt_list);
int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
struct dfs_cache_tgt_list *tgt_list);
-int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
- const struct nls_table *cp, int remap, const char *path,
- const struct dfs_cache_tgt_iterator *it);
-int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it);
+void dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it);
int dfs_cache_get_tgt_referral(const char *path, const struct dfs_cache_tgt_iterator *it,
struct dfs_info3_param *ref);
int dfs_cache_get_tgt_share(char *path, const struct dfs_cache_tgt_iterator *it, char **share,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 8b1c37158556..ad4208bf1e32 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -78,14 +78,13 @@ build_path_from_dentry(struct dentry *direntry, void *page)
prefix);
}
-char *
-build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
- bool prefix)
+char *__build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
+ const char *tree, int tree_len,
+ bool prefix)
{
int dfsplen;
int pplen = 0;
struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
- struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
char dirsep = CIFS_DIR_SEP(cifs_sb);
char *s;
@@ -93,7 +92,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
return ERR_PTR(-ENOMEM);
if (prefix)
- dfsplen = strnlen(tcon->tree_name, MAX_TREE_SIZE + 1);
+ dfsplen = strnlen(tree, tree_len + 1);
else
dfsplen = 0;
@@ -123,7 +122,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
}
if (dfsplen) {
s -= dfsplen;
- memcpy(s, tcon->tree_name, dfsplen);
+ memcpy(s, tree, dfsplen);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
int i;
for (i = 0; i < dfsplen; i++) {
@@ -135,6 +134,16 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
return s;
}
+char *build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page,
+ bool prefix)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
+ struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+
+ return __build_path_from_dentry_optional_prefix(direntry, page, tcon->tree_name,
+ MAX_TREE_SIZE, prefix);
+}
+
/*
* Don't allow path components longer than the server max.
* Don't allow the separator character in a path component.
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 0458d28d71aa..8bf8978bc5d6 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -12,6 +12,7 @@
*
*/
+#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/dns_resolver.h>
#include "dns_resolve.h"
@@ -25,17 +26,13 @@
* @ip_addr: Where to return the IP address.
* @expiry: Where to return the expiry time for the dns record.
*
- * The IP address will be returned in string form, and the caller is
- * responsible for freeing it.
- *
- * Returns length of result on success, -ve on error.
+ * Returns zero success, -ve on error.
*/
int
-dns_resolve_server_name_to_ip(const char *unc, char **ip_addr, time64_t *expiry)
+dns_resolve_server_name_to_ip(const char *unc, struct sockaddr *ip_addr, time64_t *expiry)
{
- struct sockaddr_storage ss;
const char *hostname, *sep;
- char *name;
+ char *ip;
int len, rc;
if (!ip_addr || !unc)
@@ -60,30 +57,32 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr, time64_t *expiry)
__func__, unc);
/* Try to interpret hostname as an IPv4 or IPv6 address */
- rc = cifs_convert_address((struct sockaddr *)&ss, hostname, len);
- if (rc > 0)
- goto name_is_IP_address;
+ rc = cifs_convert_address(ip_addr, hostname, len);
+ if (rc > 0) {
+ cifs_dbg(FYI, "%s: unc is IP, skipping dns upcall: %*.*s\n", __func__, len, len,
+ hostname);
+ return 0;
+ }
/* Perform the upcall */
rc = dns_query(current->nsproxy->net_ns, NULL, hostname, len,
- NULL, ip_addr, expiry, false);
- if (rc < 0)
+ NULL, &ip, expiry, false);
+ if (rc < 0) {
cifs_dbg(FYI, "%s: unable to resolve: %*.*s\n",
__func__, len, len, hostname);
- else
+ } else {
cifs_dbg(FYI, "%s: resolved: %*.*s to %s expiry %llu\n",
- __func__, len, len, hostname, *ip_addr,
+ __func__, len, len, hostname, ip,
expiry ? (*expiry) : 0);
- return rc;
-name_is_IP_address:
- name = kmalloc(len + 1, GFP_KERNEL);
- if (!name)
- return -ENOMEM;
- memcpy(name, hostname, len);
- name[len] = 0;
- cifs_dbg(FYI, "%s: unc is IP, skipping dns upcall: %s\n",
- __func__, name);
- *ip_addr = name;
- return 0;
+ rc = cifs_convert_address(ip_addr, ip, strlen(ip));
+ kfree(ip);
+
+ if (!rc) {
+ cifs_dbg(FYI, "%s: unable to determine ip address\n", __func__);
+ rc = -EHOSTUNREACH;
+ } else
+ rc = 0;
+ }
+ return rc;
}
diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h
index afc0df381246..6eb0c15a2440 100644
--- a/fs/cifs/dns_resolve.h
+++ b/fs/cifs/dns_resolve.h
@@ -11,8 +11,10 @@
#ifndef _DNS_RESOLVE_H
#define _DNS_RESOLVE_H
+#include <linux/net.h>
+
#ifdef __KERNEL__
-extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr, time64_t *expiry);
+int dns_resolve_server_name_to_ip(const char *unc, struct sockaddr *ip_addr, time64_t *expiry);
#endif /* KERNEL */
#endif /* _DNS_RESOLVE_H */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 87b56b1ae117..22dfc1f8b4f1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2646,6 +2646,21 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
return rc;
}
+static int
+cifs_writepage_locked(struct page *page, struct writeback_control *wbc);
+
+static int cifs_write_one_page(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret;
+
+ ret = cifs_writepage_locked(page, wbc);
+ unlock_page(page);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
+
static int cifs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
@@ -2662,10 +2677,11 @@ static int cifs_writepages(struct address_space *mapping,
/*
* If wsize is smaller than the page cache size, default to writing
- * one page at a time via cifs_writepage
+ * one page at a time.
*/
if (cifs_sb->ctx->wsize < PAGE_SIZE)
- return generic_writepages(mapping, wbc);
+ return write_cache_pages(mapping, wbc, cifs_write_one_page,
+ mapping);
xid = get_xid();
if (wbc->range_cyclic) {
@@ -2852,13 +2868,6 @@ retry_write:
return rc;
}
-static int cifs_writepage(struct page *page, struct writeback_control *wbc)
-{
- int rc = cifs_writepage_locked(page, wbc);
- unlock_page(page);
- return rc;
-}
-
static int cifs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
@@ -5231,7 +5240,6 @@ static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio)
const struct address_space_operations cifs_addr_ops = {
.read_folio = cifs_read_folio,
.readahead = cifs_readahead,
- .writepage = cifs_writepage,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
@@ -5240,10 +5248,10 @@ const struct address_space_operations cifs_addr_ops = {
.direct_IO = cifs_direct_io,
.invalidate_folio = cifs_invalidate_folio,
.launder_folio = cifs_launder_folio,
+ .migrate_folio = filemap_migrate_folio,
/*
- * TODO: investigate and if useful we could add an cifs_migratePage
- * helper (under an CONFIG_MIGRATION) in the future, and also
- * investigate and add an is_dirty_writeback helper if needed
+ * TODO: investigate and if useful we could add an is_dirty_writeback
+ * helper if needed
*/
.swap_activate = cifs_swap_activate,
.swap_deactivate = cifs_swap_deactivate,
@@ -5256,7 +5264,6 @@ const struct address_space_operations cifs_addr_ops = {
*/
const struct address_space_operations cifs_addr_ops_smallbuf = {
.read_folio = cifs_read_folio,
- .writepage = cifs_writepage,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
@@ -5264,4 +5271,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.release_folio = cifs_release_folio,
.invalidate_folio = cifs_invalidate_folio,
.launder_folio = cifs_launder_folio,
+ .migrate_folio = filemap_migrate_folio,
};
diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c
index 45119597c765..6d13f8207e96 100644
--- a/fs/cifs/fs_context.c
+++ b/fs/cifs/fs_context.c
@@ -308,7 +308,6 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
{
memcpy(new_ctx, ctx, sizeof(*ctx));
new_ctx->prepath = NULL;
- new_ctx->mount_options = NULL;
new_ctx->nodename = NULL;
new_ctx->username = NULL;
new_ctx->password = NULL;
@@ -317,11 +316,11 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
new_ctx->UNC = NULL;
new_ctx->source = NULL;
new_ctx->iocharset = NULL;
+ new_ctx->leaf_fullpath = NULL;
/*
* Make sure to stay in sync with smb3_cleanup_fs_context_contents()
*/
DUP_CTX_STR(prepath);
- DUP_CTX_STR(mount_options);
DUP_CTX_STR(username);
DUP_CTX_STR(password);
DUP_CTX_STR(server_hostname);
@@ -330,6 +329,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
DUP_CTX_STR(domainname);
DUP_CTX_STR(nodename);
DUP_CTX_STR(iocharset);
+ DUP_CTX_STR(leaf_fullpath);
return 0;
}
@@ -569,17 +569,12 @@ static const struct fs_context_operations smb3_fs_context_ops = {
static int smb3_fs_context_parse_monolithic(struct fs_context *fc,
void *data)
{
- struct smb3_fs_context *ctx = smb3_fc2context(fc);
char *options = data, *key;
int ret = 0;
if (!options)
return 0;
- ctx->mount_options = kstrdup(data, GFP_KERNEL);
- if (ctx->mount_options == NULL)
- return -ENOMEM;
-
ret = security_sb_eat_lsm_opts(options, &fc->security);
if (ret)
return ret;
@@ -884,16 +879,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->nodfs = 1;
break;
case Opt_hard:
- if (result.negated)
+ if (result.negated) {
+ if (ctx->retry == 1)
+ cifs_dbg(VFS, "conflicting hard vs. soft mount options\n");
ctx->retry = 0;
- else
+ } else
ctx->retry = 1;
break;
case Opt_soft:
if (result.negated)
ctx->retry = 1;
- else
+ else {
+ if (ctx->retry == 1)
+ cifs_dbg(VFS, "conflicting hard vs soft mount options\n");
ctx->retry = 0;
+ }
break;
case Opt_mapposix:
if (result.negated)
@@ -1576,8 +1576,6 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
/*
* Make sure this stays in sync with smb3_fs_context_dup()
*/
- kfree(ctx->mount_options);
- ctx->mount_options = NULL;
kfree(ctx->username);
ctx->username = NULL;
kfree_sensitive(ctx->password);
@@ -1596,6 +1594,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
ctx->iocharset = NULL;
kfree(ctx->prepath);
ctx->prepath = NULL;
+ kfree(ctx->leaf_fullpath);
+ ctx->leaf_fullpath = NULL;
}
void
diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h
index bbaee4c2281f..44cb5639ed3b 100644
--- a/fs/cifs/fs_context.h
+++ b/fs/cifs/fs_context.h
@@ -264,8 +264,7 @@ struct smb3_fs_context {
__u16 compression; /* compression algorithm 0xFFFF default 0=disabled */
bool rootfs:1; /* if it's a SMB root file system */
bool witness:1; /* use witness protocol */
-
- char *mount_options;
+ char *leaf_fullpath;
};
extern const struct fs_parameter_spec smb3_fs_parameters[];
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4e2ca3c6e5c0..f145a59af89b 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -632,6 +632,8 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
/* Fill a cifs_fattr struct with info from POSIX info struct */
static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_open_info_data *data,
+ struct cifs_sid *owner,
+ struct cifs_sid *group,
struct super_block *sb, bool adjust_tz, bool symlink)
{
struct smb311_posix_qinfo *info = &data->posix_fi;
@@ -680,8 +682,8 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr, struct cifs_ope
}
/* else if reparse point ... TODO: add support for FIFO and blk dev; special file types */
- fattr->cf_uid = cifs_sb->ctx->linux_uid; /* TODO: map uid and gid from SID */
- fattr->cf_gid = cifs_sb->ctx->linux_gid;
+ sid_to_id(cifs_sb, owner, fattr, SIDOWNER);
+ sid_to_id(cifs_sb, group, fattr, SIDGROUP);
cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n",
fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
@@ -991,12 +993,6 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path,
}
rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path, &tmp_data,
&adjust_tz, &is_reparse_point);
-#ifdef CONFIG_CIFS_DFS_UPCALL
- if (rc == -ENOENT && is_tcon_dfs(tcon))
- rc = cifs_dfs_query_info_nonascii_quirk(xid, tcon,
- cifs_sb,
- full_path);
-#endif
data = &tmp_data;
}
@@ -1175,6 +1171,7 @@ smb311_posix_get_inode_info(struct inode **inode,
struct cifs_fattr fattr = {0};
bool symlink = false;
struct cifs_open_info_data data = {};
+ struct cifs_sid owner, group;
int rc = 0;
int tmprc = 0;
@@ -1192,7 +1189,8 @@ smb311_posix_get_inode_info(struct inode **inode,
goto out;
}
- rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data, &adjust_tz,
+ rc = smb311_posix_query_path_info(xid, tcon, cifs_sb, full_path, &data,
+ &owner, &group, &adjust_tz,
&symlink);
/*
@@ -1201,7 +1199,8 @@ smb311_posix_get_inode_info(struct inode **inode,
switch (rc) {
case 0:
- smb311_posix_info_to_fattr(&fattr, &data, sb, adjust_tz, symlink);
+ smb311_posix_info_to_fattr(&fattr, &data, &owner, &group,
+ sb, adjust_tz, symlink);
break;
case -EREMOTE:
/* DFS link, no metadata available on this server */
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index bd374feeccaa..a5a097a69983 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -428,6 +428,7 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
oparms.disposition = FILE_CREATE;
oparms.fid = &fid;
oparms.reconnect = false;
+ oparms.mode = 0644;
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
NULL, NULL);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 3e68d8208cf5..2a19c7987c5b 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -1136,8 +1136,8 @@ cifs_free_hash(struct shash_desc **sdesc)
* @len: Where to store the length for this page:
* @offset: Where to store the offset for this page
*/
-void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
- unsigned int *len, unsigned int *offset)
+void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page,
+ unsigned int *len, unsigned int *offset)
{
*len = rqst->rq_pagesz;
*offset = (page == 0) ? rqst->rq_offset : 0;
@@ -1258,44 +1258,30 @@ int match_target_ip(struct TCP_Server_Info *server,
bool *result)
{
int rc;
- char *target, *tip = NULL;
- struct sockaddr tipaddr;
+ char *target;
+ struct sockaddr_storage ss;
*result = false;
target = kzalloc(share_len + 3, GFP_KERNEL);
- if (!target) {
- rc = -ENOMEM;
- goto out;
- }
+ if (!target)
+ return -ENOMEM;
scnprintf(target, share_len + 3, "\\\\%.*s", (int)share_len, share);
cifs_dbg(FYI, "%s: target name: %s\n", __func__, target + 2);
- rc = dns_resolve_server_name_to_ip(target, &tip, NULL);
- if (rc < 0)
- goto out;
-
- cifs_dbg(FYI, "%s: target ip: %s\n", __func__, tip);
+ rc = dns_resolve_server_name_to_ip(target, (struct sockaddr *)&ss, NULL);
+ kfree(target);
- if (!cifs_convert_address(&tipaddr, tip, strlen(tip))) {
- cifs_dbg(VFS, "%s: failed to convert target ip address\n",
- __func__);
- rc = -EINVAL;
- goto out;
- }
+ if (rc < 0)
+ return rc;
- *result = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr,
- &tipaddr);
+ spin_lock(&server->srv_lock);
+ *result = cifs_match_ipaddr((struct sockaddr *)&server->dstaddr, (struct sockaddr *)&ss);
+ spin_unlock(&server->srv_lock);
cifs_dbg(FYI, "%s: ip addresses match: %u\n", __func__, *result);
- rc = 0;
-
-out:
- kfree(target);
- kfree(tip);
-
- return rc;
+ return 0;
}
int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix)
@@ -1314,49 +1300,4 @@ int cifs_update_super_prepath(struct cifs_sb_info *cifs_sb, char *prefix)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
return 0;
}
-
-/** cifs_dfs_query_info_nonascii_quirk
- * Handle weird Windows SMB server behaviour. It responds with
- * STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request
- * for "\<server>\<dfsname>\<linkpath>" DFS reference,
- * where <dfsname> contains non-ASCII unicode symbols.
- *
- * Check such DFS reference.
- */
-int cifs_dfs_query_info_nonascii_quirk(const unsigned int xid,
- struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb,
- const char *linkpath)
-{
- char *treename, *dfspath, sep;
- int treenamelen, linkpathlen, rc;
-
- treename = tcon->tree_name;
- /* MS-DFSC: All paths in REQ_GET_DFS_REFERRAL and RESP_GET_DFS_REFERRAL
- * messages MUST be encoded with exactly one leading backslash, not two
- * leading backslashes.
- */
- sep = CIFS_DIR_SEP(cifs_sb);
- if (treename[0] == sep && treename[1] == sep)
- treename++;
- linkpathlen = strlen(linkpath);
- treenamelen = strnlen(treename, MAX_TREE_SIZE + 1);
- dfspath = kzalloc(treenamelen + linkpathlen + 1, GFP_KERNEL);
- if (!dfspath)
- return -ENOMEM;
- if (treenamelen)
- memcpy(dfspath, treename, treenamelen);
- memcpy(dfspath + treenamelen, linkpath, linkpathlen);
- rc = dfs_cache_find(xid, tcon->ses, cifs_sb->local_nls,
- cifs_remap(cifs_sb), dfspath, NULL, NULL);
- if (rc == 0) {
- cifs_dbg(FYI, "DFS ref '%s' is found, emulate -EREMOTE\n",
- dfspath);
- rc = -EREMOTE;
- } else {
- cifs_dbg(FYI, "%s: dfs_cache_find returned %d\n", __func__, rc);
- }
- kfree(dfspath);
- return rc;
-}
#endif
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 9e7d9f0baa18..c47b254f0d1e 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -292,9 +292,10 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
continue;
}
kref_get(&iface->refcount);
+ break;
}
- if (!list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ if (list_entry_is_head(iface, &ses->iface_list, iface_head)) {
rc = 1;
iface = NULL;
cifs_dbg(FYI, "unable to find a suitable iface\n");
@@ -814,6 +815,7 @@ int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
return -EINVAL;
}
if (tilen) {
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = kmemdup(bcc_ptr + tioffset, tilen,
GFP_KERNEL);
if (!ses->auth_key.response) {
@@ -1427,6 +1429,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
goto out_put_spnego_key;
}
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
GFP_KERNEL);
if (!ses->auth_key.response) {
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 50480751e521..4cb364454e13 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -562,17 +562,20 @@ static int cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls,
cifs_remap(cifs_sb));
- if (!rc)
- move_cifs_info_to_smb2(&data->fi, &fi);
*adjustTZ = true;
}
- if (!rc && (le32_to_cpu(fi.Attributes) & ATTR_REPARSE)) {
+ if (!rc) {
int tmprc;
int oplock = 0;
struct cifs_fid fid;
struct cifs_open_parms oparms;
+ move_cifs_info_to_smb2(&data->fi, &fi);
+
+ if (!(le32_to_cpu(fi.Attributes) & ATTR_REPARSE))
+ return 0;
+
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = FILE_READ_ATTRIBUTES;
@@ -716,17 +719,25 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
static int cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock,
void *buf)
{
- FILE_ALL_INFO *fi = buf;
+ struct cifs_open_info_data *data = buf;
+ FILE_ALL_INFO fi = {};
+ int rc;
if (!(oparms->tcon->ses->capabilities & CAP_NT_SMBS))
- return SMBLegacyOpen(xid, oparms->tcon, oparms->path,
- oparms->disposition,
- oparms->desired_access,
- oparms->create_options,
- &oparms->fid->netfid, oplock, fi,
- oparms->cifs_sb->local_nls,
- cifs_remap(oparms->cifs_sb));
- return CIFS_open(xid, oparms, oplock, fi);
+ rc = SMBLegacyOpen(xid, oparms->tcon, oparms->path,
+ oparms->disposition,
+ oparms->desired_access,
+ oparms->create_options,
+ &oparms->fid->netfid, oplock, &fi,
+ oparms->cifs_sb->local_nls,
+ cifs_remap(oparms->cifs_sb));
+ else
+ rc = CIFS_open(xid, oparms, oplock, &fi);
+
+ if (!rc && data)
+ move_cifs_info_to_smb2(&data->fi, &fi);
+
+ return rc;
}
static void
@@ -1050,7 +1061,7 @@ cifs_make_node(unsigned int xid, struct inode *inode,
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct inode *newinode = NULL;
int rc = -EPERM;
- FILE_ALL_INFO *buf = NULL;
+ struct cifs_open_info_data buf = {};
struct cifs_io_parms io_parms;
__u32 oplock = 0;
struct cifs_fid fid;
@@ -1082,14 +1093,14 @@ cifs_make_node(unsigned int xid, struct inode *inode,
cifs_sb->local_nls,
cifs_remap(cifs_sb));
if (rc)
- goto out;
+ return rc;
rc = cifs_get_inode_info_unix(&newinode, full_path,
inode->i_sb, xid);
if (rc == 0)
d_instantiate(dentry, newinode);
- goto out;
+ return rc;
}
/*
@@ -1097,19 +1108,13 @@ cifs_make_node(unsigned int xid, struct inode *inode,
* support block and char device (no socket & fifo)
*/
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- goto out;
+ return rc;
if (!S_ISCHR(mode) && !S_ISBLK(mode))
- goto out;
+ return rc;
cifs_dbg(FYI, "sfu compat create special file\n");
- buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
- if (buf == NULL) {
- rc = -ENOMEM;
- goto out;
- }
-
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = GENERIC_WRITE;
@@ -1124,21 +1129,21 @@ cifs_make_node(unsigned int xid, struct inode *inode,
oplock = REQ_OPLOCK;
else
oplock = 0;
- rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &buf);
if (rc)
- goto out;
+ return rc;
/*
* BB Do not bother to decode buf since no local inode yet to put
* timestamps in, but we can reuse it safely.
*/
- pdev = (struct win_dev *)buf;
+ pdev = (struct win_dev *)&buf.fi;
io_parms.pid = current->tgid;
io_parms.tcon = tcon;
io_parms.offset = 0;
io_parms.length = sizeof(struct win_dev);
- iov[1].iov_base = buf;
+ iov[1].iov_base = &buf.fi;
iov[1].iov_len = sizeof(struct win_dev);
if (S_ISCHR(mode)) {
memcpy(pdev->type, "IntxCHR", 8);
@@ -1157,8 +1162,8 @@ cifs_make_node(unsigned int xid, struct inode *inode,
d_drop(dentry);
/* FIXME: add code here to set EAs */
-out:
- kfree(buf);
+
+ cifs_free_open_info(&buf);
return rc;
}
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index ffbd9a99fc12..ba6cc50af390 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -122,8 +122,8 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32
struct smb2_hdr *hdr = err_iov.iov_base;
if (unlikely(!err_iov.iov_base || err_buftype == CIFS_NO_BUFFER))
- rc = -ENOMEM;
- else if (hdr->Status == STATUS_STOPPED_ON_SYMLINK) {
+ goto out;
+ if (hdr->Status == STATUS_STOPPED_ON_SYMLINK) {
rc = smb2_parse_symlink_response(oparms->cifs_sb, &err_iov,
&data->symlink_target);
if (!rc) {
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 68e08c85fbb8..8521adf9ce79 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -59,6 +59,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
__u32 desired_access, __u32 create_disposition, __u32 create_options,
umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile,
+ __u8 **extbuf, size_t *extbuflen,
struct kvec *err_iov, int *err_buftype)
{
struct cop_vars *vars = NULL;
@@ -430,6 +431,21 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
&rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */,
(char *)&idata->posix_fi);
}
+ if (rc == 0) {
+ unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength);
+
+ if (length > sizeof(idata->posix_fi)) {
+ char *base = (char *)rsp_iov[1].iov_base +
+ le16_to_cpu(qi_rsp->OutputBufferOffset) +
+ sizeof(idata->posix_fi);
+ *extbuflen = length - sizeof(idata->posix_fi);
+ *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL);
+ if (!*extbuf)
+ rc = -ENOMEM;
+ } else {
+ rc = -EINVAL;
+ }
+ }
if (rqst[1].rq_iov)
SMB2_query_info_free(&rqst[1]);
if (rqst[2].rq_iov)
@@ -539,23 +555,43 @@ int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN,
create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile,
- err_iov, err_buftype);
- if (rc == -EOPNOTSUPP) {
- if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER &&
- ((struct smb2_hdr *)err_iov[0].iov_base)->Command == SMB2_CREATE &&
- ((struct smb2_hdr *)err_iov[0].iov_base)->Status == STATUS_STOPPED_ON_SYMLINK) {
- rc = smb2_parse_symlink_response(cifs_sb, err_iov, &data->symlink_target);
+ NULL, NULL, err_iov, err_buftype);
+ if (rc) {
+ struct smb2_hdr *hdr = err_iov[0].iov_base;
+
+ if (unlikely(!hdr || err_buftype[0] == CIFS_NO_BUFFER))
+ goto out;
+ if (rc == -EOPNOTSUPP && hdr->Command == SMB2_CREATE &&
+ hdr->Status == STATUS_STOPPED_ON_SYMLINK) {
+ rc = smb2_parse_symlink_response(cifs_sb, err_iov,
+ &data->symlink_target);
if (rc)
goto out;
- }
- *reparse = true;
- create_options |= OPEN_REPARSE_POINT;
- /* Failed on a symbolic link - query a reparse point info */
- cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES,
- FILE_OPEN, create_options, ACL_NO_MODE, data,
- SMB2_OP_QUERY_INFO, cfile, NULL, NULL);
+ *reparse = true;
+ create_options |= OPEN_REPARSE_POINT;
+
+ /* Failed on a symbolic link - query a reparse point info */
+ cifs_get_readable_path(tcon, full_path, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN,
+ create_options, ACL_NO_MODE, data,
+ SMB2_OP_QUERY_INFO, cfile, NULL, NULL,
+ NULL, NULL);
+ goto out;
+ } else if (rc != -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) &&
+ hdr->Status == STATUS_OBJECT_NAME_INVALID) {
+ /*
+ * Handle weird Windows SMB server behaviour. It responds with
+ * STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request
+ * for "\<server>\<dfsname>\<linkpath>" DFS reference,
+ * where <dfsname> contains non-ASCII unicode symbols.
+ */
+ rc = -EREMOTE;
+ }
+ if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && cifs_sb &&
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS))
+ rc = -EOPNOTSUPP;
}
out:
@@ -568,13 +604,20 @@ out:
int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
- struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse)
+ struct cifs_open_info_data *data,
+ struct cifs_sid *owner,
+ struct cifs_sid *group,
+ bool *adjust_tz, bool *reparse)
{
int rc;
__u32 create_options = 0;
struct cifsFileInfo *cfile;
struct kvec err_iov[3] = {};
int err_buftype[3] = {};
+ __u8 *sidsbuf = NULL;
+ __u8 *sidsbuf_end = NULL;
+ size_t sidsbuflen = 0;
+ size_t owner_len, group_len;
*adjust_tz = false;
*reparse = false;
@@ -589,7 +632,7 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN,
create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile,
- err_iov, err_buftype);
+ &sidsbuf, &sidsbuflen, err_iov, err_buftype);
if (rc == -EOPNOTSUPP) {
/* BB TODO: When support for special files added to Samba re-verify this path */
if (err_iov[0].iov_base && err_buftype[0] != CIFS_NO_BUFFER &&
@@ -606,10 +649,31 @@ int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES,
FILE_OPEN, create_options, ACL_NO_MODE, data,
- SMB2_OP_POSIX_QUERY_INFO, cfile, NULL, NULL);
+ SMB2_OP_POSIX_QUERY_INFO, cfile,
+ &sidsbuf, &sidsbuflen, NULL, NULL);
+ }
+
+ if (rc == 0) {
+ sidsbuf_end = sidsbuf + sidsbuflen;
+
+ owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end);
+ if (owner_len == -1) {
+ rc = -EINVAL;
+ goto out;
+ }
+ memcpy(owner, sidsbuf, owner_len);
+
+ group_len = posix_info_sid_size(
+ sidsbuf + owner_len, sidsbuf_end);
+ if (group_len == -1) {
+ rc = -EINVAL;
+ goto out;
+ }
+ memcpy(group, sidsbuf + owner_len, group_len);
}
out:
+ kfree(sidsbuf);
free_rsp_buf(err_buftype[0], err_iov[0].iov_base);
free_rsp_buf(err_buftype[1], err_iov[1].iov_base);
free_rsp_buf(err_buftype[2], err_iov[2].iov_base);
@@ -624,7 +688,7 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
return smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, NULL, NULL);
}
void
@@ -646,7 +710,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
CREATE_NOT_FILE, ACL_NO_MODE,
- &data, SMB2_OP_SET_INFO, cfile, NULL, NULL);
+ &data, SMB2_OP_SET_INFO, cfile, NULL, NULL, NULL, NULL);
if (tmprc == 0)
cifs_i->cifsAttrs = dosattrs;
}
@@ -658,7 +722,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
drop_cached_dir_by_name(xid, tcon, name, cifs_sb);
return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
CREATE_NOT_FILE, ACL_NO_MODE,
- NULL, SMB2_OP_RMDIR, NULL, NULL, NULL);
+ NULL, SMB2_OP_RMDIR, NULL, NULL, NULL, NULL, NULL);
}
int
@@ -667,7 +731,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
{
return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL);
+ ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL, NULL, NULL);
}
static int
@@ -686,7 +750,7 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
}
rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name,
- command, cfile, NULL, NULL);
+ command, cfile, NULL, NULL, NULL, NULL);
smb2_rename_path:
kfree(smb2_to_name);
return rc;
@@ -727,7 +791,7 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
return smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE,
- &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL);
+ &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL, NULL, NULL);
}
int
@@ -754,7 +818,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_WRITE_ATTRIBUTES, FILE_OPEN,
0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile,
- NULL, NULL);
+ NULL, NULL, NULL, NULL);
cifs_put_tlink(tlink);
return rc;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 32b3877b538a..e6bcd2baf446 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -530,7 +530,6 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
p = buf;
spin_lock(&ses->iface_lock);
- ses->iface_count = 0;
/*
* Go through iface_list and do kref_put to remove
* any unused ifaces. ifaces in use will be removed
@@ -540,6 +539,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
iface_head) {
iface->is_active = 0;
kref_put(&iface->refcount, release_iface);
+ ses->iface_count--;
}
spin_unlock(&ses->iface_lock);
@@ -618,6 +618,7 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
/* just get a ref so that it doesn't get picked/freed */
iface->is_active = 1;
kref_get(&iface->refcount);
+ ses->iface_count++;
spin_unlock(&ses->iface_lock);
goto next_iface;
} else if (ret < 0) {
@@ -796,7 +797,9 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
__le16 *utf16_path;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ int err_buftype = CIFS_NO_BUFFER;
struct cifs_open_parms oparms;
+ struct kvec err_iov = {};
struct cifs_fid fid;
struct cached_fid *cfid;
@@ -820,14 +823,32 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
oparms.fid = &fid;
oparms.reconnect = false;
- rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL,
- NULL);
+ rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
+ &err_iov, &err_buftype);
if (rc) {
- kfree(utf16_path);
- return rc;
+ struct smb2_hdr *hdr = err_iov.iov_base;
+
+ if (unlikely(!hdr || err_buftype == CIFS_NO_BUFFER))
+ goto out;
+ /*
+ * Handle weird Windows SMB server behaviour. It responds with
+ * STATUS_OBJECT_NAME_INVALID code to SMB2 QUERY_INFO request
+ * for "\<server>\<dfsname>\<linkpath>" DFS reference,
+ * where <dfsname> contains non-ASCII unicode symbols.
+ */
+ if (rc != -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) &&
+ hdr->Status == STATUS_OBJECT_NAME_INVALID)
+ rc = -EREMOTE;
+ if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && cifs_sb &&
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS))
+ rc = -EOPNOTSUPP;
+ goto out;
}
rc = SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+
+out:
+ free_rsp_buf(err_buftype, err_iov.iov_base);
kfree(utf16_path);
return rc;
}
@@ -4204,69 +4225,82 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8);
}
-/* We can not use the normal sg_set_buf() as we will sometimes pass a
- * stack object as buf.
- */
-static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf,
- unsigned int buflen)
+static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst,
+ int num_rqst, const u8 *sig, u8 **iv,
+ struct aead_request **req, struct scatterlist **sgl,
+ unsigned int *num_sgs)
{
- void *addr;
- /*
- * VMAP_STACK (at least) puts stack into the vmalloc address space
- */
- if (is_vmalloc_addr(buf))
- addr = vmalloc_to_page(buf);
- else
- addr = virt_to_page(buf);
- sg_set_page(sg, addr, buflen, offset_in_page(buf));
+ unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm);
+ unsigned int iv_size = crypto_aead_ivsize(tfm);
+ unsigned int len;
+ u8 *p;
+
+ *num_sgs = cifs_get_num_sgs(rqst, num_rqst, sig);
+
+ len = iv_size;
+ len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1);
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+ len += req_size;
+ len = ALIGN(len, __alignof__(struct scatterlist));
+ len += *num_sgs * sizeof(**sgl);
+
+ p = kmalloc(len, GFP_ATOMIC);
+ if (!p)
+ return NULL;
+
+ *iv = (u8 *)PTR_ALIGN(p, crypto_aead_alignmask(tfm) + 1);
+ *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size,
+ crypto_tfm_ctx_alignment());
+ *sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size,
+ __alignof__(struct scatterlist));
+ return p;
}
-/* Assumes the first rqst has a transform header as the first iov.
- * I.e.
- * rqst[0].rq_iov[0] is transform header
- * rqst[0].rq_iov[1+] data to be encrypted/decrypted
- * rqst[1+].rq_iov[0+] data to be encrypted/decrypted
- */
-static struct scatterlist *
-init_sg(int num_rqst, struct smb_rqst *rqst, u8 *sign)
+static void *smb2_get_aead_req(struct crypto_aead *tfm, const struct smb_rqst *rqst,
+ int num_rqst, const u8 *sig, u8 **iv,
+ struct aead_request **req, struct scatterlist **sgl)
{
- unsigned int sg_len;
+ unsigned int off, len, skip;
struct scatterlist *sg;
- unsigned int i;
- unsigned int j;
- unsigned int idx = 0;
- int skip;
-
- sg_len = 1;
- for (i = 0; i < num_rqst; i++)
- sg_len += rqst[i].rq_nvec + rqst[i].rq_npages;
+ unsigned int num_sgs;
+ unsigned long addr;
+ int i, j;
+ void *p;
- sg = kmalloc_array(sg_len, sizeof(struct scatterlist), GFP_KERNEL);
- if (!sg)
+ p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, sgl, &num_sgs);
+ if (!p)
return NULL;
- sg_init_table(sg, sg_len);
+ sg_init_table(*sgl, num_sgs);
+ sg = *sgl;
+
+ /* Assumes the first rqst has a transform header as the first iov.
+ * I.e.
+ * rqst[0].rq_iov[0] is transform header
+ * rqst[0].rq_iov[1+] data to be encrypted/decrypted
+ * rqst[1+].rq_iov[0+] data to be encrypted/decrypted
+ */
for (i = 0; i < num_rqst; i++) {
+ /*
+ * The first rqst has a transform header where the
+ * first 20 bytes are not part of the encrypted blob.
+ */
for (j = 0; j < rqst[i].rq_nvec; j++) {
- /*
- * The first rqst has a transform header where the
- * first 20 bytes are not part of the encrypted blob
- */
- skip = (i == 0) && (j == 0) ? 20 : 0;
- smb2_sg_set_buf(&sg[idx++],
- rqst[i].rq_iov[j].iov_base + skip,
- rqst[i].rq_iov[j].iov_len - skip);
- }
+ struct kvec *iov = &rqst[i].rq_iov[j];
+ skip = (i == 0) && (j == 0) ? 20 : 0;
+ addr = (unsigned long)iov->iov_base + skip;
+ len = iov->iov_len - skip;
+ sg = cifs_sg_set_buf(sg, (void *)addr, len);
+ }
for (j = 0; j < rqst[i].rq_npages; j++) {
- unsigned int len, offset;
-
- rqst_page_get_length(&rqst[i], j, &len, &offset);
- sg_set_page(&sg[idx++], rqst[i].rq_pages[j], len, offset);
+ rqst_page_get_length(&rqst[i], j, &len, &off);
+ sg_set_page(sg++, rqst[i].rq_pages[j], len, off);
}
}
- smb2_sg_set_buf(&sg[idx], sign, SMB2_SIGNATURE_SIZE);
- return sg;
+ cifs_sg_set_buf(sg, sig, SMB2_SIGNATURE_SIZE);
+
+ return p;
}
static int
@@ -4314,11 +4348,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
u8 sign[SMB2_SIGNATURE_SIZE] = {};
u8 key[SMB3_ENC_DEC_KEY_SIZE];
struct aead_request *req;
- char *iv;
- unsigned int iv_len;
+ u8 *iv;
DECLARE_CRYPTO_WAIT(wait);
struct crypto_aead *tfm;
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
+ void *creq;
rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key);
if (rc) {
@@ -4352,32 +4386,15 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
return rc;
}
- req = aead_request_alloc(tfm, GFP_KERNEL);
- if (!req) {
- cifs_server_dbg(VFS, "%s: Failed to alloc aead request\n", __func__);
+ creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg);
+ if (unlikely(!creq))
return -ENOMEM;
- }
if (!enc) {
memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE);
crypt_len += SMB2_SIGNATURE_SIZE;
}
- sg = init_sg(num_rqst, rqst, sign);
- if (!sg) {
- cifs_server_dbg(VFS, "%s: Failed to init sg\n", __func__);
- rc = -ENOMEM;
- goto free_req;
- }
-
- iv_len = crypto_aead_ivsize(tfm);
- iv = kzalloc(iv_len, GFP_KERNEL);
- if (!iv) {
- cifs_server_dbg(VFS, "%s: Failed to alloc iv\n", __func__);
- rc = -ENOMEM;
- goto free_sg;
- }
-
if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) ||
(server->cipher_type == SMB2_ENCRYPTION_AES256_GCM))
memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE);
@@ -4386,6 +4403,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES_CCM_NONCE);
}
+ aead_request_set_tfm(req, tfm);
aead_request_set_crypt(req, sg, sg, crypt_len, iv);
aead_request_set_ad(req, assoc_data_len);
@@ -4398,11 +4416,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
if (!rc && enc)
memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
- kfree_sensitive(iv);
-free_sg:
- kfree_sensitive(sg);
-free_req:
- kfree_sensitive(req);
+ kfree_sensitive(creq);
return rc;
}
@@ -4445,21 +4459,27 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
int rc = -ENOMEM;
for (i = 1; i < num_rqst; i++) {
- npages = old_rq[i - 1].rq_npages;
+ struct smb_rqst *old = &old_rq[i - 1];
+ struct smb_rqst *new = &new_rq[i];
+
+ orig_len += smb_rqst_len(server, old);
+ new->rq_iov = old->rq_iov;
+ new->rq_nvec = old->rq_nvec;
+
+ npages = old->rq_npages;
+ if (!npages)
+ continue;
+
pages = kmalloc_array(npages, sizeof(struct page *),
GFP_KERNEL);
if (!pages)
goto err_free;
- new_rq[i].rq_pages = pages;
- new_rq[i].rq_npages = npages;
- new_rq[i].rq_offset = old_rq[i - 1].rq_offset;
- new_rq[i].rq_pagesz = old_rq[i - 1].rq_pagesz;
- new_rq[i].rq_tailsz = old_rq[i - 1].rq_tailsz;
- new_rq[i].rq_iov = old_rq[i - 1].rq_iov;
- new_rq[i].rq_nvec = old_rq[i - 1].rq_nvec;
-
- orig_len += smb_rqst_len(server, &old_rq[i - 1]);
+ new->rq_pages = pages;
+ new->rq_npages = npages;
+ new->rq_offset = old->rq_offset;
+ new->rq_pagesz = old->rq_pagesz;
+ new->rq_tailsz = old->rq_tailsz;
for (j = 0; j < npages; j++) {
pages[j] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
@@ -4469,17 +4489,12 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
/* copy pages form the old */
for (j = 0; j < npages; j++) {
- char *dst, *src;
unsigned int offset, len;
- rqst_page_get_length(&new_rq[i], j, &len, &offset);
-
- dst = (char *) kmap(new_rq[i].rq_pages[j]) + offset;
- src = (char *) kmap(old_rq[i - 1].rq_pages[j]) + offset;
+ rqst_page_get_length(new, j, &len, &offset);
- memcpy(dst, src, len);
- kunmap(new_rq[i].rq_pages[j]);
- kunmap(old_rq[i - 1].rq_pages[j]);
+ memcpy_page(new->rq_pages[j], offset,
+ old->rq_pages[j], offset, len);
}
}
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index a5695748a89b..2c9ffa921e6f 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -541,9 +541,10 @@ static void
assemble_neg_contexts(struct smb2_negotiate_req *req,
struct TCP_Server_Info *server, unsigned int *total_len)
{
- char *pneg_ctxt;
- char *hostname = NULL;
unsigned int ctxt_len, neg_context_count;
+ struct TCP_Server_Info *pserver;
+ char *pneg_ctxt;
+ char *hostname;
if (*total_len > 200) {
/* In case length corrupted don't want to overrun smb buffer */
@@ -574,8 +575,9 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
* secondary channels don't have the hostname field populated
* use the hostname field in the primary channel instead
*/
- hostname = CIFS_SERVER_IS_CHAN(server) ?
- server->primary_server->hostname : server->hostname;
+ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server;
+ cifs_server_lock(pserver);
+ hostname = pserver->hostname;
if (hostname && (hostname[0] != 0)) {
ctxt_len = build_netname_ctxt((struct smb2_netname_neg_context *)pneg_ctxt,
hostname);
@@ -584,6 +586,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
neg_context_count = 3;
} else
neg_context_count = 2;
+ cifs_server_unlock(pserver);
build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt);
*total_len += sizeof(struct smb2_posix_neg_context);
@@ -1450,6 +1453,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
/* keep session key if binding */
if (!is_binding) {
+ kfree_sensitive(ses->auth_key.response);
ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
GFP_KERNEL);
if (!ses->auth_key.response) {
@@ -1479,8 +1483,11 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
out_put_spnego_key:
key_invalidate(spnego_key);
key_put(spnego_key);
- if (rc)
+ if (rc) {
kfree_sensitive(ses->auth_key.response);
+ ses->auth_key.response = NULL;
+ ses->auth_key.len = 0;
+ }
out:
sess_data->result = rc;
sess_data->func = NULL;
@@ -4156,12 +4163,15 @@ smb2_readv_callback(struct mid_q_entry *mid)
(struct smb2_hdr *)rdata->iov[0].iov_base;
struct cifs_credits credits = { .value = 0, .instance = 0 };
struct smb_rqst rqst = { .rq_iov = &rdata->iov[1],
- .rq_nvec = 1,
- .rq_pages = rdata->pages,
- .rq_offset = rdata->page_offset,
- .rq_npages = rdata->nr_pages,
- .rq_pagesz = rdata->pagesz,
- .rq_tailsz = rdata->tailsz };
+ .rq_nvec = 1, };
+
+ if (rdata->got_bytes) {
+ rqst.rq_pages = rdata->pages;
+ rqst.rq_offset = rdata->page_offset;
+ rqst.rq_npages = rdata->nr_pages;
+ rqst.rq_pagesz = rdata->pagesz;
+ rqst.rq_tailsz = rdata->tailsz;
+ }
WARN_ONCE(rdata->server != mid->server,
"rdata server %p != mid server %p",
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index be21b5d26f67..d5d7ffb7711c 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -277,7 +277,10 @@ extern int smb2_query_info_compound(const unsigned int xid,
/* query path info from the server using SMB311 POSIX extensions*/
int smb311_posix_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
- struct cifs_open_info_data *data, bool *adjust_tz, bool *reparse);
+ struct cifs_open_info_data *data,
+ struct cifs_sid *owner,
+ struct cifs_sid *group,
+ bool *adjust_tz, bool *reparse);
int posix_info_parse(const void *beg, const void *end,
struct smb2_posix_info_parsed *out);
int posix_info_sid_size(const void *beg, const void *end);
diff --git a/fs/coredump.c b/fs/coredump.c
index a4c30bb900fe..de78bde2991b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -68,7 +68,10 @@ struct core_name {
static int expand_corename(struct core_name *cn, int size)
{
- char *corename = krealloc(cn->corename, size, GFP_KERNEL);
+ char *corename;
+
+ size = kmalloc_size_roundup(size);
+ corename = krealloc(cn->corename, size, GFP_KERNEL);
if (!corename)
return -ENOMEM;
@@ -76,7 +79,7 @@ static int expand_corename(struct core_name *cn, int size)
if (size > core_name_size) /* racy but harmless */
core_name_size = size;
- cn->size = ksize(corename);
+ cn->size = size;
cn->corename = corename;
return 0;
}
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 8b80ca0cd65f..4450721ec83c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -645,6 +645,7 @@ static void add_sock(struct socket *sock, struct connection *con)
if (dlm_config.ci_protocol == DLM_PROTO_SCTP)
sk->sk_state_change = lowcomms_state_change;
sk->sk_allocation = GFP_NOFS;
+ sk->sk_use_task_frag = false;
sk->sk_error_report = lowcomms_error_report;
release_sock(sk);
}
@@ -1769,6 +1770,7 @@ static int dlm_listen_for_all(void)
listen_con.sock = sock;
sock->sk->sk_allocation = GFP_NOFS;
+ sock->sk->sk_use_task_frag = false;
sock->sk->sk_data_ready = lowcomms_listen_data_ready;
release_sock(sock->sk);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 481788c24a68..626a615dafc2 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -577,26 +577,25 @@ static int erofs_fc_parse_param(struct fs_context *fc,
}
++ctx->devs->extra_devices;
break;
- case Opt_fsid:
#ifdef CONFIG_EROFS_FS_ONDEMAND
+ case Opt_fsid:
kfree(ctx->fsid);
ctx->fsid = kstrdup(param->string, GFP_KERNEL);
if (!ctx->fsid)
return -ENOMEM;
-#else
- errorfc(fc, "fsid option not supported");
-#endif
break;
case Opt_domain_id:
-#ifdef CONFIG_EROFS_FS_ONDEMAND
kfree(ctx->domain_id);
ctx->domain_id = kstrdup(param->string, GFP_KERNEL);
if (!ctx->domain_id)
return -ENOMEM;
+ break;
#else
- errorfc(fc, "domain_id option not supported");
-#endif
+ case Opt_fsid:
+ case Opt_domain_id:
+ errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
break;
+#endif
default:
return -ENOPARAM;
}
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index ccf7c55d477f..5200bb86e264 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1032,12 +1032,12 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
if (!be->decompressed_pages)
be->decompressed_pages =
- kvcalloc(be->nr_pages, sizeof(struct page *),
- GFP_KERNEL | __GFP_NOFAIL);
+ kcalloc(be->nr_pages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_NOFAIL);
if (!be->compressed_pages)
be->compressed_pages =
- kvcalloc(pclusterpages, sizeof(struct page *),
- GFP_KERNEL | __GFP_NOFAIL);
+ kcalloc(pclusterpages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_NOFAIL);
z_erofs_parse_out_bvecs(be);
err2 = z_erofs_parse_in_bvecs(be, &overlapped);
@@ -1085,7 +1085,7 @@ out:
}
if (be->compressed_pages < be->onstack_pages ||
be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
- kvfree(be->compressed_pages);
+ kfree(be->compressed_pages);
z_erofs_fill_other_copies(be, err);
for (i = 0; i < be->nr_pages; ++i) {
@@ -1104,7 +1104,7 @@ out:
}
if (be->decompressed_pages != be->onstack_pages)
- kvfree(be->decompressed_pages);
+ kfree(be->decompressed_pages);
pcl->length = 0;
pcl->partial = true;
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 0150570c33aa..98fb90b9af71 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -793,12 +793,16 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
/*
- * No strict rule how to describe extents for post EOF, yet
- * we need do like below. Otherwise, iomap itself will get
+ * No strict rule on how to describe extents for post EOF, yet
+ * we need to do like below. Otherwise, iomap itself will get
* into an endless loop on post EOF.
+ *
+ * Calculate the effective offset by subtracting extent start
+ * (map.m_la) from the requested offset, and add it to length.
+ * (NB: offset >= map.m_la always)
*/
if (iomap->offset >= inode->i_size)
- iomap->length = length + map.m_la - offset;
+ iomap->length = length + offset - map.m_la;
}
iomap->flags = 0;
return 0;
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 0fc08fdcba73..1dfa67f307f1 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -33,10 +33,9 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
struct exfat_chain *p_dir, int entry, unsigned short *uniname)
{
int i;
- struct exfat_entry_set_cache *es;
+ struct exfat_entry_set_cache es;
- es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES);
- if (!es)
+ if (exfat_get_dentry_set(&es, sb, p_dir, entry, ES_ALL_ENTRIES))
return;
/*
@@ -45,8 +44,8 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
* Third entry : first file-name entry
* So, the index of first file-name dentry should start from 2.
*/
- for (i = 2; i < es->num_entries; i++) {
- struct exfat_dentry *ep = exfat_get_dentry_cached(es, i);
+ for (i = ES_IDX_FIRST_FILENAME; i < es.num_entries; i++) {
+ struct exfat_dentry *ep = exfat_get_dentry_cached(&es, i);
/* end of name entry */
if (exfat_get_entry_type(ep) != TYPE_EXTEND)
@@ -56,13 +55,13 @@ static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
uniname += EXFAT_FILE_NAME_LEN;
}
- exfat_free_dentry_set(es, false);
+ exfat_put_dentry_set(&es, false);
}
/* read a directory entry from the opened directory */
static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_entry *dir_entry)
{
- int i, dentries_per_clu, dentries_per_clu_bits = 0, num_ext;
+ int i, dentries_per_clu, num_ext;
unsigned int type, clu_offset, max_dentries;
struct exfat_chain dir, clu;
struct exfat_uni_name uni_name;
@@ -84,11 +83,10 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
EXFAT_B_TO_CLU(i_size_read(inode), sbi), ei->flags);
dentries_per_clu = sbi->dentries_per_clu;
- dentries_per_clu_bits = ilog2(dentries_per_clu);
max_dentries = (unsigned int)min_t(u64, MAX_EXFAT_DENTRIES,
- (u64)sbi->num_clusters << dentries_per_clu_bits);
+ (u64)EXFAT_CLU_TO_DEN(sbi->num_clusters, sbi));
- clu_offset = dentry >> dentries_per_clu_bits;
+ clu_offset = EXFAT_DEN_TO_CLU(dentry, sbi);
exfat_chain_dup(&clu, &dir);
if (clu.flags == ALLOC_NO_FAT_CHAIN) {
@@ -163,7 +161,7 @@ static int exfat_readdir(struct inode *inode, loff_t *cpos, struct exfat_dir_ent
dir_entry->entry = dentry;
brelse(bh);
- ei->hint_bmap.off = dentry >> dentries_per_clu_bits;
+ ei->hint_bmap.off = EXFAT_DEN_TO_CLU(dentry, sbi);
ei->hint_bmap.clu = clu.dir;
*cpos = EXFAT_DEN_TO_B(dentry + 1 + num_ext);
@@ -337,7 +335,7 @@ int exfat_calc_num_entries(struct exfat_uni_name *p_uniname)
return -EINVAL;
/* 1 file entry + 1 stream entry + name entries */
- return ((len - 1) / EXFAT_FILE_NAME_LEN + 3);
+ return ES_ENTRY_NUM(len);
}
unsigned int exfat_get_entry_type(struct exfat_dentry *ep)
@@ -592,18 +590,18 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es)
unsigned short chksum = 0;
struct exfat_dentry *ep;
- for (i = 0; i < es->num_entries; i++) {
+ for (i = ES_IDX_FILE; i < es->num_entries; i++) {
ep = exfat_get_dentry_cached(es, i);
chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum,
chksum_type);
chksum_type = CS_DEFAULT;
}
- ep = exfat_get_dentry_cached(es, 0);
+ ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
ep->dentry.file.checksum = cpu_to_le16(chksum);
es->modified = true;
}
-int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync)
+int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync)
{
int i, err = 0;
@@ -615,7 +613,10 @@ int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync)
bforget(es->bh[i]);
else
brelse(es->bh[i]);
- kfree(es);
+
+ if (IS_DYNAMIC_ES(es))
+ kfree(es->bh);
+
return err;
}
@@ -812,14 +813,14 @@ struct exfat_dentry *exfat_get_dentry_cached(
* pointer of entry set on success,
* NULL on failure.
*/
-struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
- struct exfat_chain *p_dir, int entry, unsigned int type)
+int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
+ struct super_block *sb, struct exfat_chain *p_dir, int entry,
+ unsigned int type)
{
int ret, i, num_bh;
- unsigned int off, byte_offset, clu = 0;
+ unsigned int off;
sector_t sec;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct exfat_entry_set_cache *es;
struct exfat_dentry *ep;
int num_entries;
enum exfat_validate_dentry_mode mode = ES_MODE_STARTED;
@@ -827,52 +828,51 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
if (p_dir->dir == DIR_DELETED) {
exfat_err(sb, "access to deleted dentry");
- return NULL;
+ return -EIO;
}
- byte_offset = EXFAT_DEN_TO_B(entry);
- ret = exfat_walk_fat_chain(sb, p_dir, byte_offset, &clu);
+ ret = exfat_find_location(sb, p_dir, entry, &sec, &off);
if (ret)
- return NULL;
+ return ret;
- es = kzalloc(sizeof(*es), GFP_KERNEL);
- if (!es)
- return NULL;
+ memset(es, 0, sizeof(*es));
es->sb = sb;
es->modified = false;
-
- /* byte offset in cluster */
- byte_offset = EXFAT_CLU_OFFSET(byte_offset, sbi);
-
- /* byte offset in sector */
- off = EXFAT_BLK_OFFSET(byte_offset, sb);
es->start_off = off;
-
- /* sector offset in cluster */
- sec = EXFAT_B_TO_BLK(byte_offset, sb);
- sec += exfat_cluster_to_sector(sbi, clu);
+ es->bh = es->__bh;
bh = sb_bread(sb, sec);
if (!bh)
- goto free_es;
+ return -EIO;
es->bh[es->num_bh++] = bh;
- ep = exfat_get_dentry_cached(es, 0);
+ ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
- goto free_es;
+ goto put_es;
num_entries = type == ES_ALL_ENTRIES ?
ep->dentry.file.num_ext + 1 : type;
es->num_entries = num_entries;
num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb);
+ if (num_bh > ARRAY_SIZE(es->__bh)) {
+ es->bh = kmalloc_array(num_bh, sizeof(*es->bh), GFP_KERNEL);
+ if (!es->bh) {
+ brelse(bh);
+ return -ENOMEM;
+ }
+ es->bh[0] = bh;
+ }
+
for (i = 1; i < num_bh; i++) {
/* get the next sector */
if (exfat_is_last_sector_in_cluster(sbi, sec)) {
+ unsigned int clu = exfat_sector_to_cluster(sbi, sec);
+
if (p_dir->flags == ALLOC_NO_FAT_CHAIN)
clu++;
else if (exfat_get_next_cluster(sb, &clu))
- goto free_es;
+ goto put_es;
sec = exfat_cluster_to_sector(sbi, clu);
} else {
sec++;
@@ -880,21 +880,51 @@ struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
bh = sb_bread(sb, sec);
if (!bh)
- goto free_es;
+ goto put_es;
es->bh[es->num_bh++] = bh;
}
/* validate cached dentries */
- for (i = 1; i < num_entries; i++) {
+ for (i = ES_IDX_STREAM; i < num_entries; i++) {
ep = exfat_get_dentry_cached(es, i);
if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
- goto free_es;
+ goto put_es;
}
- return es;
+ return 0;
+
+put_es:
+ exfat_put_dentry_set(es, false);
+ return -EIO;
+}
-free_es:
- exfat_free_dentry_set(es, false);
- return NULL;
+static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp)
+{
+ hint_femp->eidx = EXFAT_HINT_NONE;
+ hint_femp->count = 0;
+}
+
+static inline void exfat_set_empty_hint(struct exfat_inode_info *ei,
+ struct exfat_hint_femp *candi_empty, struct exfat_chain *clu,
+ int dentry, int num_entries, int entry_type)
+{
+ if (ei->hint_femp.eidx == EXFAT_HINT_NONE ||
+ ei->hint_femp.eidx > dentry) {
+ int total_entries = EXFAT_B_TO_DEN(i_size_read(&ei->vfs_inode));
+
+ if (candi_empty->count == 0) {
+ candi_empty->cur = *clu;
+ candi_empty->eidx = dentry;
+ }
+
+ if (entry_type == TYPE_UNUSED)
+ candi_empty->count += total_entries - dentry;
+ else
+ candi_empty->count++;
+
+ if (candi_empty->count == num_entries ||
+ candi_empty->count + candi_empty->eidx == total_entries)
+ ei->hint_femp = *candi_empty;
+ }
}
enum {
@@ -917,17 +947,21 @@ enum {
*/
int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname,
- int num_entries, unsigned int type, struct exfat_hint *hint_opt)
+ struct exfat_hint *hint_opt)
{
int i, rewind = 0, dentry = 0, end_eidx = 0, num_ext = 0, len;
int order, step, name_len = 0;
- int dentries_per_clu, num_empty = 0;
+ int dentries_per_clu;
unsigned int entry_type;
unsigned short *uniname = NULL;
struct exfat_chain clu;
struct exfat_hint *hint_stat = &ei->hint_stat;
struct exfat_hint_femp candi_empty;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ int num_entries = exfat_calc_num_entries(p_uniname);
+
+ if (num_entries < 0)
+ return num_entries;
dentries_per_clu = sbi->dentries_per_clu;
@@ -939,10 +973,13 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
end_eidx = dentry;
}
- candi_empty.eidx = EXFAT_HINT_NONE;
+ exfat_reset_empty_hint(&ei->hint_femp);
+
rewind:
order = 0;
step = DIRENT_STEP_FILE;
+ exfat_reset_empty_hint(&candi_empty);
+
while (clu.dir != EXFAT_EOF_CLUSTER) {
i = dentry & (dentries_per_clu - 1);
for (; i < dentries_per_clu; i++, dentry++) {
@@ -962,26 +999,9 @@ rewind:
entry_type == TYPE_DELETED) {
step = DIRENT_STEP_FILE;
- num_empty++;
- if (candi_empty.eidx == EXFAT_HINT_NONE &&
- num_empty == 1) {
- exfat_chain_set(&candi_empty.cur,
- clu.dir, clu.size, clu.flags);
- }
-
- if (candi_empty.eidx == EXFAT_HINT_NONE &&
- num_empty >= num_entries) {
- candi_empty.eidx =
- dentry - (num_empty - 1);
- WARN_ON(candi_empty.eidx < 0);
- candi_empty.count = num_empty;
-
- if (ei->hint_femp.eidx ==
- EXFAT_HINT_NONE ||
- candi_empty.eidx <=
- ei->hint_femp.eidx)
- ei->hint_femp = candi_empty;
- }
+ exfat_set_empty_hint(ei, &candi_empty, &clu,
+ dentry, num_entries,
+ entry_type);
brelse(bh);
if (entry_type == TYPE_UNUSED)
@@ -989,17 +1009,14 @@ rewind:
continue;
}
- num_empty = 0;
- candi_empty.eidx = EXFAT_HINT_NONE;
+ exfat_reset_empty_hint(&candi_empty);
if (entry_type == TYPE_FILE || entry_type == TYPE_DIR) {
step = DIRENT_STEP_FILE;
hint_opt->clu = clu.dir;
hint_opt->eidx = i;
- if (type == TYPE_ALL || type == entry_type) {
- num_ext = ep->dentry.file.num_ext;
- step = DIRENT_STEP_STRM;
- }
+ num_ext = ep->dentry.file.num_ext;
+ step = DIRENT_STEP_STRM;
brelse(bh);
continue;
}
@@ -1090,12 +1107,19 @@ not_found:
rewind = 1;
dentry = 0;
clu.dir = p_dir->dir;
- /* reset empty hint */
- num_empty = 0;
- candi_empty.eidx = EXFAT_HINT_NONE;
goto rewind;
}
+ /*
+ * set the EXFAT_EOF_CLUSTER flag to avoid search
+ * from the beginning again when allocated a new cluster
+ */
+ if (ei->hint_femp.eidx == EXFAT_HINT_NONE) {
+ ei->hint_femp.cur.dir = EXFAT_EOF_CLUSTER;
+ ei->hint_femp.eidx = p_dir->size * dentries_per_clu;
+ ei->hint_femp.count = 0;
+ }
+
/* initialized hint_stat */
hint_stat->clu = p_dir->dir;
hint_stat->eidx = 0;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index a8f8eee4937c..bc6d21d7c5ad 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -9,6 +9,7 @@
#include <linux/fs.h>
#include <linux/ratelimit.h>
#include <linux/nls.h>
+#include <linux/blkdev.h>
#define EXFAT_ROOT_INO 1
@@ -41,6 +42,14 @@ enum {
#define ES_2_ENTRIES 2
#define ES_ALL_ENTRIES 0
+#define ES_IDX_FILE 0
+#define ES_IDX_STREAM 1
+#define ES_IDX_FIRST_FILENAME 2
+#define EXFAT_FILENAME_ENTRY_NUM(name_len) \
+ DIV_ROUND_UP(name_len, EXFAT_FILE_NAME_LEN)
+#define ES_IDX_LAST_FILENAME(name_len) \
+ (ES_IDX_FIRST_FILENAME + EXFAT_FILENAME_ENTRY_NUM(name_len) - 1)
+
#define DIR_DELETED 0xFFFF0321
/* type values */
@@ -62,15 +71,11 @@ enum {
#define TYPE_PADDING 0x0402
#define TYPE_ACLTAB 0x0403
#define TYPE_BENIGN_SEC 0x0800
-#define TYPE_ALL 0x0FFF
#define MAX_CHARSET_SIZE 6 /* max size of multi-byte character */
#define MAX_NAME_LENGTH 255 /* max len of file name excluding NULL */
#define MAX_VFSNAME_BUF_SIZE ((MAX_NAME_LENGTH + 1) * MAX_CHARSET_SIZE)
-/* Enough size to hold 256 dentry (even 512 Byte sector) */
-#define DIR_CACHE_SIZE (256*sizeof(struct exfat_dentry)/512+1)
-
#define EXFAT_HINT_NONE -1
#define EXFAT_MIN_SUBDIR 2
@@ -95,12 +100,18 @@ enum {
/*
* helpers for block size to dentry size conversion.
*/
-#define EXFAT_B_TO_DEN_IDX(b, sbi) \
- ((b) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS))
#define EXFAT_B_TO_DEN(b) ((b) >> DENTRY_SIZE_BITS)
#define EXFAT_DEN_TO_B(b) ((b) << DENTRY_SIZE_BITS)
/*
+ * helpers for cluster size to dentry size conversion.
+ */
+#define EXFAT_CLU_TO_DEN(clu, sbi) \
+ ((clu) << ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS))
+#define EXFAT_DEN_TO_CLU(dentry, sbi) \
+ ((dentry) >> ((sbi)->cluster_size_bits - DENTRY_SIZE_BITS))
+
+/*
* helpers for fat entry.
*/
#define FAT_ENT_SIZE (4)
@@ -125,6 +136,17 @@ enum {
#define BITS_PER_BYTE_MASK 0x7
#define IGNORED_BITS_REMAINED(clu, clu_base) ((1 << ((clu) - (clu_base))) - 1)
+#define ES_ENTRY_NUM(name_len) (ES_IDX_LAST_FILENAME(name_len) + 1)
+/* 19 entries = 1 file entry + 1 stream entry + 17 filename entries */
+#define ES_MAX_ENTRY_NUM ES_ENTRY_NUM(MAX_NAME_LENGTH)
+
+/*
+ * 19 entries x 32 bytes/entry = 608 bytes.
+ * The 608 bytes are in 3 sectors at most (even 512 Byte sector).
+ */
+#define DIR_CACHE_SIZE \
+ (DIV_ROUND_UP(EXFAT_DEN_TO_B(ES_MAX_ENTRY_NUM), SECTOR_SIZE) + 1)
+
struct exfat_dentry_namebuf {
char *lfn;
int lfnbuf_len; /* usually MAX_UNINAME_BUF_SIZE */
@@ -166,13 +188,16 @@ struct exfat_hint {
struct exfat_entry_set_cache {
struct super_block *sb;
- bool modified;
unsigned int start_off;
int num_bh;
- struct buffer_head *bh[DIR_CACHE_SIZE];
+ struct buffer_head *__bh[DIR_CACHE_SIZE];
+ struct buffer_head **bh;
unsigned int num_entries;
+ bool modified;
};
+#define IS_DYNAMIC_ES(es) ((es)->__bh != (es)->bh)
+
struct exfat_dir_entry {
struct exfat_chain dir;
int entry;
@@ -375,7 +400,7 @@ static inline sector_t exfat_cluster_to_sector(struct exfat_sb_info *sbi,
sbi->data_start_sector;
}
-static inline int exfat_sector_to_cluster(struct exfat_sb_info *sbi,
+static inline unsigned int exfat_sector_to_cluster(struct exfat_sb_info *sbi,
sector_t sec)
{
return ((sec - sbi->data_start_sector) >> sbi->sect_per_clus_bits) +
@@ -423,8 +448,8 @@ int exfat_trim_fs(struct inode *inode, struct fstrim_range *range);
/* file.c */
extern const struct file_operations exfat_file_operations;
-int __exfat_truncate(struct inode *inode, loff_t new_size);
-void exfat_truncate(struct inode *inode, loff_t size);
+int __exfat_truncate(struct inode *inode);
+void exfat_truncate(struct inode *inode);
int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr);
int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path,
@@ -464,15 +489,16 @@ void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es);
int exfat_calc_num_entries(struct exfat_uni_name *p_uniname);
int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname,
- int num_entries, unsigned int type, struct exfat_hint *hint_opt);
+ struct exfat_hint *hint_opt);
int exfat_alloc_new_dir(struct inode *inode, struct exfat_chain *clu);
struct exfat_dentry *exfat_get_dentry(struct super_block *sb,
struct exfat_chain *p_dir, int entry, struct buffer_head **bh);
struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es,
int num);
-struct exfat_entry_set_cache *exfat_get_dentry_set(struct super_block *sb,
- struct exfat_chain *p_dir, int entry, unsigned int type);
-int exfat_free_dentry_set(struct exfat_entry_set_cache *es, int sync);
+int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
+ struct super_block *sb, struct exfat_chain *p_dir, int entry,
+ unsigned int type);
+int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync);
int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir);
/* inode.c */
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 4e0793f35e8f..f5b29072775d 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -93,7 +93,7 @@ static int exfat_sanitize_mode(const struct exfat_sb_info *sbi,
}
/* resize the file length */
-int __exfat_truncate(struct inode *inode, loff_t new_size)
+int __exfat_truncate(struct inode *inode)
{
unsigned int num_clusters_new, num_clusters_phys;
unsigned int last_clu = EXFAT_FREE_CLUSTER;
@@ -113,7 +113,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
exfat_chain_set(&clu, ei->start_clu, num_clusters_phys, ei->flags);
- if (new_size > 0) {
+ if (i_size_read(inode) > 0) {
/*
* Truncate FAT chain num_clusters after the first cluster
* num_clusters = min(new, phys);
@@ -143,8 +143,6 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
ei->start_clu = EXFAT_EOF_CLUSTER;
}
- i_size_write(inode, new_size);
-
if (ei->type == TYPE_FILE)
ei->attr |= ATTR_ARCHIVE;
@@ -189,7 +187,7 @@ int __exfat_truncate(struct inode *inode, loff_t new_size)
return 0;
}
-void exfat_truncate(struct inode *inode, loff_t size)
+void exfat_truncate(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -207,7 +205,7 @@ void exfat_truncate(struct inode *inode, loff_t size)
goto write_size;
}
- err = __exfat_truncate(inode, i_size_read(inode));
+ err = __exfat_truncate(inode);
if (err)
goto write_size;
@@ -310,7 +308,7 @@ int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
* __exfat_write_inode() is called from exfat_truncate(), inode
* is already written by it, so mark_inode_dirty() is unneeded.
*/
- exfat_truncate(inode, attr->ia_size);
+ exfat_truncate(inode);
up_write(&EXFAT_I(inode)->truncate_lock);
} else
mark_inode_dirty(inode);
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index eac95bcd9a8a..5b644cb057fa 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -21,7 +21,7 @@ int __exfat_write_inode(struct inode *inode, int sync)
{
unsigned long long on_disk_size;
struct exfat_dentry *ep, *ep2;
- struct exfat_entry_set_cache *es = NULL;
+ struct exfat_entry_set_cache es;
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
@@ -42,11 +42,10 @@ int __exfat_write_inode(struct inode *inode, int sync)
exfat_set_volume_dirty(sb);
/* get the directory entry of given file or directory */
- es = exfat_get_dentry_set(sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES);
- if (!es)
+ if (exfat_get_dentry_set(&es, sb, &(ei->dir), ei->entry, ES_ALL_ENTRIES))
return -EIO;
- ep = exfat_get_dentry_cached(es, 0);
- ep2 = exfat_get_dentry_cached(es, 1);
+ ep = exfat_get_dentry_cached(&es, ES_IDX_FILE);
+ ep2 = exfat_get_dentry_cached(&es, ES_IDX_STREAM);
ep->dentry.file.attr = cpu_to_le16(exfat_make_attr(inode));
@@ -83,8 +82,8 @@ int __exfat_write_inode(struct inode *inode, int sync)
ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER;
}
- exfat_update_dir_chksum_with_entry_set(es);
- return exfat_free_dentry_set(es, sync);
+ exfat_update_dir_chksum_with_entry_set(&es);
+ return exfat_put_dentry_set(&es, sync);
}
int exfat_write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -358,7 +357,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
if (to > i_size_read(inode)) {
truncate_pagecache(inode, i_size_read(inode));
inode->i_mtime = inode->i_ctime = current_time(inode);
- exfat_truncate(inode, EXFAT_I(inode)->i_size_aligned);
+ exfat_truncate(inode);
}
}
@@ -622,7 +621,7 @@ void exfat_evict_inode(struct inode *inode)
if (!inode->i_nlink) {
i_size_write(inode, 0);
mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock);
- __exfat_truncate(inode, 0);
+ __exfat_truncate(inode);
mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock);
}
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index b617bebc3d0f..5f995eba5dbb 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -224,11 +224,18 @@ static int exfat_search_empty_slot(struct super_block *sb,
if (hint_femp->eidx != EXFAT_HINT_NONE) {
dentry = hint_femp->eidx;
- if (num_entries <= hint_femp->count) {
- hint_femp->eidx = EXFAT_HINT_NONE;
- return dentry;
- }
+ /*
+ * If hint_femp->count is enough, it is needed to check if
+ * there are actual empty entries.
+ * Otherwise, and if "dentry + hint_famp->count" is also equal
+ * to "p_dir->size * dentries_per_clu", it means ENOSPC.
+ */
+ if (dentry + hint_femp->count == p_dir->size * dentries_per_clu &&
+ num_entries > hint_femp->count)
+ return -ENOSPC;
+
+ hint_femp->eidx = EXFAT_HINT_NONE;
exfat_chain_dup(&clu, &hint_femp->cur);
} else {
exfat_chain_dup(&clu, p_dir);
@@ -293,6 +300,12 @@ static int exfat_search_empty_slot(struct super_block *sb,
}
}
+ hint_femp->eidx = p_dir->size * dentries_per_clu - num_empty;
+ hint_femp->count = num_empty;
+ if (num_empty == 0)
+ exfat_chain_set(&hint_femp->cur, EXFAT_EOF_CLUSTER, 0,
+ clu.flags);
+
return -ENOSPC;
}
@@ -369,15 +382,11 @@ static int exfat_find_empty_entry(struct inode *inode,
if (exfat_ent_set(sb, last_clu, clu.dir))
return -EIO;
- if (hint_femp.eidx == EXFAT_HINT_NONE) {
- /* the special case that new dentry
- * should be allocated from the start of new cluster
- */
- hint_femp.eidx = EXFAT_B_TO_DEN_IDX(p_dir->size, sbi);
- hint_femp.count = sbi->dentries_per_clu;
-
+ if (hint_femp.cur.dir == EXFAT_EOF_CLUSTER)
exfat_chain_set(&hint_femp.cur, clu.dir, 0, clu.flags);
- }
+
+ hint_femp.count += sbi->dentries_per_clu;
+
hint_femp.cur.size++;
p_dir->size++;
size = EXFAT_CLU_TO_B(p_dir->size, sbi);
@@ -588,14 +597,14 @@ unlock:
static int exfat_find(struct inode *dir, struct qstr *qname,
struct exfat_dir_entry *info)
{
- int ret, dentry, num_entries, count;
+ int ret, dentry, count;
struct exfat_chain cdir;
struct exfat_uni_name uni_name;
struct super_block *sb = dir->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(dir);
struct exfat_dentry *ep, *ep2;
- struct exfat_entry_set_cache *es;
+ struct exfat_entry_set_cache es;
/* for optimized dir & entry to prevent long traverse of cluster chain */
struct exfat_hint hint_opt;
@@ -607,10 +616,6 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
if (ret)
return ret;
- num_entries = exfat_calc_num_entries(&uni_name);
- if (num_entries < 0)
- return num_entries;
-
/* check the validation of hint_stat and initialize it if required */
if (ei->version != (inode_peek_iversion_raw(dir) & 0xffffffff)) {
ei->hint_stat.clu = cdir.dir;
@@ -620,9 +625,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
}
/* search the file name for directories */
- dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name,
- num_entries, TYPE_ALL, &hint_opt);
-
+ dentry = exfat_find_dir_entry(sb, ei, &cdir, &uni_name, &hint_opt);
if (dentry < 0)
return dentry; /* -error value */
@@ -635,11 +638,10 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
if (cdir.flags & ALLOC_NO_FAT_CHAIN)
cdir.size -= dentry / sbi->dentries_per_clu;
dentry = hint_opt.eidx;
- es = exfat_get_dentry_set(sb, &cdir, dentry, ES_2_ENTRIES);
- if (!es)
+ if (exfat_get_dentry_set(&es, sb, &cdir, dentry, ES_2_ENTRIES))
return -EIO;
- ep = exfat_get_dentry_cached(es, 0);
- ep2 = exfat_get_dentry_cached(es, 1);
+ ep = exfat_get_dentry_cached(&es, ES_IDX_FILE);
+ ep2 = exfat_get_dentry_cached(&es, ES_IDX_STREAM);
info->type = exfat_get_entry_type(ep);
info->attr = le16_to_cpu(ep->dentry.file.attr);
@@ -668,7 +670,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
ep->dentry.file.access_time,
ep->dentry.file.access_date,
0);
- exfat_free_dentry_set(es, false);
+ exfat_put_dentry_set(&es, false);
if (ei->start_clu == EXFAT_FREE_CLUSTER) {
exfat_fs_error(sb,
@@ -1167,7 +1169,7 @@ static int __exfat_rename(struct inode *old_parent_inode,
struct exfat_inode_info *new_ei = NULL;
unsigned int new_entry_type = TYPE_UNUSED;
int new_entry = 0;
- struct buffer_head *old_bh, *new_bh = NULL;
+ struct buffer_head *new_bh = NULL;
/* check the validity of pointer parameters */
if (new_path == NULL || strlen(new_path) == 0)
@@ -1183,13 +1185,6 @@ static int __exfat_rename(struct inode *old_parent_inode,
EXFAT_I(old_parent_inode)->flags);
dentry = ei->entry;
- ep = exfat_get_dentry(sb, &olddir, dentry, &old_bh);
- if (!ep) {
- ret = -EIO;
- goto out;
- }
- brelse(old_bh);
-
/* check whether new dir is existing directory and empty */
if (new_inode) {
ret = -EIO;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 16a343e8047d..260c1b3e3ef2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1225,7 +1225,7 @@ static void ext4_put_super(struct super_block *sb)
}
ext4_es_unregister_shrinker(sbi);
- del_timer_sync(&sbi->s_err_report);
+ timer_shutdown_sync(&sbi->s_err_report);
ext4_release_system_zone(sb);
ext4_mb_release(sb);
ext4_ext_release(sb);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 7decaaf27e82..a2f04a3808db 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -81,6 +81,8 @@ ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *,
struct mb_cache_entry **);
static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value,
size_t value_count);
+static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value,
+ size_t value_count);
static void ext4_xattr_rehash(struct ext4_xattr_header *);
static const struct xattr_handler * const ext4_xattr_handler_map[] = {
@@ -470,8 +472,22 @@ ext4_xattr_inode_verify_hashes(struct inode *ea_inode,
tmp_data = cpu_to_le32(hash);
e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len,
&tmp_data, 1);
+ /* All good? */
+ if (e_hash == entry->e_hash)
+ return 0;
+
+ /*
+ * Not good. Maybe the entry hash was calculated
+ * using the buggy signed char version?
+ */
+ e_hash = ext4_xattr_hash_entry_signed(entry->e_name, entry->e_name_len,
+ &tmp_data, 1);
+ /* Still no match - bad */
if (e_hash != entry->e_hash)
return -EFSCORRUPTED;
+
+ /* Let people know about old hash */
+ pr_warn_once("ext4: filesystem with signed xattr name hash");
}
return 0;
}
@@ -3081,7 +3097,29 @@ static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value,
while (name_len--) {
hash = (hash << NAME_HASH_SHIFT) ^
(hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
- *name++;
+ (unsigned char)*name++;
+ }
+ while (value_count--) {
+ hash = (hash << VALUE_HASH_SHIFT) ^
+ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
+ le32_to_cpu(*value++);
+ }
+ return cpu_to_le32(hash);
+}
+
+/*
+ * ext4_xattr_hash_entry_signed()
+ *
+ * Compute the hash of an extended attribute incorrectly.
+ */
+static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value, size_t value_count)
+{
+ __u32 hash = 0;
+
+ while (name_len--) {
+ hash = (hash << NAME_HASH_SHIFT) ^
+ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
+ (signed char)*name++;
}
while (value_count--) {
hash = (hash << VALUE_HASH_SHIFT) ^
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 0c82dae082aa..56f7d0d6a8b2 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -171,6 +171,11 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type)
{
+ if (time_to_inject(sbi, FAULT_BLKADDR)) {
+ f2fs_show_injection_info(sbi, FAULT_BLKADDR);
+ return false;
+ }
+
switch (type) {
case META_NAT:
break;
@@ -1897,8 +1902,10 @@ int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi)
cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi,
"f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(cprc->f2fs_issue_ckpt)) {
+ int err = PTR_ERR(cprc->f2fs_issue_ckpt);
+
cprc->f2fs_issue_ckpt = NULL;
- return -ENOMEM;
+ return err;
}
set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio);
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 2b7a5cc4ed66..2532f369cb10 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -346,7 +346,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
if (!level)
level = F2FS_ZSTD_DEFAULT_CLEVEL;
- params = zstd_get_params(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen);
+ params = zstd_get_params(level, cc->rlen);
workspace_size = zstd_cstream_workspace_bound(&params.cParams);
workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
@@ -567,10 +567,7 @@ MODULE_PARM_DESC(num_compress_pages,
int f2fs_init_compress_mempool(void)
{
compress_page_pool = mempool_create_page_pool(num_compress_pages, 0);
- if (!compress_page_pool)
- return -ENOMEM;
-
- return 0;
+ return compress_page_pool ? 0 : -ENOMEM;
}
void f2fs_destroy_compress_mempool(void)
@@ -1981,9 +1978,7 @@ int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi)
sbi->page_array_slab = f2fs_kmem_cache_create(slab_name,
sbi->page_array_slab_size);
- if (!sbi->page_array_slab)
- return -ENOMEM;
- return 0;
+ return sbi->page_array_slab ? 0 : -ENOMEM;
}
void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi)
@@ -1991,53 +1986,24 @@ void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi)
kmem_cache_destroy(sbi->page_array_slab);
}
-static int __init f2fs_init_cic_cache(void)
+int __init f2fs_init_compress_cache(void)
{
cic_entry_slab = f2fs_kmem_cache_create("f2fs_cic_entry",
sizeof(struct compress_io_ctx));
if (!cic_entry_slab)
return -ENOMEM;
- return 0;
-}
-
-static void f2fs_destroy_cic_cache(void)
-{
- kmem_cache_destroy(cic_entry_slab);
-}
-
-static int __init f2fs_init_dic_cache(void)
-{
dic_entry_slab = f2fs_kmem_cache_create("f2fs_dic_entry",
sizeof(struct decompress_io_ctx));
if (!dic_entry_slab)
- return -ENOMEM;
- return 0;
-}
-
-static void f2fs_destroy_dic_cache(void)
-{
- kmem_cache_destroy(dic_entry_slab);
-}
-
-int __init f2fs_init_compress_cache(void)
-{
- int err;
-
- err = f2fs_init_cic_cache();
- if (err)
- goto out;
- err = f2fs_init_dic_cache();
- if (err)
goto free_cic;
return 0;
free_cic:
- f2fs_destroy_cic_cache();
-out:
+ kmem_cache_destroy(cic_entry_slab);
return -ENOMEM;
}
void f2fs_destroy_compress_cache(void)
{
- f2fs_destroy_dic_cache();
- f2fs_destroy_cic_cache();
+ kmem_cache_destroy(dic_entry_slab);
+ kmem_cache_destroy(cic_entry_slab);
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7af75041bd81..97e816590cd9 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -39,10 +39,8 @@ static struct bio_set f2fs_bioset;
int __init f2fs_init_bioset(void)
{
- if (bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE,
- 0, BIOSET_NEED_BVECS))
- return -ENOMEM;
- return 0;
+ return bioset_init(&f2fs_bioset, F2FS_BIO_POOL_SIZE,
+ 0, BIOSET_NEED_BVECS);
}
void f2fs_destroy_bioset(void)
@@ -1145,7 +1143,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
dn->data_blkaddr = blkaddr;
f2fs_set_data_blkaddr(dn);
- f2fs_update_extent_cache(dn);
+ f2fs_update_read_extent_cache(dn);
}
/* dn->ofs_in_node will be returned with up-to-date last block pointer */
@@ -1214,7 +1212,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
struct extent_info ei = {0, };
struct inode *inode = dn->inode;
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn->data_blkaddr = ei.blk + index - ei.fofs;
return 0;
}
@@ -1223,7 +1221,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
}
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
- blk_opf_t op_flags, bool for_write)
+ blk_opf_t op_flags, bool for_write,
+ pgoff_t *next_pgofs)
{
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
@@ -1235,7 +1234,7 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
if (!page)
return ERR_PTR(-ENOMEM);
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ)) {
@@ -1249,12 +1248,17 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
- if (err)
+ if (err) {
+ if (err == -ENOENT && next_pgofs)
+ *next_pgofs = f2fs_get_next_page_offset(&dn, index);
goto put_err;
+ }
f2fs_put_dnode(&dn);
if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
err = -ENOENT;
+ if (next_pgofs)
+ *next_pgofs = index + 1;
goto put_err;
}
if (dn.data_blkaddr != NEW_ADDR &&
@@ -1298,7 +1302,8 @@ put_err:
return ERR_PTR(err);
}
-struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index)
+struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index,
+ pgoff_t *next_pgofs)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
@@ -1308,7 +1313,7 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index)
return page;
f2fs_put_page(page, 0);
- page = f2fs_get_read_data_page(inode, index, 0, false);
+ page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs);
if (IS_ERR(page))
return page;
@@ -1334,7 +1339,7 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct page *page;
repeat:
- page = f2fs_get_read_data_page(inode, index, 0, for_write);
+ page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL);
if (IS_ERR(page))
return page;
@@ -1497,7 +1502,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
pgofs = (pgoff_t)map->m_lblk;
end = pgofs + maxblocks;
- if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
+ if (!create && f2fs_lookup_read_extent_cache(inode, pgofs, &ei)) {
if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO &&
map->m_may_create)
goto next_dnode;
@@ -1707,7 +1712,7 @@ skip:
if (map->m_flags & F2FS_MAP_MAPPED) {
unsigned int ofs = start_pgofs - map->m_lblk;
- f2fs_update_extent_cache_range(&dn,
+ f2fs_update_read_extent_cache_range(&dn,
start_pgofs, map->m_pblk + ofs,
map->m_len - ofs);
}
@@ -1752,7 +1757,7 @@ sync_out:
if (map->m_flags & F2FS_MAP_MAPPED) {
unsigned int ofs = start_pgofs - map->m_lblk;
- f2fs_update_extent_cache_range(&dn,
+ f2fs_update_read_extent_cache_range(&dn,
start_pgofs, map->m_pblk + ofs,
map->m_len - ofs);
}
@@ -2178,7 +2183,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
sector_t last_block_in_file;
const unsigned blocksize = blks_to_bytes(inode, 1);
struct decompress_io_ctx *dic = NULL;
- struct extent_info ei = {0, };
+ struct extent_info ei = {};
bool from_dnode = true;
int i;
int ret = 0;
@@ -2212,7 +2217,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
if (f2fs_cluster_is_empty(cc))
goto out;
- if (f2fs_lookup_extent_cache(inode, start_idx, &ei))
+ if (f2fs_lookup_read_extent_cache(inode, start_idx, &ei))
from_dnode = false;
if (!from_dnode)
@@ -2643,7 +2648,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
set_new_dnode(&dn, inode, NULL, NULL, 0);
if (need_inplace_update(fio) &&
- f2fs_lookup_extent_cache(inode, page->index, &ei)) {
+ f2fs_lookup_read_extent_cache(inode, page->index, &ei)) {
fio->old_blkaddr = ei.blk + page->index - ei.fofs;
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
@@ -3367,7 +3372,7 @@ restart:
} else if (locked) {
err = f2fs_get_block(&dn, index);
} else {
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
} else {
/* hole case */
@@ -3408,7 +3413,7 @@ static int __find_data_block(struct inode *inode, pgoff_t index,
set_new_dnode(&dn, inode, ipage, ipage, 0);
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
} else {
/* hole case */
@@ -3472,6 +3477,9 @@ static int prepare_atomic_write_begin(struct f2fs_sb_info *sbi,
else if (*blk_addr != NULL_ADDR)
return 0;
+ if (is_inode_flag_set(inode, FI_ATOMIC_REPLACE))
+ goto reserve_block;
+
/* Look for the block in the original inode */
err = __find_data_block(inode, index, &ori_blk_addr);
if (err)
@@ -4093,9 +4101,7 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi)
sbi->post_read_wq = alloc_workqueue("f2fs_post_read_wq",
WQ_UNBOUND | WQ_HIGHPRI,
num_online_cpus());
- if (!sbi->post_read_wq)
- return -ENOMEM;
- return 0;
+ return sbi->post_read_wq ? 0 : -ENOMEM;
}
void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi)
@@ -4108,9 +4114,7 @@ int __init f2fs_init_bio_entry_cache(void)
{
bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
sizeof(struct bio_entry));
- if (!bio_entry_slab)
- return -ENOMEM;
- return 0;
+ return bio_entry_slab ? 0 : -ENOMEM;
}
void f2fs_destroy_bio_entry_cache(void)
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index a216dcdf6941..32af4f0c5735 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -72,15 +72,26 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->main_area_zones = si->main_area_sections /
le32_to_cpu(raw_super->secs_per_zone);
- /* validation check of the segment numbers */
+ /* general extent cache stats */
+ for (i = 0; i < NR_EXTENT_CACHES; i++) {
+ struct extent_tree_info *eti = &sbi->extent_tree[i];
+
+ si->hit_cached[i] = atomic64_read(&sbi->read_hit_cached[i]);
+ si->hit_rbtree[i] = atomic64_read(&sbi->read_hit_rbtree[i]);
+ si->total_ext[i] = atomic64_read(&sbi->total_hit_ext[i]);
+ si->hit_total[i] = si->hit_cached[i] + si->hit_rbtree[i];
+ si->ext_tree[i] = atomic_read(&eti->total_ext_tree);
+ si->zombie_tree[i] = atomic_read(&eti->total_zombie_tree);
+ si->ext_node[i] = atomic_read(&eti->total_ext_node);
+ }
+ /* read extent_cache only */
si->hit_largest = atomic64_read(&sbi->read_hit_largest);
- si->hit_cached = atomic64_read(&sbi->read_hit_cached);
- si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree);
- si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree;
- si->total_ext = atomic64_read(&sbi->total_hit_ext);
- si->ext_tree = atomic_read(&sbi->total_ext_tree);
- si->zombie_tree = atomic_read(&sbi->total_zombie_tree);
- si->ext_node = atomic_read(&sbi->total_ext_node);
+ si->hit_total[EX_READ] += si->hit_largest;
+
+ /* block age extent_cache only */
+ si->allocated_data_blocks = atomic64_read(&sbi->allocated_data_blocks);
+
+ /* validation check of the segment numbers */
si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
@@ -294,25 +305,32 @@ get_cache:
sizeof(struct nat_entry_set);
for (i = 0; i < MAX_INO_ENTRY; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
- si->cache_mem += atomic_read(&sbi->total_ext_tree) *
+
+ for (i = 0; i < NR_EXTENT_CACHES; i++) {
+ struct extent_tree_info *eti = &sbi->extent_tree[i];
+
+ si->ext_mem[i] = atomic_read(&eti->total_ext_tree) *
sizeof(struct extent_tree);
- si->cache_mem += atomic_read(&sbi->total_ext_node) *
+ si->ext_mem[i] += atomic_read(&eti->total_ext_node) *
sizeof(struct extent_node);
+ si->cache_mem += si->ext_mem[i];
+ }
si->page_mem = 0;
if (sbi->node_inode) {
- unsigned npages = NODE_MAPPING(sbi)->nrpages;
+ unsigned long npages = NODE_MAPPING(sbi)->nrpages;
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
if (sbi->meta_inode) {
- unsigned npages = META_MAPPING(sbi)->nrpages;
+ unsigned long npages = META_MAPPING(sbi)->nrpages;
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (sbi->compress_inode) {
- unsigned npages = COMPRESS_MAPPING(sbi)->nrpages;
+ unsigned long npages = COMPRESS_MAPPING(sbi)->nrpages;
+
si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
}
#endif
@@ -460,28 +478,28 @@ static int stat_show(struct seq_file *s, void *v)
si->meta_count[META_NAT]);
seq_printf(s, " - ssa blocks : %u\n",
si->meta_count[META_SSA]);
- seq_printf(s, "CP merge (Queued: %4d, Issued: %4d, Total: %4d, "
- "Cur time: %4d(ms), Peak time: %4d(ms))\n",
- si->nr_queued_ckpt, si->nr_issued_ckpt,
- si->nr_total_ckpt, si->cur_ckpt_time,
- si->peak_ckpt_time);
+ seq_puts(s, "CP merge:\n");
+ seq_printf(s, " - Queued : %4d\n", si->nr_queued_ckpt);
+ seq_printf(s, " - Issued : %4d\n", si->nr_issued_ckpt);
+ seq_printf(s, " - Total : %4d\n", si->nr_total_ckpt);
+ seq_printf(s, " - Cur time : %4d(ms)\n", si->cur_ckpt_time);
+ seq_printf(s, " - Peak time : %4d(ms)\n", si->peak_ckpt_time);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
seq_printf(s, " - data segments : %d (%d)\n",
si->data_segs, si->bg_data_segs);
seq_printf(s, " - node segments : %d (%d)\n",
si->node_segs, si->bg_node_segs);
- seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), "
- "Idle Greedy (%d), Idle AT (%d), "
- "Urgent High (%d), Urgent Mid (%d), "
- "Urgent Low (%d)\n",
- si->sbi->gc_reclaimed_segs[GC_NORMAL],
- si->sbi->gc_reclaimed_segs[GC_IDLE_CB],
- si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY],
- si->sbi->gc_reclaimed_segs[GC_IDLE_AT],
- si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH],
- si->sbi->gc_reclaimed_segs[GC_URGENT_MID],
- si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
+ seq_puts(s, " - Reclaimed segs :\n");
+ seq_printf(s, " - Normal : %d\n", si->sbi->gc_reclaimed_segs[GC_NORMAL]);
+ seq_printf(s, " - Idle CB : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_CB]);
+ seq_printf(s, " - Idle Greedy : %d\n",
+ si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY]);
+ seq_printf(s, " - Idle AT : %d\n", si->sbi->gc_reclaimed_segs[GC_IDLE_AT]);
+ seq_printf(s, " - Urgent High : %d\n",
+ si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH]);
+ seq_printf(s, " - Urgent Mid : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_MID]);
+ seq_printf(s, " - Urgent Low : %d\n", si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]);
seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
si->bg_data_blks + si->bg_node_blks);
seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks,
@@ -490,26 +508,44 @@ static int stat_show(struct seq_file *s, void *v)
si->bg_node_blks);
seq_printf(s, "BG skip : IO: %u, Other: %u\n",
si->io_skip_bggc, si->other_skip_bggc);
- seq_puts(s, "\nExtent Cache:\n");
+ seq_puts(s, "\nExtent Cache (Read):\n");
seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n",
- si->hit_largest, si->hit_cached,
- si->hit_rbtree);
+ si->hit_largest, si->hit_cached[EX_READ],
+ si->hit_rbtree[EX_READ]);
+ seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n",
+ !si->total_ext[EX_READ] ? 0 :
+ div64_u64(si->hit_total[EX_READ] * 100,
+ si->total_ext[EX_READ]),
+ si->hit_total[EX_READ], si->total_ext[EX_READ]);
+ seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
+ si->ext_tree[EX_READ], si->zombie_tree[EX_READ],
+ si->ext_node[EX_READ]);
+ seq_puts(s, "\nExtent Cache (Block Age):\n");
+ seq_printf(s, " - Allocated Data Blocks: %llu\n",
+ si->allocated_data_blocks);
+ seq_printf(s, " - Hit Count: L1:%llu L2:%llu\n",
+ si->hit_cached[EX_BLOCK_AGE],
+ si->hit_rbtree[EX_BLOCK_AGE]);
seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n",
- !si->total_ext ? 0 :
- div64_u64(si->hit_total * 100, si->total_ext),
- si->hit_total, si->total_ext);
+ !si->total_ext[EX_BLOCK_AGE] ? 0 :
+ div64_u64(si->hit_total[EX_BLOCK_AGE] * 100,
+ si->total_ext[EX_BLOCK_AGE]),
+ si->hit_total[EX_BLOCK_AGE],
+ si->total_ext[EX_BLOCK_AGE]);
seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n",
- si->ext_tree, si->zombie_tree, si->ext_node);
+ si->ext_tree[EX_BLOCK_AGE],
+ si->zombie_tree[EX_BLOCK_AGE],
+ si->ext_node[EX_BLOCK_AGE]);
seq_puts(s, "\nBalancing F2FS Async:\n");
seq_printf(s, " - DIO (R: %4d, W: %4d)\n",
si->nr_dio_read, si->nr_dio_write);
seq_printf(s, " - IO_R (Data: %4d, Node: %4d, Meta: %4d\n",
si->nr_rd_data, si->nr_rd_node, si->nr_rd_meta);
- seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), "
- "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
+ seq_printf(s, " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), ",
si->nr_wb_cp_data, si->nr_wb_data,
si->nr_flushing, si->nr_flushed,
- si->flush_list_empty,
+ si->flush_list_empty);
+ seq_printf(s, "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
si->nr_discarding, si->nr_discarded,
si->nr_discard_cmd, si->undiscard_blks);
seq_printf(s, " - atomic IO: %4d (Max. %4d)\n",
@@ -566,8 +602,12 @@ static int stat_show(struct seq_file *s, void *v)
(si->base_mem + si->cache_mem + si->page_mem) >> 10);
seq_printf(s, " - static: %llu KB\n",
si->base_mem >> 10);
- seq_printf(s, " - cached: %llu KB\n",
+ seq_printf(s, " - cached all: %llu KB\n",
si->cache_mem >> 10);
+ seq_printf(s, " - read extent cache: %llu KB\n",
+ si->ext_mem[EX_READ] >> 10);
+ seq_printf(s, " - block age extent cache: %llu KB\n",
+ si->ext_mem[EX_BLOCK_AGE] >> 10);
seq_printf(s, " - paged : %llu KB\n",
si->page_mem >> 10);
}
@@ -600,10 +640,15 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
si->sbi = sbi;
sbi->stat_info = si;
- atomic64_set(&sbi->total_hit_ext, 0);
- atomic64_set(&sbi->read_hit_rbtree, 0);
+ /* general extent cache stats */
+ for (i = 0; i < NR_EXTENT_CACHES; i++) {
+ atomic64_set(&sbi->total_hit_ext[i], 0);
+ atomic64_set(&sbi->read_hit_rbtree[i], 0);
+ atomic64_set(&sbi->read_hit_cached[i], 0);
+ }
+
+ /* read extent_cache only */
atomic64_set(&sbi->read_hit_largest, 0);
- atomic64_set(&sbi->read_hit_cached, 0);
atomic_set(&sbi->inline_xattr, 0);
atomic_set(&sbi->inline_inode, 0);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 21960a899b6a..8e025157f35c 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -340,6 +340,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
unsigned int bidx, end_block;
struct page *dentry_page;
struct f2fs_dir_entry *de = NULL;
+ pgoff_t next_pgofs;
bool room = false;
int max_slots;
@@ -350,12 +351,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
le32_to_cpu(fname->hash) % nbucket);
end_block = bidx + nblock;
- for (; bidx < end_block; bidx++) {
+ while (bidx < end_block) {
/* no need to allocate new dentry pages to all the indices */
- dentry_page = f2fs_find_data_page(dir, bidx);
+ dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs);
if (IS_ERR(dentry_page)) {
if (PTR_ERR(dentry_page) == -ENOENT) {
room = true;
+ bidx = next_pgofs;
continue;
} else {
*res_page = dentry_page;
@@ -376,6 +378,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
if (max_slots >= s)
room = true;
f2fs_put_page(dentry_page, 0);
+
+ bidx++;
}
if (!de && room && F2FS_I(dir)->chash != fname->hash) {
@@ -956,7 +960,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
bool f2fs_empty_dir(struct inode *dir)
{
- unsigned long bidx;
+ unsigned long bidx = 0;
struct page *dentry_page;
unsigned int bit_pos;
struct f2fs_dentry_block *dentry_blk;
@@ -965,13 +969,17 @@ bool f2fs_empty_dir(struct inode *dir)
if (f2fs_has_inline_dentry(dir))
return f2fs_empty_inline_dir(dir);
- for (bidx = 0; bidx < nblock; bidx++) {
- dentry_page = f2fs_get_lock_data_page(dir, bidx, false);
+ while (bidx < nblock) {
+ pgoff_t next_pgofs;
+
+ dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs);
if (IS_ERR(dentry_page)) {
- if (PTR_ERR(dentry_page) == -ENOENT)
+ if (PTR_ERR(dentry_page) == -ENOENT) {
+ bidx = next_pgofs;
continue;
- else
+ } else {
return false;
+ }
}
dentry_blk = page_address(dentry_page);
@@ -983,10 +991,12 @@ bool f2fs_empty_dir(struct inode *dir)
NR_DENTRY_IN_BLOCK,
bit_pos);
- f2fs_put_page(dentry_page, 1);
+ f2fs_put_page(dentry_page, 0);
if (bit_pos < NR_DENTRY_IN_BLOCK)
return false;
+
+ bidx++;
}
return true;
}
@@ -1000,7 +1010,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode);
struct blk_plug plug;
- bool readdir_ra = sbi->readdir_ra == 1;
+ bool readdir_ra = sbi->readdir_ra;
bool found_valid_dirent = false;
int err = 0;
@@ -1104,7 +1114,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
goto out_free;
}
- for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) {
+ for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) {
+ pgoff_t next_pgofs;
/* allow readdir() to be interrupted */
if (fatal_signal_pending(current)) {
@@ -1118,11 +1129,12 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
page_cache_sync_readahead(inode->i_mapping, ra, file, n,
min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
- dentry_page = f2fs_find_data_page(inode, n);
+ dentry_page = f2fs_find_data_page(inode, n, &next_pgofs);
if (IS_ERR(dentry_page)) {
err = PTR_ERR(dentry_page);
if (err == -ENOENT) {
err = 0;
+ n = next_pgofs;
continue;
} else {
goto out_free;
@@ -1141,6 +1153,8 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
}
f2fs_put_page(dentry_page, 0);
+
+ n++;
}
out_free:
fscrypt_fname_free_buffer(&fstr);
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 932c070173b9..342af24b2f8c 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -6,6 +6,10 @@
* Copyright (c) 2015 Samsung Electronics
* Authors: Jaegeuk Kim <jaegeuk@kernel.org>
* Chao Yu <chao2.yu@samsung.com>
+ *
+ * block_age-based extent cache added by:
+ * Copyright (c) 2022 xiaomi Co., Ltd.
+ * http://www.xiaomi.com/
*/
#include <linux/fs.h>
@@ -15,6 +19,123 @@
#include "node.h"
#include <trace/events/f2fs.h>
+static void __set_extent_info(struct extent_info *ei,
+ unsigned int fofs, unsigned int len,
+ block_t blk, bool keep_clen,
+ unsigned long age, unsigned long last_blocks,
+ enum extent_type type)
+{
+ ei->fofs = fofs;
+ ei->len = len;
+
+ if (type == EX_READ) {
+ ei->blk = blk;
+ if (keep_clen)
+ return;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ ei->c_len = 0;
+#endif
+ } else if (type == EX_BLOCK_AGE) {
+ ei->age = age;
+ ei->last_blocks = last_blocks;
+ }
+}
+
+static bool __may_read_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!test_opt(sbi, READ_EXTENT_CACHE))
+ return false;
+ if (is_inode_flag_set(inode, FI_NO_EXTENT))
+ return false;
+ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE) &&
+ !f2fs_sb_has_readonly(sbi))
+ return false;
+ return S_ISREG(inode->i_mode);
+}
+
+static bool __may_age_extent_tree(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (!test_opt(sbi, AGE_EXTENT_CACHE))
+ return false;
+ /* don't cache block age info for cold file */
+ if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
+ return false;
+ if (file_is_cold(inode))
+ return false;
+
+ return S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode);
+}
+
+static bool __init_may_extent_tree(struct inode *inode, enum extent_type type)
+{
+ if (type == EX_READ)
+ return __may_read_extent_tree(inode);
+ else if (type == EX_BLOCK_AGE)
+ return __may_age_extent_tree(inode);
+ return false;
+}
+
+static bool __may_extent_tree(struct inode *inode, enum extent_type type)
+{
+ /*
+ * for recovered files during mount do not create extents
+ * if shrinker is not registered.
+ */
+ if (list_empty(&F2FS_I_SB(inode)->s_list))
+ return false;
+
+ return __init_may_extent_tree(inode, type);
+}
+
+static void __try_update_largest_extent(struct extent_tree *et,
+ struct extent_node *en)
+{
+ if (et->type != EX_READ)
+ return;
+ if (en->ei.len <= et->largest.len)
+ return;
+
+ et->largest = en->ei;
+ et->largest_updated = true;
+}
+
+static bool __is_extent_mergeable(struct extent_info *back,
+ struct extent_info *front, enum extent_type type)
+{
+ if (type == EX_READ) {
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (back->c_len && back->len != back->c_len)
+ return false;
+ if (front->c_len && front->len != front->c_len)
+ return false;
+#endif
+ return (back->fofs + back->len == front->fofs &&
+ back->blk + back->len == front->blk);
+ } else if (type == EX_BLOCK_AGE) {
+ return (back->fofs + back->len == front->fofs &&
+ abs(back->age - front->age) <= SAME_AGE_REGION &&
+ abs(back->last_blocks - front->last_blocks) <=
+ SAME_AGE_REGION);
+ }
+ return false;
+}
+
+static bool __is_back_mergeable(struct extent_info *cur,
+ struct extent_info *back, enum extent_type type)
+{
+ return __is_extent_mergeable(back, cur, type);
+}
+
+static bool __is_front_mergeable(struct extent_info *cur,
+ struct extent_info *front, enum extent_type type)
+{
+ return __is_extent_mergeable(cur, front, type);
+}
+
static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re,
unsigned int ofs)
{
@@ -237,6 +358,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
struct rb_node *parent, struct rb_node **p,
bool leftmost)
{
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
struct extent_node *en;
en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi);
@@ -250,16 +372,18 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
rb_link_node(&en->rb_node, parent, p);
rb_insert_color_cached(&en->rb_node, &et->root, leftmost);
atomic_inc(&et->node_cnt);
- atomic_inc(&sbi->total_ext_node);
+ atomic_inc(&eti->total_ext_node);
return en;
}
static void __detach_extent_node(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_node *en)
{
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
+
rb_erase_cached(&en->rb_node, &et->root);
atomic_dec(&et->node_cnt);
- atomic_dec(&sbi->total_ext_node);
+ atomic_dec(&eti->total_ext_node);
if (et->cached_en == en)
et->cached_en = NULL;
@@ -275,61 +399,51 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi,
static void __release_extent_node(struct f2fs_sb_info *sbi,
struct extent_tree *et, struct extent_node *en)
{
- spin_lock(&sbi->extent_lock);
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
+
+ spin_lock(&eti->extent_lock);
f2fs_bug_on(sbi, list_empty(&en->list));
list_del_init(&en->list);
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
__detach_extent_node(sbi, et, en);
}
-static struct extent_tree *__grab_extent_tree(struct inode *inode)
+static struct extent_tree *__grab_extent_tree(struct inode *inode,
+ enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
struct extent_tree *et;
nid_t ino = inode->i_ino;
- mutex_lock(&sbi->extent_tree_lock);
- et = radix_tree_lookup(&sbi->extent_tree_root, ino);
+ mutex_lock(&eti->extent_tree_lock);
+ et = radix_tree_lookup(&eti->extent_tree_root, ino);
if (!et) {
et = f2fs_kmem_cache_alloc(extent_tree_slab,
GFP_NOFS, true, NULL);
- f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
+ f2fs_radix_tree_insert(&eti->extent_tree_root, ino, et);
memset(et, 0, sizeof(struct extent_tree));
et->ino = ino;
+ et->type = type;
et->root = RB_ROOT_CACHED;
et->cached_en = NULL;
rwlock_init(&et->lock);
INIT_LIST_HEAD(&et->list);
atomic_set(&et->node_cnt, 0);
- atomic_inc(&sbi->total_ext_tree);
+ atomic_inc(&eti->total_ext_tree);
} else {
- atomic_dec(&sbi->total_zombie_tree);
+ atomic_dec(&eti->total_zombie_tree);
list_del_init(&et->list);
}
- mutex_unlock(&sbi->extent_tree_lock);
+ mutex_unlock(&eti->extent_tree_lock);
/* never died until evict_inode */
- F2FS_I(inode)->extent_tree = et;
+ F2FS_I(inode)->extent_tree[type] = et;
return et;
}
-static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi,
- struct extent_tree *et, struct extent_info *ei)
-{
- struct rb_node **p = &et->root.rb_root.rb_node;
- struct extent_node *en;
-
- en = __attach_extent_node(sbi, et, ei, NULL, p, true);
- if (!en)
- return NULL;
-
- et->largest = en->ei;
- et->cached_en = en;
- return en;
-}
-
static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
struct extent_tree *et)
{
@@ -358,70 +472,89 @@ static void __drop_largest_extent(struct extent_tree *et,
}
}
-/* return true, if inode page is changed */
-static void __f2fs_init_extent_tree(struct inode *inode, struct page *ipage)
+void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_extent *i_ext = ipage ? &F2FS_INODE(ipage)->i_ext : NULL;
+ struct extent_tree_info *eti = &sbi->extent_tree[EX_READ];
+ struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext;
struct extent_tree *et;
struct extent_node *en;
struct extent_info ei;
- if (!f2fs_may_extent_tree(inode)) {
- /* drop largest extent */
+ if (!__may_extent_tree(inode, EX_READ)) {
+ /* drop largest read extent */
if (i_ext && i_ext->len) {
f2fs_wait_on_page_writeback(ipage, NODE, true, true);
i_ext->len = 0;
set_page_dirty(ipage);
- return;
}
- return;
+ goto out;
}
- et = __grab_extent_tree(inode);
+ et = __grab_extent_tree(inode, EX_READ);
if (!i_ext || !i_ext->len)
- return;
+ goto out;
- get_extent_info(&ei, i_ext);
+ get_read_extent_info(&ei, i_ext);
write_lock(&et->lock);
if (atomic_read(&et->node_cnt))
- goto out;
+ goto unlock_out;
- en = __init_extent_tree(sbi, et, &ei);
+ en = __attach_extent_node(sbi, et, &ei, NULL,
+ &et->root.rb_root.rb_node, true);
if (en) {
- spin_lock(&sbi->extent_lock);
- list_add_tail(&en->list, &sbi->extent_list);
- spin_unlock(&sbi->extent_lock);
+ et->largest = en->ei;
+ et->cached_en = en;
+
+ spin_lock(&eti->extent_lock);
+ list_add_tail(&en->list, &eti->extent_list);
+ spin_unlock(&eti->extent_lock);
}
-out:
+unlock_out:
write_unlock(&et->lock);
+out:
+ if (!F2FS_I(inode)->extent_tree[EX_READ])
+ set_inode_flag(inode, FI_NO_EXTENT);
}
-void f2fs_init_extent_tree(struct inode *inode, struct page *ipage)
+void f2fs_init_age_extent_tree(struct inode *inode)
{
- __f2fs_init_extent_tree(inode, ipage);
+ if (!__init_may_extent_tree(inode, EX_BLOCK_AGE))
+ return;
+ __grab_extent_tree(inode, EX_BLOCK_AGE);
+}
- if (!F2FS_I(inode)->extent_tree)
- set_inode_flag(inode, FI_NO_EXTENT);
+void f2fs_init_extent_tree(struct inode *inode)
+{
+ /* initialize read cache */
+ if (__init_may_extent_tree(inode, EX_READ))
+ __grab_extent_tree(inode, EX_READ);
+
+ /* initialize block age cache */
+ if (__init_may_extent_tree(inode, EX_BLOCK_AGE))
+ __grab_extent_tree(inode, EX_BLOCK_AGE);
}
-static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
- struct extent_info *ei)
+static bool __lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei, enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
struct extent_node *en;
bool ret = false;
- f2fs_bug_on(sbi, !et);
+ if (!et)
+ return false;
- trace_f2fs_lookup_extent_tree_start(inode, pgofs);
+ trace_f2fs_lookup_extent_tree_start(inode, pgofs, type);
read_lock(&et->lock);
- if (et->largest.fofs <= pgofs &&
+ if (type == EX_READ &&
+ et->largest.fofs <= pgofs &&
et->largest.fofs + et->largest.len > pgofs) {
*ei = et->largest;
ret = true;
@@ -435,23 +568,26 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
goto out;
if (en == et->cached_en)
- stat_inc_cached_node_hit(sbi);
+ stat_inc_cached_node_hit(sbi, type);
else
- stat_inc_rbtree_node_hit(sbi);
+ stat_inc_rbtree_node_hit(sbi, type);
*ei = en->ei;
- spin_lock(&sbi->extent_lock);
+ spin_lock(&eti->extent_lock);
if (!list_empty(&en->list)) {
- list_move_tail(&en->list, &sbi->extent_list);
+ list_move_tail(&en->list, &eti->extent_list);
et->cached_en = en;
}
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
ret = true;
out:
- stat_inc_total_hit(sbi);
+ stat_inc_total_hit(sbi, type);
read_unlock(&et->lock);
- trace_f2fs_lookup_extent_tree_end(inode, pgofs, ei);
+ if (type == EX_READ)
+ trace_f2fs_lookup_read_extent_tree_end(inode, pgofs, ei);
+ else if (type == EX_BLOCK_AGE)
+ trace_f2fs_lookup_age_extent_tree_end(inode, pgofs, ei);
return ret;
}
@@ -460,18 +596,20 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
struct extent_node *prev_ex,
struct extent_node *next_ex)
{
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
struct extent_node *en = NULL;
- if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) {
+ if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei, et->type)) {
prev_ex->ei.len += ei->len;
ei = &prev_ex->ei;
en = prev_ex;
}
- if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) {
+ if (next_ex && __is_front_mergeable(ei, &next_ex->ei, et->type)) {
next_ex->ei.fofs = ei->fofs;
- next_ex->ei.blk = ei->blk;
next_ex->ei.len += ei->len;
+ if (et->type == EX_READ)
+ next_ex->ei.blk = ei->blk;
if (en)
__release_extent_node(sbi, et, prev_ex);
@@ -483,12 +621,12 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
__try_update_largest_extent(et, en);
- spin_lock(&sbi->extent_lock);
+ spin_lock(&eti->extent_lock);
if (!list_empty(&en->list)) {
- list_move_tail(&en->list, &sbi->extent_list);
+ list_move_tail(&en->list, &eti->extent_list);
et->cached_en = en;
}
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
return en;
}
@@ -498,6 +636,7 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
struct rb_node *insert_parent,
bool leftmost)
{
+ struct extent_tree_info *eti = &sbi->extent_tree[et->type];
struct rb_node **p;
struct rb_node *parent = NULL;
struct extent_node *en = NULL;
@@ -520,47 +659,54 @@ do_insert:
__try_update_largest_extent(et, en);
/* update in global extent list */
- spin_lock(&sbi->extent_lock);
- list_add_tail(&en->list, &sbi->extent_list);
+ spin_lock(&eti->extent_lock);
+ list_add_tail(&en->list, &eti->extent_list);
et->cached_en = en;
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
return en;
}
-static void f2fs_update_extent_tree_range(struct inode *inode,
- pgoff_t fofs, block_t blkaddr, unsigned int len)
+static void __update_extent_tree_range(struct inode *inode,
+ struct extent_info *tei, enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
struct extent_node *en = NULL, *en1 = NULL;
struct extent_node *prev_en = NULL, *next_en = NULL;
struct extent_info ei, dei, prev;
struct rb_node **insert_p = NULL, *insert_parent = NULL;
+ unsigned int fofs = tei->fofs, len = tei->len;
unsigned int end = fofs + len;
- unsigned int pos = (unsigned int)fofs;
bool updated = false;
bool leftmost = false;
if (!et)
return;
- trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len, 0);
+ if (type == EX_READ)
+ trace_f2fs_update_read_extent_tree_range(inode, fofs, len,
+ tei->blk, 0);
+ else if (type == EX_BLOCK_AGE)
+ trace_f2fs_update_age_extent_tree_range(inode, fofs, len,
+ tei->age, tei->last_blocks);
write_lock(&et->lock);
- if (is_inode_flag_set(inode, FI_NO_EXTENT)) {
- write_unlock(&et->lock);
- return;
- }
+ if (type == EX_READ) {
+ if (is_inode_flag_set(inode, FI_NO_EXTENT)) {
+ write_unlock(&et->lock);
+ return;
+ }
- prev = et->largest;
- dei.len = 0;
+ prev = et->largest;
+ dei.len = 0;
- /*
- * drop largest extent before lookup, in case it's already
- * been shrunk from extent tree
- */
- __drop_largest_extent(et, fofs, len);
+ /*
+ * drop largest extent before lookup, in case it's already
+ * been shrunk from extent tree
+ */
+ __drop_largest_extent(et, fofs, len);
+ }
/* 1. lookup first extent node in range [fofs, fofs + len - 1] */
en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root,
@@ -581,26 +727,32 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
dei = en->ei;
org_end = dei.fofs + dei.len;
- f2fs_bug_on(sbi, pos >= org_end);
+ f2fs_bug_on(sbi, fofs >= org_end);
- if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
- en->ei.len = pos - en->ei.fofs;
+ if (fofs > dei.fofs && (type != EX_READ ||
+ fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN)) {
+ en->ei.len = fofs - en->ei.fofs;
prev_en = en;
parts = 1;
}
- if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) {
+ if (end < org_end && (type != EX_READ ||
+ org_end - end >= F2FS_MIN_EXTENT_LEN)) {
if (parts) {
- set_extent_info(&ei, end,
- end - dei.fofs + dei.blk,
- org_end - end);
+ __set_extent_info(&ei,
+ end, org_end - end,
+ end - dei.fofs + dei.blk, false,
+ dei.age, dei.last_blocks,
+ type);
en1 = __insert_extent_tree(sbi, et, &ei,
NULL, NULL, true);
next_en = en1;
} else {
- en->ei.fofs = end;
- en->ei.blk += end - dei.fofs;
- en->ei.len -= end - dei.fofs;
+ __set_extent_info(&en->ei,
+ end, en->ei.len - (end - dei.fofs),
+ en->ei.blk + (end - dei.fofs), true,
+ dei.age, dei.last_blocks,
+ type);
next_en = en;
}
parts++;
@@ -630,10 +782,15 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
en = next_en;
}
- /* 3. update extent in extent cache */
- if (blkaddr) {
+ if (type == EX_BLOCK_AGE)
+ goto update_age_extent_cache;
- set_extent_info(&ei, fofs, blkaddr, len);
+ /* 3. update extent in read extent cache */
+ BUG_ON(type != EX_READ);
+
+ if (tei->blk) {
+ __set_extent_info(&ei, fofs, len, tei->blk, false,
+ 0, 0, EX_READ);
if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
__insert_extent_tree(sbi, et, &ei,
insert_p, insert_parent, leftmost);
@@ -655,7 +812,17 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
et->largest_updated = false;
updated = true;
}
+ goto out_read_extent_cache;
+update_age_extent_cache:
+ if (!tei->last_blocks)
+ goto out_read_extent_cache;
+ __set_extent_info(&ei, fofs, len, 0, false,
+ tei->age, tei->last_blocks, EX_BLOCK_AGE);
+ if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
+ __insert_extent_tree(sbi, et, &ei,
+ insert_p, insert_parent, leftmost);
+out_read_extent_cache:
write_unlock(&et->lock);
if (updated)
@@ -663,19 +830,20 @@ static void f2fs_update_extent_tree_range(struct inode *inode,
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
-void f2fs_update_extent_tree_range_compressed(struct inode *inode,
+void f2fs_update_read_extent_tree_range_compressed(struct inode *inode,
pgoff_t fofs, block_t blkaddr, unsigned int llen,
unsigned int c_len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ];
struct extent_node *en = NULL;
struct extent_node *prev_en = NULL, *next_en = NULL;
struct extent_info ei;
struct rb_node **insert_p = NULL, *insert_parent = NULL;
bool leftmost = false;
- trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen, c_len);
+ trace_f2fs_update_read_extent_tree_range(inode, fofs, llen,
+ blkaddr, c_len);
/* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */
if (is_inode_flag_set(inode, FI_NO_EXTENT))
@@ -692,7 +860,7 @@ void f2fs_update_extent_tree_range_compressed(struct inode *inode,
if (en)
goto unlock_out;
- set_extent_info(&ei, fofs, blkaddr, llen);
+ __set_extent_info(&ei, fofs, llen, blkaddr, true, 0, 0, EX_READ);
ei.c_len = c_len;
if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en))
@@ -703,24 +871,114 @@ unlock_out:
}
#endif
-unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+static unsigned long long __calculate_block_age(unsigned long long new,
+ unsigned long long old)
+{
+ unsigned long long diff;
+
+ diff = (new >= old) ? new - (new - old) : new + (old - new);
+
+ return div_u64(diff * LAST_AGE_WEIGHT, 100);
+}
+
+/* This returns a new age and allocated blocks in ei */
+static int __get_new_block_age(struct inode *inode, struct extent_info *ei,
+ block_t blkaddr)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ loff_t f_size = i_size_read(inode);
+ unsigned long long cur_blocks =
+ atomic64_read(&sbi->allocated_data_blocks);
+ struct extent_info tei = *ei; /* only fofs and len are valid */
+
+ /*
+ * When I/O is not aligned to a PAGE_SIZE, update will happen to the last
+ * file block even in seq write. So don't record age for newly last file
+ * block here.
+ */
+ if ((f_size >> PAGE_SHIFT) == ei->fofs && f_size & (PAGE_SIZE - 1) &&
+ blkaddr == NEW_ADDR)
+ return -EINVAL;
+
+ if (__lookup_extent_tree(inode, ei->fofs, &tei, EX_BLOCK_AGE)) {
+ unsigned long long cur_age;
+
+ if (cur_blocks >= tei.last_blocks)
+ cur_age = cur_blocks - tei.last_blocks;
+ else
+ /* allocated_data_blocks overflow */
+ cur_age = ULLONG_MAX - tei.last_blocks + cur_blocks;
+
+ if (tei.age)
+ ei->age = __calculate_block_age(cur_age, tei.age);
+ else
+ ei->age = cur_age;
+ ei->last_blocks = cur_blocks;
+ WARN_ON(ei->age > cur_blocks);
+ return 0;
+ }
+
+ f2fs_bug_on(sbi, blkaddr == NULL_ADDR);
+
+ /* the data block was allocated for the first time */
+ if (blkaddr == NEW_ADDR)
+ goto out;
+
+ if (__is_valid_data_blkaddr(blkaddr) &&
+ !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
+ f2fs_bug_on(sbi, 1);
+ return -EINVAL;
+ }
+out:
+ /*
+ * init block age with zero, this can happen when the block age extent
+ * was reclaimed due to memory constraint or system reboot
+ */
+ ei->age = 0;
+ ei->last_blocks = cur_blocks;
+ return 0;
+}
+
+static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type type)
{
+ struct extent_info ei = {};
+
+ if (!__may_extent_tree(dn->inode, type))
+ return;
+
+ ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
+ dn->ofs_in_node;
+ ei.len = 1;
+
+ if (type == EX_READ) {
+ if (dn->data_blkaddr == NEW_ADDR)
+ ei.blk = NULL_ADDR;
+ else
+ ei.blk = dn->data_blkaddr;
+ } else if (type == EX_BLOCK_AGE) {
+ if (__get_new_block_age(dn->inode, &ei, dn->data_blkaddr))
+ return;
+ }
+ __update_extent_tree_range(dn->inode, &ei, type);
+}
+
+static unsigned int __shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink,
+ enum extent_type type)
+{
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
struct extent_tree *et, *next;
struct extent_node *en;
unsigned int node_cnt = 0, tree_cnt = 0;
int remained;
- if (!test_opt(sbi, EXTENT_CACHE))
- return 0;
-
- if (!atomic_read(&sbi->total_zombie_tree))
+ if (!atomic_read(&eti->total_zombie_tree))
goto free_node;
- if (!mutex_trylock(&sbi->extent_tree_lock))
+ if (!mutex_trylock(&eti->extent_tree_lock))
goto out;
/* 1. remove unreferenced extent tree */
- list_for_each_entry_safe(et, next, &sbi->zombie_list, list) {
+ list_for_each_entry_safe(et, next, &eti->zombie_list, list) {
if (atomic_read(&et->node_cnt)) {
write_lock(&et->lock);
node_cnt += __free_extent_tree(sbi, et);
@@ -728,61 +986,137 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
}
f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
list_del_init(&et->list);
- radix_tree_delete(&sbi->extent_tree_root, et->ino);
+ radix_tree_delete(&eti->extent_tree_root, et->ino);
kmem_cache_free(extent_tree_slab, et);
- atomic_dec(&sbi->total_ext_tree);
- atomic_dec(&sbi->total_zombie_tree);
+ atomic_dec(&eti->total_ext_tree);
+ atomic_dec(&eti->total_zombie_tree);
tree_cnt++;
if (node_cnt + tree_cnt >= nr_shrink)
goto unlock_out;
cond_resched();
}
- mutex_unlock(&sbi->extent_tree_lock);
+ mutex_unlock(&eti->extent_tree_lock);
free_node:
/* 2. remove LRU extent entries */
- if (!mutex_trylock(&sbi->extent_tree_lock))
+ if (!mutex_trylock(&eti->extent_tree_lock))
goto out;
remained = nr_shrink - (node_cnt + tree_cnt);
- spin_lock(&sbi->extent_lock);
+ spin_lock(&eti->extent_lock);
for (; remained > 0; remained--) {
- if (list_empty(&sbi->extent_list))
+ if (list_empty(&eti->extent_list))
break;
- en = list_first_entry(&sbi->extent_list,
+ en = list_first_entry(&eti->extent_list,
struct extent_node, list);
et = en->et;
if (!write_trylock(&et->lock)) {
/* refresh this extent node's position in extent list */
- list_move_tail(&en->list, &sbi->extent_list);
+ list_move_tail(&en->list, &eti->extent_list);
continue;
}
list_del_init(&en->list);
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
__detach_extent_node(sbi, et, en);
write_unlock(&et->lock);
node_cnt++;
- spin_lock(&sbi->extent_lock);
+ spin_lock(&eti->extent_lock);
}
- spin_unlock(&sbi->extent_lock);
+ spin_unlock(&eti->extent_lock);
unlock_out:
- mutex_unlock(&sbi->extent_tree_lock);
+ mutex_unlock(&eti->extent_tree_lock);
out:
- trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
+ trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt, type);
return node_cnt + tree_cnt;
}
-unsigned int f2fs_destroy_extent_node(struct inode *inode)
+/* read extent cache operations */
+bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei)
+{
+ if (!__may_extent_tree(inode, EX_READ))
+ return false;
+
+ return __lookup_extent_tree(inode, pgofs, ei, EX_READ);
+}
+
+void f2fs_update_read_extent_cache(struct dnode_of_data *dn)
+{
+ return __update_extent_cache(dn, EX_READ);
+}
+
+void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, block_t blkaddr, unsigned int len)
+{
+ struct extent_info ei = {
+ .fofs = fofs,
+ .len = len,
+ .blk = blkaddr,
+ };
+
+ if (!__may_extent_tree(dn->inode, EX_READ))
+ return;
+
+ __update_extent_tree_range(dn->inode, &ei, EX_READ);
+}
+
+unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+ if (!test_opt(sbi, READ_EXTENT_CACHE))
+ return 0;
+
+ return __shrink_extent_tree(sbi, nr_shrink, EX_READ);
+}
+
+/* block age extent cache operations */
+bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei)
+{
+ if (!__may_extent_tree(inode, EX_BLOCK_AGE))
+ return false;
+
+ return __lookup_extent_tree(inode, pgofs, ei, EX_BLOCK_AGE);
+}
+
+void f2fs_update_age_extent_cache(struct dnode_of_data *dn)
+{
+ return __update_extent_cache(dn, EX_BLOCK_AGE);
+}
+
+void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, unsigned int len)
+{
+ struct extent_info ei = {
+ .fofs = fofs,
+ .len = len,
+ };
+
+ if (!__may_extent_tree(dn->inode, EX_BLOCK_AGE))
+ return;
+
+ __update_extent_tree_range(dn->inode, &ei, EX_BLOCK_AGE);
+}
+
+unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+ if (!test_opt(sbi, AGE_EXTENT_CACHE))
+ return 0;
+
+ return __shrink_extent_tree(sbi, nr_shrink, EX_BLOCK_AGE);
+}
+
+static unsigned int __destroy_extent_node(struct inode *inode,
+ enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
unsigned int node_cnt = 0;
if (!et || !atomic_read(&et->node_cnt))
@@ -795,31 +1129,46 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode)
return node_cnt;
}
-void f2fs_drop_extent_tree(struct inode *inode)
+void f2fs_destroy_extent_node(struct inode *inode)
+{
+ __destroy_extent_node(inode, EX_READ);
+ __destroy_extent_node(inode, EX_BLOCK_AGE);
+}
+
+static void __drop_extent_tree(struct inode *inode, enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
bool updated = false;
- if (!f2fs_may_extent_tree(inode))
+ if (!__may_extent_tree(inode, type))
return;
write_lock(&et->lock);
- set_inode_flag(inode, FI_NO_EXTENT);
__free_extent_tree(sbi, et);
- if (et->largest.len) {
- et->largest.len = 0;
- updated = true;
+ if (type == EX_READ) {
+ set_inode_flag(inode, FI_NO_EXTENT);
+ if (et->largest.len) {
+ et->largest.len = 0;
+ updated = true;
+ }
}
write_unlock(&et->lock);
if (updated)
f2fs_mark_inode_dirty_sync(inode, true);
}
-void f2fs_destroy_extent_tree(struct inode *inode)
+void f2fs_drop_extent_tree(struct inode *inode)
+{
+ __drop_extent_tree(inode, EX_READ);
+ __drop_extent_tree(inode, EX_BLOCK_AGE);
+}
+
+static void __destroy_extent_tree(struct inode *inode, enum extent_type type)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[type];
unsigned int node_cnt = 0;
if (!et)
@@ -827,76 +1176,56 @@ void f2fs_destroy_extent_tree(struct inode *inode)
if (inode->i_nlink && !is_bad_inode(inode) &&
atomic_read(&et->node_cnt)) {
- mutex_lock(&sbi->extent_tree_lock);
- list_add_tail(&et->list, &sbi->zombie_list);
- atomic_inc(&sbi->total_zombie_tree);
- mutex_unlock(&sbi->extent_tree_lock);
+ mutex_lock(&eti->extent_tree_lock);
+ list_add_tail(&et->list, &eti->zombie_list);
+ atomic_inc(&eti->total_zombie_tree);
+ mutex_unlock(&eti->extent_tree_lock);
return;
}
/* free all extent info belong to this extent tree */
- node_cnt = f2fs_destroy_extent_node(inode);
+ node_cnt = __destroy_extent_node(inode, type);
/* delete extent tree entry in radix tree */
- mutex_lock(&sbi->extent_tree_lock);
+ mutex_lock(&eti->extent_tree_lock);
f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
- radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
+ radix_tree_delete(&eti->extent_tree_root, inode->i_ino);
kmem_cache_free(extent_tree_slab, et);
- atomic_dec(&sbi->total_ext_tree);
- mutex_unlock(&sbi->extent_tree_lock);
+ atomic_dec(&eti->total_ext_tree);
+ mutex_unlock(&eti->extent_tree_lock);
- F2FS_I(inode)->extent_tree = NULL;
+ F2FS_I(inode)->extent_tree[type] = NULL;
- trace_f2fs_destroy_extent_tree(inode, node_cnt);
+ trace_f2fs_destroy_extent_tree(inode, node_cnt, type);
}
-bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
- struct extent_info *ei)
-{
- if (!f2fs_may_extent_tree(inode))
- return false;
-
- return f2fs_lookup_extent_tree(inode, pgofs, ei);
-}
-
-void f2fs_update_extent_cache(struct dnode_of_data *dn)
+void f2fs_destroy_extent_tree(struct inode *inode)
{
- pgoff_t fofs;
- block_t blkaddr;
-
- if (!f2fs_may_extent_tree(dn->inode))
- return;
-
- if (dn->data_blkaddr == NEW_ADDR)
- blkaddr = NULL_ADDR;
- else
- blkaddr = dn->data_blkaddr;
-
- fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
- dn->ofs_in_node;
- f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1);
+ __destroy_extent_tree(inode, EX_READ);
+ __destroy_extent_tree(inode, EX_BLOCK_AGE);
}
-void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
- pgoff_t fofs, block_t blkaddr, unsigned int len)
-
+static void __init_extent_tree_info(struct extent_tree_info *eti)
{
- if (!f2fs_may_extent_tree(dn->inode))
- return;
-
- f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len);
+ INIT_RADIX_TREE(&eti->extent_tree_root, GFP_NOIO);
+ mutex_init(&eti->extent_tree_lock);
+ INIT_LIST_HEAD(&eti->extent_list);
+ spin_lock_init(&eti->extent_lock);
+ atomic_set(&eti->total_ext_tree, 0);
+ INIT_LIST_HEAD(&eti->zombie_list);
+ atomic_set(&eti->total_zombie_tree, 0);
+ atomic_set(&eti->total_ext_node, 0);
}
void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi)
{
- INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO);
- mutex_init(&sbi->extent_tree_lock);
- INIT_LIST_HEAD(&sbi->extent_list);
- spin_lock_init(&sbi->extent_lock);
- atomic_set(&sbi->total_ext_tree, 0);
- INIT_LIST_HEAD(&sbi->zombie_list);
- atomic_set(&sbi->total_zombie_tree, 0);
- atomic_set(&sbi->total_ext_node, 0);
+ __init_extent_tree_info(&sbi->extent_tree[EX_READ]);
+ __init_extent_tree_info(&sbi->extent_tree[EX_BLOCK_AGE]);
+
+ /* initialize for block age extents */
+ atomic64_set(&sbi->allocated_data_blocks, 0);
+ sbi->hot_data_age_threshold = DEF_HOT_DATA_AGE_THRESHOLD;
+ sbi->warm_data_age_threshold = DEF_WARM_DATA_AGE_THRESHOLD;
}
int __init f2fs_create_extent_cache(void)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index e6355a5683b7..e8953c3dc81a 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -60,6 +60,7 @@ enum {
FAULT_SLAB_ALLOC,
FAULT_DQUOT_INIT,
FAULT_LOCK_OP,
+ FAULT_BLKADDR,
FAULT_MAX,
};
@@ -91,7 +92,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_FLUSH_MERGE 0x00000400
#define F2FS_MOUNT_NOBARRIER 0x00000800
#define F2FS_MOUNT_FASTBOOT 0x00001000
-#define F2FS_MOUNT_EXTENT_CACHE 0x00002000
+#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00002000
#define F2FS_MOUNT_DATA_FLUSH 0x00008000
#define F2FS_MOUNT_FAULT_INJECTION 0x00010000
#define F2FS_MOUNT_USRQUOTA 0x00080000
@@ -106,6 +107,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000
#define F2FS_MOUNT_GC_MERGE 0x20000000
#define F2FS_MOUNT_COMPRESS_CACHE 0x40000000
+#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x80000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -202,10 +204,6 @@ struct f2fs_mount_info {
#define __F2FS_HAS_FEATURE(raw_super, mask) \
((raw_super->feature & cpu_to_le32(mask)) != 0)
#define F2FS_HAS_FEATURE(sbi, mask) __F2FS_HAS_FEATURE(sbi->raw_super, mask)
-#define F2FS_SET_FEATURE(sbi, mask) \
- (sbi->raw_super->feature |= cpu_to_le32(mask))
-#define F2FS_CLEAR_FEATURE(sbi, mask) \
- (sbi->raw_super->feature &= ~cpu_to_le32(mask))
/*
* Default values for user and/or group using reserved blocks
@@ -328,8 +326,12 @@ struct discard_entry {
unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */
};
+/* minimum discard granularity, unit: block count */
+#define MIN_DISCARD_GRANULARITY 1
/* default discard granularity of inner discard thread, unit: block count */
#define DEFAULT_DISCARD_GRANULARITY 16
+/* default maximum discard granularity of ordered discard, unit: block count */
+#define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16
/* max discard pend list number */
#define MAX_PLIST_NUM 512
@@ -408,7 +410,9 @@ struct discard_cmd_control {
unsigned int min_discard_issue_time; /* min. interval between discard issue */
unsigned int mid_discard_issue_time; /* mid. interval between discard issue */
unsigned int max_discard_issue_time; /* max. interval between discard issue */
+ unsigned int discard_urgent_util; /* utilization which issue discard proactively */
unsigned int discard_granularity; /* discard granularity */
+ unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */
unsigned int undiscard_blks; /* # of undiscard blocks */
unsigned int next_pos; /* next discard position */
atomic_t issued_discard; /* # of issued discard */
@@ -593,16 +597,35 @@ enum {
/* dirty segments threshold for triggering CP */
#define DEFAULT_DIRTY_THRESHOLD 4
+#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS
+#define RECOVERY_MIN_RA_BLOCKS 1
+
+#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */
+
/* for in-memory extent cache entry */
#define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */
/* number of extent info in extent cache we try to shrink */
-#define EXTENT_CACHE_SHRINK_NUMBER 128
+#define READ_EXTENT_CACHE_SHRINK_NUMBER 128
-#define RECOVERY_MAX_RA_BLOCKS BIO_MAX_VECS
-#define RECOVERY_MIN_RA_BLOCKS 1
+/* number of age extent info in extent cache we try to shrink */
+#define AGE_EXTENT_CACHE_SHRINK_NUMBER 128
+#define LAST_AGE_WEIGHT 30
+#define SAME_AGE_REGION 1024
-#define F2FS_ONSTACK_PAGES 16 /* nr of onstack pages */
+/*
+ * Define data block with age less than 1GB as hot data
+ * define data block with age less than 10GB but more than 1GB as warm data
+ */
+#define DEF_HOT_DATA_AGE_THRESHOLD 262144
+#define DEF_WARM_DATA_AGE_THRESHOLD 2621440
+
+/* extent cache type */
+enum extent_type {
+ EX_READ,
+ EX_BLOCK_AGE,
+ NR_EXTENT_CACHES,
+};
struct rb_entry {
struct rb_node rb_node; /* rb node located in rb-tree */
@@ -618,10 +641,24 @@ struct rb_entry {
struct extent_info {
unsigned int fofs; /* start offset in a file */
unsigned int len; /* length of the extent */
- u32 blk; /* start block address of the extent */
+ union {
+ /* read extent_cache */
+ struct {
+ /* start block address of the extent */
+ block_t blk;
#ifdef CONFIG_F2FS_FS_COMPRESSION
- unsigned int c_len; /* physical extent length of compressed blocks */
+ /* physical extent length of compressed blocks */
+ unsigned int c_len;
#endif
+ };
+ /* block age extent_cache */
+ struct {
+ /* block age of the extent */
+ unsigned long long age;
+ /* last total blocks allocated */
+ unsigned long long last_blocks;
+ };
+ };
};
struct extent_node {
@@ -633,13 +670,25 @@ struct extent_node {
struct extent_tree {
nid_t ino; /* inode number */
+ enum extent_type type; /* keep the extent tree type */
struct rb_root_cached root; /* root of extent info rb-tree */
struct extent_node *cached_en; /* recently accessed extent node */
- struct extent_info largest; /* largested extent info */
struct list_head list; /* to be used by sbi->zombie_list */
rwlock_t lock; /* protect extent info rb-tree */
atomic_t node_cnt; /* # of extent node in rb-tree*/
bool largest_updated; /* largest extent updated */
+ struct extent_info largest; /* largest cached extent for EX_READ */
+};
+
+struct extent_tree_info {
+ struct radix_tree_root extent_tree_root;/* cache extent cache entries */
+ struct mutex extent_tree_lock; /* locking extent radix tree */
+ struct list_head extent_list; /* lru list for shrinker */
+ spinlock_t extent_lock; /* locking extent lru list */
+ atomic_t total_ext_tree; /* extent tree count */
+ struct list_head zombie_list; /* extent zombie tree list */
+ atomic_t total_zombie_tree; /* extent zombie tree count */
+ atomic_t total_ext_node; /* extent info count */
};
/*
@@ -764,6 +813,8 @@ enum {
FI_COMPRESS_RELEASED, /* compressed blocks were released */
FI_ALIGNED_WRITE, /* enable aligned write */
FI_COW_FILE, /* indicate COW file */
+ FI_ATOMIC_COMMITTED, /* indicate atomic commit completed except disk sync */
+ FI_ATOMIC_REPLACE, /* indicate atomic replace */
FI_MAX, /* max flag, never be used */
};
@@ -800,7 +851,8 @@ struct f2fs_inode_info {
struct list_head dirty_list; /* dirty list for dirs and files */
struct list_head gdirty_list; /* linked in global dirty list */
struct task_struct *atomic_write_task; /* store atomic write task */
- struct extent_tree *extent_tree; /* cached extent_tree entry */
+ struct extent_tree *extent_tree[NR_EXTENT_CACHES];
+ /* cached extent_tree entry */
struct inode *cow_inode; /* copy-on-write inode for atomic write */
/* avoid racing between foreground op and gc */
@@ -822,9 +874,10 @@ struct f2fs_inode_info {
unsigned int i_cluster_size; /* cluster size */
unsigned int atomic_write_cnt;
+ loff_t original_i_size; /* original i_size before atomic write */
};
-static inline void get_extent_info(struct extent_info *ext,
+static inline void get_read_extent_info(struct extent_info *ext,
struct f2fs_extent *i_ext)
{
ext->fofs = le32_to_cpu(i_ext->fofs);
@@ -832,7 +885,7 @@ static inline void get_extent_info(struct extent_info *ext,
ext->len = le32_to_cpu(i_ext->len);
}
-static inline void set_raw_extent(struct extent_info *ext,
+static inline void set_raw_read_extent(struct extent_info *ext,
struct f2fs_extent *i_ext)
{
i_ext->fofs = cpu_to_le32(ext->fofs);
@@ -840,17 +893,6 @@ static inline void set_raw_extent(struct extent_info *ext,
i_ext->len = cpu_to_le32(ext->len);
}
-static inline void set_extent_info(struct extent_info *ei, unsigned int fofs,
- u32 blk, unsigned int len)
-{
- ei->fofs = fofs;
- ei->blk = blk;
- ei->len = len;
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- ei->c_len = 0;
-#endif
-}
-
static inline bool __is_discard_mergeable(struct discard_info *back,
struct discard_info *front, unsigned int max_len)
{
@@ -870,41 +912,6 @@ static inline bool __is_discard_front_mergeable(struct discard_info *cur,
return __is_discard_mergeable(cur, front, max_len);
}
-static inline bool __is_extent_mergeable(struct extent_info *back,
- struct extent_info *front)
-{
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (back->c_len && back->len != back->c_len)
- return false;
- if (front->c_len && front->len != front->c_len)
- return false;
-#endif
- return (back->fofs + back->len == front->fofs &&
- back->blk + back->len == front->blk);
-}
-
-static inline bool __is_back_mergeable(struct extent_info *cur,
- struct extent_info *back)
-{
- return __is_extent_mergeable(back, cur);
-}
-
-static inline bool __is_front_mergeable(struct extent_info *cur,
- struct extent_info *front)
-{
- return __is_extent_mergeable(cur, front);
-}
-
-extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync);
-static inline void __try_update_largest_extent(struct extent_tree *et,
- struct extent_node *en)
-{
- if (en->ei.len > et->largest.len) {
- et->largest = en->ei;
- et->largest_updated = true;
- }
-}
-
/*
* For free nid management
*/
@@ -1062,9 +1069,6 @@ struct f2fs_sm_info {
/* a threshold to reclaim prefree segments */
unsigned int rec_prefree_segments;
- /* for batched trimming */
- unsigned int trim_sections; /* # of sections to trim */
-
struct list_head sit_entry_set; /* sit entry set list */
unsigned int ipu_policy; /* in-place-update policy */
@@ -1318,6 +1322,7 @@ enum {
MAX_TIME,
};
+/* Note that you need to keep synchronization with this gc_mode_names array */
enum {
GC_NORMAL,
GC_IDLE_CB,
@@ -1668,14 +1673,12 @@ struct f2fs_sb_info {
struct mutex flush_lock; /* for flush exclusion */
/* for extent tree cache */
- struct radix_tree_root extent_tree_root;/* cache extent cache entries */
- struct mutex extent_tree_lock; /* locking extent radix tree */
- struct list_head extent_list; /* lru list for shrinker */
- spinlock_t extent_lock; /* locking extent lru list */
- atomic_t total_ext_tree; /* extent tree count */
- struct list_head zombie_list; /* extent zombie tree list */
- atomic_t total_zombie_tree; /* extent zombie tree count */
- atomic_t total_ext_node; /* extent info count */
+ struct extent_tree_info extent_tree[NR_EXTENT_CACHES];
+ atomic64_t allocated_data_blocks; /* for block age extent_cache */
+
+ /* The threshold used for hot and warm data seperation*/
+ unsigned int hot_data_age_threshold;
+ unsigned int warm_data_age_threshold;
/* basic filesystem units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
@@ -1693,7 +1696,7 @@ struct f2fs_sb_info {
unsigned int total_node_count; /* total node block count */
unsigned int total_valid_node_count; /* valid node block count */
int dir_level; /* directory level */
- int readdir_ra; /* readahead inode in readdir */
+ bool readdir_ra; /* readahead inode in readdir */
u64 max_io_bytes; /* max io bytes to merge IOs */
block_t user_block_count; /* # of user blocks */
@@ -1734,8 +1737,9 @@ struct f2fs_sb_info {
unsigned int cur_victim_sec; /* current victim section num */
unsigned int gc_mode; /* current GC state */
unsigned int next_victim_seg[2]; /* next segment in victim section */
- spinlock_t gc_urgent_high_lock;
- unsigned int gc_urgent_high_remaining; /* remaining trial count for GC_URGENT_HIGH */
+ spinlock_t gc_remaining_trials_lock;
+ /* remaining trial count for GC_URGENT_* and GC_IDLE_* */
+ unsigned int gc_remaining_trials;
/* for skip statistic */
unsigned long long skipped_gc_rwsem; /* FG_GC only */
@@ -1759,10 +1763,14 @@ struct f2fs_sb_info {
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
atomic_t inplace_count; /* # of inplace update */
- atomic64_t total_hit_ext; /* # of lookup extent cache */
- atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */
- atomic64_t read_hit_largest; /* # of hit largest extent node */
- atomic64_t read_hit_cached; /* # of hit cached extent node */
+ /* # of lookup extent cache */
+ atomic64_t total_hit_ext[NR_EXTENT_CACHES];
+ /* # of hit rbtree extent node */
+ atomic64_t read_hit_rbtree[NR_EXTENT_CACHES];
+ /* # of hit cached extent node */
+ atomic64_t read_hit_cached[NR_EXTENT_CACHES];
+ /* # of hit largest extent node in read extent cache */
+ atomic64_t read_hit_largest;
atomic_t inline_xattr; /* # of inline_xattr inodes */
atomic_t inline_inode; /* # of inline_data inodes */
atomic_t inline_dir; /* # of inline_dentry inodes */
@@ -2576,6 +2584,7 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
}
+extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync);
static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
struct inode *inode, bool is_inode)
{
@@ -2974,7 +2983,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
/* Flags that should be inherited by new inodes from their parent. */
#define F2FS_FL_INHERITED (F2FS_SYNC_FL | F2FS_NODUMP_FL | F2FS_NOATIME_FL | \
F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \
- F2FS_CASEFOLD_FL | F2FS_COMPR_FL | F2FS_NOCOMP_FL)
+ F2FS_CASEFOLD_FL)
/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define F2FS_REG_FLMASK (~(F2FS_DIRSYNC_FL | F2FS_PROJINHERIT_FL | \
@@ -3072,6 +3081,8 @@ static inline void f2fs_i_blocks_write(struct inode *inode,
set_inode_flag(inode, FI_AUTO_RECOVER);
}
+static inline bool f2fs_is_atomic_file(struct inode *inode);
+
static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size)
{
bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE);
@@ -3081,6 +3092,10 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size)
return;
i_size_write(inode, i_size);
+
+ if (f2fs_is_atomic_file(inode))
+ return;
+
f2fs_mark_inode_dirty_sync(inode, true);
if (clean || recover)
set_inode_flag(inode, FI_AUTO_RECOVER);
@@ -3796,8 +3811,9 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn);
int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index);
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
- blk_opf_t op_flags, bool for_write);
-struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index);
+ blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs);
+struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index,
+ pgoff_t *next_pgofs);
struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
bool for_write);
struct page *f2fs_get_new_data_page(struct inode *inode,
@@ -3856,9 +3872,19 @@ struct f2fs_stat_info {
struct f2fs_sb_info *sbi;
int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
int main_area_segs, main_area_sections, main_area_zones;
- unsigned long long hit_largest, hit_cached, hit_rbtree;
- unsigned long long hit_total, total_ext;
- int ext_tree, zombie_tree, ext_node;
+ unsigned long long hit_cached[NR_EXTENT_CACHES];
+ unsigned long long hit_rbtree[NR_EXTENT_CACHES];
+ unsigned long long total_ext[NR_EXTENT_CACHES];
+ unsigned long long hit_total[NR_EXTENT_CACHES];
+ int ext_tree[NR_EXTENT_CACHES];
+ int zombie_tree[NR_EXTENT_CACHES];
+ int ext_node[NR_EXTENT_CACHES];
+ /* to count memory footprint */
+ unsigned long long ext_mem[NR_EXTENT_CACHES];
+ /* for read extent cache */
+ unsigned long long hit_largest;
+ /* for block age extent cache */
+ unsigned long long allocated_data_blocks;
int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta;
int ndirty_data, ndirty_qdata;
unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all;
@@ -3917,10 +3943,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
#define stat_other_skip_bggc_count(sbi) ((sbi)->other_skip_bggc++)
#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++)
#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--)
-#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext))
-#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree))
+#define stat_inc_total_hit(sbi, type) (atomic64_inc(&(sbi)->total_hit_ext[type]))
+#define stat_inc_rbtree_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_rbtree[type]))
#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest))
-#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached))
+#define stat_inc_cached_node_hit(sbi, type) (atomic64_inc(&(sbi)->read_hit_cached[type]))
#define stat_inc_inline_xattr(inode) \
do { \
if (f2fs_has_inline_xattr(inode)) \
@@ -4043,10 +4069,10 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi);
#define stat_other_skip_bggc_count(sbi) do { } while (0)
#define stat_inc_dirty_inode(sbi, type) do { } while (0)
#define stat_dec_dirty_inode(sbi, type) do { } while (0)
-#define stat_inc_total_hit(sbi) do { } while (0)
-#define stat_inc_rbtree_node_hit(sbi) do { } while (0)
+#define stat_inc_total_hit(sbi, type) do { } while (0)
+#define stat_inc_rbtree_node_hit(sbi, type) do { } while (0)
#define stat_inc_largest_node_hit(sbi) do { } while (0)
-#define stat_inc_cached_node_hit(sbi) do { } while (0)
+#define stat_inc_cached_node_hit(sbi, type) do { } while (0)
#define stat_inc_inline_xattr(inode) do { } while (0)
#define stat_dec_inline_xattr(inode) do { } while (0)
#define stat_inc_inline_inode(inode) do { } while (0)
@@ -4152,20 +4178,34 @@ struct rb_entry *f2fs_lookup_rb_tree_ret(struct rb_root_cached *root,
bool force, bool *leftmost);
bool f2fs_check_rb_tree_consistence(struct f2fs_sb_info *sbi,
struct rb_root_cached *root, bool check_key);
-unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink);
-void f2fs_init_extent_tree(struct inode *inode, struct page *ipage);
+void f2fs_init_extent_tree(struct inode *inode);
void f2fs_drop_extent_tree(struct inode *inode);
-unsigned int f2fs_destroy_extent_node(struct inode *inode);
+void f2fs_destroy_extent_node(struct inode *inode);
void f2fs_destroy_extent_tree(struct inode *inode);
-bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
- struct extent_info *ei);
-void f2fs_update_extent_cache(struct dnode_of_data *dn);
-void f2fs_update_extent_cache_range(struct dnode_of_data *dn,
- pgoff_t fofs, block_t blkaddr, unsigned int len);
void f2fs_init_extent_cache_info(struct f2fs_sb_info *sbi);
int __init f2fs_create_extent_cache(void);
void f2fs_destroy_extent_cache(void);
+/* read extent cache ops */
+void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage);
+bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei);
+void f2fs_update_read_extent_cache(struct dnode_of_data *dn);
+void f2fs_update_read_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, block_t blkaddr, unsigned int len);
+unsigned int f2fs_shrink_read_extent_tree(struct f2fs_sb_info *sbi,
+ int nr_shrink);
+
+/* block age extent cache ops */
+void f2fs_init_age_extent_tree(struct inode *inode);
+bool f2fs_lookup_age_extent_cache(struct inode *inode, pgoff_t pgofs,
+ struct extent_info *ei);
+void f2fs_update_age_extent_cache(struct dnode_of_data *dn);
+void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn,
+ pgoff_t fofs, unsigned int len);
+unsigned int f2fs_shrink_age_extent_tree(struct f2fs_sb_info *sbi,
+ int nr_shrink);
+
/*
* sysfs.c
*/
@@ -4235,9 +4275,9 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
struct writeback_control *wbc,
enum iostat_type io_type);
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index);
-void f2fs_update_extent_tree_range_compressed(struct inode *inode,
- pgoff_t fofs, block_t blkaddr, unsigned int llen,
- unsigned int c_len);
+void f2fs_update_read_extent_tree_range_compressed(struct inode *inode,
+ pgoff_t fofs, block_t blkaddr,
+ unsigned int llen, unsigned int c_len);
int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
bool is_readahead, bool for_write);
@@ -4318,9 +4358,10 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi,
static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi,
nid_t ino) { }
#define inc_compr_inode_stat(inode) do { } while (0)
-static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode,
- pgoff_t fofs, block_t blkaddr, unsigned int llen,
- unsigned int c_len) { }
+static inline void f2fs_update_read_extent_tree_range_compressed(
+ struct inode *inode,
+ pgoff_t fofs, block_t blkaddr,
+ unsigned int llen, unsigned int c_len) { }
#endif
static inline int set_compress_context(struct inode *inode)
@@ -4371,7 +4412,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode)
}
#define F2FS_FEATURE_FUNCS(name, flagname) \
-static inline int f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \
+static inline bool f2fs_sb_has_##name(struct f2fs_sb_info *sbi) \
{ \
return F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_##flagname); \
}
@@ -4391,26 +4432,6 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD);
F2FS_FEATURE_FUNCS(compression, COMPRESSION);
F2FS_FEATURE_FUNCS(readonly, RO);
-static inline bool f2fs_may_extent_tree(struct inode *inode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-
- if (!test_opt(sbi, EXTENT_CACHE) ||
- is_inode_flag_set(inode, FI_NO_EXTENT) ||
- (is_inode_flag_set(inode, FI_COMPRESSED_FILE) &&
- !f2fs_sb_has_readonly(sbi)))
- return false;
-
- /*
- * for recovered files during mount do not create extents
- * if shrinker is not registered.
- */
- if (list_empty(&sbi->s_list))
- return false;
-
- return S_ISREG(inode->i_mode);
-}
-
#ifdef CONFIG_BLK_DEV_ZONED
static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi,
block_t blkaddr)
@@ -4563,6 +4584,11 @@ static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, pgoff_t ofs,
}
}
+static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi)
+{
+ return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb);
+}
+
#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 83df6f6173d3..ecbc8c135b49 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -571,7 +571,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
raw_node = F2FS_NODE(dn->node_page);
addr = blkaddr_in_node(raw_node) + base + ofs;
- /* Assumption: truncateion starts with cluster */
+ /* Assumption: truncation starts with cluster */
for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) {
block_t blkaddr = le32_to_cpu(*addr);
@@ -618,7 +618,8 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
*/
fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page),
dn->inode) + ofs;
- f2fs_update_extent_cache_range(dn, fofs, 0, len);
+ f2fs_update_read_extent_cache_range(dn, fofs, 0, len);
+ f2fs_update_age_extent_cache_range(dn, fofs, nr_free);
dec_valid_block_count(sbi, dn->inode, nr_free);
}
dn->ofs_in_node = ofs;
@@ -1496,7 +1497,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
f2fs_set_data_blkaddr(dn);
}
- f2fs_update_extent_cache_range(dn, start, 0, index - start);
+ f2fs_update_read_extent_cache_range(dn, start, 0, index - start);
return ret;
}
@@ -1915,6 +1916,10 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
if (!f2fs_disable_compressed_file(inode))
return -EINVAL;
} else {
+ /* try to convert inline_data to support compression */
+ int err = f2fs_convert_inline_inode(inode);
+ if (err)
+ return err;
if (!f2fs_may_compress(inode))
return -EINVAL;
if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))
@@ -2030,13 +2035,14 @@ static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
return put_user(inode->i_generation, (int __user *)arg);
}
-static int f2fs_ioc_start_atomic_write(struct file *filp)
+static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate)
{
struct inode *inode = file_inode(filp);
struct user_namespace *mnt_userns = file_mnt_user_ns(filp);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct inode *pinode;
+ loff_t isize;
int ret;
if (!inode_owner_or_capable(mnt_userns, inode))
@@ -2095,13 +2101,25 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
goto out;
}
- f2fs_i_size_write(fi->cow_inode, i_size_read(inode));
+
+ f2fs_write_inode(inode, NULL);
stat_inc_atomic_inode(inode);
set_inode_flag(inode, FI_ATOMIC_FILE);
set_inode_flag(fi->cow_inode, FI_COW_FILE);
clear_inode_flag(fi->cow_inode, FI_INLINE_DATA);
+
+ isize = i_size_read(inode);
+ fi->original_i_size = isize;
+ if (truncate) {
+ set_inode_flag(inode, FI_ATOMIC_REPLACE);
+ truncate_inode_pages_final(inode->i_mapping);
+ f2fs_i_size_write(inode, 0);
+ isize = 0;
+ }
+ f2fs_i_size_write(fi->cow_inode, isize);
+
f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
f2fs_update_time(sbi, REQ_TIME);
@@ -2133,16 +2151,14 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
if (f2fs_is_atomic_file(inode)) {
ret = f2fs_commit_atomic_write(inode);
- if (ret)
- goto unlock_out;
-
- ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
if (!ret)
- f2fs_abort_atomic_write(inode, false);
+ ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true);
+
+ f2fs_abort_atomic_write(inode, ret);
} else {
ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false);
}
-unlock_out:
+
inode_unlock(inode);
mnt_drop_write_file(filp);
return ret;
@@ -2543,7 +2559,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
struct f2fs_map_blocks map = { .m_next_extent = NULL,
.m_seg_type = NO_CHECK_TYPE,
.m_may_create = false };
- struct extent_info ei = {0, 0, 0};
+ struct extent_info ei = {};
pgoff_t pg_start, pg_end, next_pgofs;
unsigned int blk_per_seg = sbi->blocks_per_seg;
unsigned int total = 0, sec_num;
@@ -2575,7 +2591,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
* lookup mapping info in extent cache, skip defragmenting if physical
* block addresses are continuous.
*/
- if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, pg_start, &ei)) {
if (ei.fofs + ei.len >= pg_end)
goto out;
}
@@ -4131,7 +4147,9 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case FS_IOC_GETVERSION:
return f2fs_ioc_getversion(filp, arg);
case F2FS_IOC_START_ATOMIC_WRITE:
- return f2fs_ioc_start_atomic_write(filp);
+ return f2fs_ioc_start_atomic_write(filp, false);
+ case F2FS_IOC_START_ATOMIC_REPLACE:
+ return f2fs_ioc_start_atomic_write(filp, true);
case F2FS_IOC_COMMIT_ATOMIC_WRITE:
return f2fs_ioc_commit_atomic_write(filp);
case F2FS_IOC_ABORT_ATOMIC_WRITE:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 536d332d9e2e..6e2cae3d2e71 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -96,16 +96,6 @@ static int gc_thread_func(void *data)
* invalidated soon after by user update or deletion.
* So, I'd like to wait some time to collect dirty segments.
*/
- if (sbi->gc_mode == GC_URGENT_HIGH) {
- spin_lock(&sbi->gc_urgent_high_lock);
- if (sbi->gc_urgent_high_remaining) {
- sbi->gc_urgent_high_remaining--;
- if (!sbi->gc_urgent_high_remaining)
- sbi->gc_mode = GC_NORMAL;
- }
- spin_unlock(&sbi->gc_urgent_high_lock);
- }
-
if (sbi->gc_mode == GC_URGENT_HIGH ||
sbi->gc_mode == GC_URGENT_MID) {
wait_ms = gc_th->urgent_sleep_time;
@@ -151,6 +141,10 @@ do_gc:
/* don't bother wait_ms by foreground gc */
if (!foreground)
wait_ms = gc_th->no_gc_sleep_time;
+ } else {
+ /* reset wait_ms to default sleep time */
+ if (wait_ms == gc_th->no_gc_sleep_time)
+ wait_ms = gc_th->min_sleep_time;
}
if (foreground)
@@ -162,6 +156,15 @@ do_gc:
/* balancing f2fs's metadata periodically */
f2fs_balance_fs_bg(sbi, true);
next:
+ if (sbi->gc_mode != GC_NORMAL) {
+ spin_lock(&sbi->gc_remaining_trials_lock);
+ if (sbi->gc_remaining_trials) {
+ sbi->gc_remaining_trials--;
+ if (!sbi->gc_remaining_trials)
+ sbi->gc_mode = GC_NORMAL;
+ }
+ spin_unlock(&sbi->gc_remaining_trials_lock);
+ }
sb_end_write(sbi->sb);
} while (!kthread_should_stop());
@@ -172,13 +175,10 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
{
struct f2fs_gc_kthread *gc_th;
dev_t dev = sbi->sb->s_bdev->bd_dev;
- int err = 0;
gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
- if (!gc_th) {
- err = -ENOMEM;
- goto out;
- }
+ if (!gc_th)
+ return -ENOMEM;
gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
@@ -193,12 +193,14 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
"f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(gc_th->f2fs_gc_task)) {
- err = PTR_ERR(gc_th->f2fs_gc_task);
+ int err = PTR_ERR(gc_th->f2fs_gc_task);
+
kfree(gc_th);
sbi->gc_thread = NULL;
+ return err;
}
-out:
- return err;
+
+ return 0;
}
void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
@@ -1079,7 +1081,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
{
struct page *node_page;
nid_t nid;
- unsigned int ofs_in_node, max_addrs;
+ unsigned int ofs_in_node, max_addrs, base;
block_t source_blkaddr;
nid = le32_to_cpu(sum->nid);
@@ -1105,11 +1107,18 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
return false;
}
- max_addrs = IS_INODE(node_page) ? DEF_ADDRS_PER_INODE :
- DEF_ADDRS_PER_BLOCK;
- if (ofs_in_node >= max_addrs) {
- f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u",
- ofs_in_node, dni->ino, dni->nid, max_addrs);
+ if (IS_INODE(node_page)) {
+ base = offset_in_addr(F2FS_INODE(node_page));
+ max_addrs = DEF_ADDRS_PER_INODE;
+ } else {
+ base = 0;
+ max_addrs = DEF_ADDRS_PER_BLOCK;
+ }
+
+ if (base + ofs_in_node >= max_addrs) {
+ f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u",
+ base, ofs_in_node, max_addrs, dni->ino, dni->nid);
+ f2fs_put_page(node_page, 1);
return false;
}
@@ -1141,7 +1150,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
struct page *page;
- struct extent_info ei = {0, 0, 0};
+ struct extent_info ei = {0, };
struct f2fs_io_info fio = {
.sbi = sbi,
.ino = inode->i_ino,
@@ -1159,7 +1168,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
if (!page)
return -ENOMEM;
- if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+ if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
dn.data_blkaddr = ei.blk + index - ei.fofs;
if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ))) {
@@ -1563,8 +1572,8 @@ next_step:
continue;
}
- data_page = f2fs_get_read_data_page(inode,
- start_bidx, REQ_RAHEAD, true);
+ data_page = f2fs_get_read_data_page(inode, start_bidx,
+ REQ_RAHEAD, true, NULL);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
if (IS_ERR(data_page)) {
iput(inode);
@@ -1744,8 +1753,9 @@ freed:
get_valid_blocks(sbi, segno, false) == 0)
seg_freed++;
- if (__is_large_section(sbi) && segno + 1 < end_segno)
- sbi->next_victim_seg[gc_type] = segno + 1;
+ if (__is_large_section(sbi))
+ sbi->next_victim_seg[gc_type] =
+ (segno + 1 < end_segno) ? segno + 1 : NULL_SEGNO;
skip:
f2fs_put_page(sum_page, 0);
}
@@ -1898,9 +1908,7 @@ int __init f2fs_create_garbage_collection_cache(void)
{
victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry",
sizeof(struct victim_entry));
- if (!victim_entry_slab)
- return -ENOMEM;
- return 0;
+ return victim_entry_slab ? 0 : -ENOMEM;
}
void f2fs_destroy_garbage_collection_cache(void)
@@ -2133,8 +2141,6 @@ out_unlock:
if (err)
return err;
- set_sbi_flag(sbi, SBI_IS_RESIZEFS);
-
freeze_super(sbi->sb);
f2fs_down_write(&sbi->gc_lock);
f2fs_down_write(&sbi->cp_global_sem);
@@ -2150,6 +2156,7 @@ out_unlock:
if (err)
goto out_err;
+ set_sbi_flag(sbi, SBI_IS_RESIZEFS);
err = free_segment_range(sbi, secs, false);
if (err)
goto recover_out;
@@ -2173,6 +2180,7 @@ out_unlock:
f2fs_commit_super(sbi, false);
}
recover_out:
+ clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
if (err) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_err(sbi, "resize_fs failed, should run fsck to repair!");
@@ -2185,6 +2193,5 @@ out_err:
f2fs_up_write(&sbi->cp_global_sem);
f2fs_up_write(&sbi->gc_lock);
thaw_super(sbi->sb);
- clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
return err;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 9f0d3864d9f1..ff6cf66ed46b 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -262,8 +262,8 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
return false;
}
- if (fi->extent_tree) {
- struct extent_info *ei = &fi->extent_tree->largest;
+ if (fi->extent_tree[EX_READ]) {
+ struct extent_info *ei = &fi->extent_tree[EX_READ]->largest;
if (ei->len &&
(!f2fs_is_valid_blkaddr(sbi, ei->blk,
@@ -392,8 +392,6 @@ static int do_read_inode(struct inode *inode)
fi->i_pino = le32_to_cpu(ri->i_pino);
fi->i_dir_level = ri->i_dir_level;
- f2fs_init_extent_tree(inode, node_page);
-
get_inline_info(inode, ri);
fi->i_extra_isize = f2fs_has_extra_attr(inode) ?
@@ -479,6 +477,11 @@ static int do_read_inode(struct inode *inode)
}
init_idisk_time(inode);
+
+ /* Need all the flag bits */
+ f2fs_init_read_extent_tree(inode, node_page);
+ f2fs_init_age_extent_tree(inode);
+
f2fs_put_page(node_page, 1);
stat_inc_inline_xattr(inode);
@@ -607,7 +610,7 @@ retry:
void f2fs_update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
- struct extent_tree *et = F2FS_I(inode)->extent_tree;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree[EX_READ];
f2fs_wait_on_page_writeback(node_page, NODE, true, true);
set_page_dirty(node_page);
@@ -621,12 +624,15 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
ri->i_uid = cpu_to_le32(i_uid_read(inode));
ri->i_gid = cpu_to_le32(i_gid_read(inode));
ri->i_links = cpu_to_le32(inode->i_nlink);
- ri->i_size = cpu_to_le64(i_size_read(inode));
ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1);
+ if (!f2fs_is_atomic_file(inode) ||
+ is_inode_flag_set(inode, FI_ATOMIC_COMMITTED))
+ ri->i_size = cpu_to_le64(i_size_read(inode));
+
if (et) {
read_lock(&et->lock);
- set_raw_extent(&et->largest, &ri->i_ext);
+ set_raw_read_extent(&et->largest, &ri->i_ext);
read_unlock(&et->lock);
} else {
memset(&ri->i_ext, 0, sizeof(ri->i_ext));
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index c227113b0f26..6032589099ce 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -22,137 +22,6 @@
#include "acl.h"
#include <trace/events/f2fs.h>
-static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
- struct inode *dir, umode_t mode)
-{
- struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- nid_t ino;
- struct inode *inode;
- bool nid_free = false;
- bool encrypt = false;
- int xattr_size = 0;
- int err;
-
- inode = new_inode(dir->i_sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- if (!f2fs_alloc_nid(sbi, &ino)) {
- err = -ENOSPC;
- goto fail;
- }
-
- nid_free = true;
-
- inode_init_owner(mnt_userns, inode, dir, mode);
-
- inode->i_ino = ino;
- inode->i_blocks = 0;
- inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
- F2FS_I(inode)->i_crtime = inode->i_mtime;
- inode->i_generation = get_random_u32();
-
- if (S_ISDIR(inode->i_mode))
- F2FS_I(inode)->i_current_depth = 1;
-
- err = insert_inode_locked(inode);
- if (err) {
- err = -EINVAL;
- goto fail;
- }
-
- if (f2fs_sb_has_project_quota(sbi) &&
- (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL))
- F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid;
- else
- F2FS_I(inode)->i_projid = make_kprojid(mnt_userns,
- F2FS_DEF_PROJID);
-
- err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
- if (err)
- goto fail_drop;
-
- err = f2fs_dquot_initialize(inode);
- if (err)
- goto fail_drop;
-
- set_inode_flag(inode, FI_NEW_INODE);
-
- if (encrypt)
- f2fs_set_encrypted_inode(inode);
-
- if (f2fs_sb_has_extra_attr(sbi)) {
- set_inode_flag(inode, FI_EXTRA_ATTR);
- F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE;
- }
-
- if (test_opt(sbi, INLINE_XATTR))
- set_inode_flag(inode, FI_INLINE_XATTR);
-
- if (f2fs_may_inline_dentry(inode))
- set_inode_flag(inode, FI_INLINE_DENTRY);
-
- if (f2fs_sb_has_flexible_inline_xattr(sbi)) {
- f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
- if (f2fs_has_inline_xattr(inode))
- xattr_size = F2FS_OPTION(sbi).inline_xattr_size;
- /* Otherwise, will be 0 */
- } else if (f2fs_has_inline_xattr(inode) ||
- f2fs_has_inline_dentry(inode)) {
- xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
- }
- F2FS_I(inode)->i_inline_xattr_size = xattr_size;
-
- f2fs_init_extent_tree(inode, NULL);
-
- F2FS_I(inode)->i_flags =
- f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED);
-
- if (S_ISDIR(inode->i_mode))
- F2FS_I(inode)->i_flags |= F2FS_INDEX_FL;
-
- if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL)
- set_inode_flag(inode, FI_PROJ_INHERIT);
-
- if (f2fs_sb_has_compression(sbi)) {
- /* Inherit the compression flag in directory */
- if ((F2FS_I(dir)->i_flags & F2FS_COMPR_FL) &&
- f2fs_may_compress(inode))
- set_compress_context(inode);
- }
-
- /* Should enable inline_data after compression set */
- if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
- set_inode_flag(inode, FI_INLINE_DATA);
-
- stat_inc_inline_xattr(inode);
- stat_inc_inline_inode(inode);
- stat_inc_inline_dir(inode);
-
- f2fs_set_inode_flags(inode);
-
- trace_f2fs_new_inode(inode, 0);
- return inode;
-
-fail:
- trace_f2fs_new_inode(inode, err);
- make_bad_inode(inode);
- if (nid_free)
- set_inode_flag(inode, FI_FREE_NID);
- iput(inode);
- return ERR_PTR(err);
-fail_drop:
- trace_f2fs_new_inode(inode, err);
- dquot_drop(inode);
- inode->i_flags |= S_NOQUOTA;
- if (nid_free)
- set_inode_flag(inode, FI_FREE_NID);
- clear_nlink(inode);
- unlock_new_inode(inode);
- iput(inode);
- return ERR_PTR(err);
-}
-
static inline int is_extension_exist(const unsigned char *s, const char *sub,
bool tmp_ext)
{
@@ -187,36 +56,6 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub,
return 0;
}
-/*
- * Set file's temperature for hot/cold data separation
- */
-static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode,
- const unsigned char *name)
-{
- __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
- int i, cold_count, hot_count;
-
- f2fs_down_read(&sbi->sb_lock);
-
- cold_count = le32_to_cpu(sbi->raw_super->extension_count);
- hot_count = sbi->raw_super->hot_ext_count;
-
- for (i = 0; i < cold_count + hot_count; i++) {
- if (is_extension_exist(name, extlist[i], true))
- break;
- }
-
- f2fs_up_read(&sbi->sb_lock);
-
- if (i == cold_count + hot_count)
- return;
-
- if (i < cold_count)
- file_set_cold(inode);
- else
- file_set_hot(inode);
-}
-
int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
bool hot, bool set)
{
@@ -283,56 +122,215 @@ int f2fs_update_extension_list(struct f2fs_sb_info *sbi, const char *name,
return 0;
}
-static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode,
- const unsigned char *name)
+static void set_compress_new_inode(struct f2fs_sb_info *sbi, struct inode *dir,
+ struct inode *inode, const unsigned char *name)
{
__u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
- unsigned char (*noext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).noextensions;
+ unsigned char (*noext)[F2FS_EXTENSION_LEN] =
+ F2FS_OPTION(sbi).noextensions;
unsigned char (*ext)[F2FS_EXTENSION_LEN] = F2FS_OPTION(sbi).extensions;
unsigned char ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
unsigned char noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
int i, cold_count, hot_count;
- if (!f2fs_sb_has_compression(sbi) ||
- F2FS_I(inode)->i_flags & F2FS_NOCOMP_FL ||
- !f2fs_may_compress(inode) ||
- (!ext_cnt && !noext_cnt))
+ if (!f2fs_sb_has_compression(sbi))
return;
- f2fs_down_read(&sbi->sb_lock);
+ if (S_ISDIR(inode->i_mode))
+ goto inherit_comp;
+ /* This name comes only from normal files. */
+ if (!name)
+ return;
+
+ /* Don't compress hot files. */
+ f2fs_down_read(&sbi->sb_lock);
cold_count = le32_to_cpu(sbi->raw_super->extension_count);
hot_count = sbi->raw_super->hot_ext_count;
+ for (i = cold_count; i < cold_count + hot_count; i++)
+ if (is_extension_exist(name, extlist[i], false))
+ break;
+ f2fs_up_read(&sbi->sb_lock);
+ if (i < (cold_count + hot_count))
+ return;
+
+ /* Don't compress unallowed extension. */
+ for (i = 0; i < noext_cnt; i++)
+ if (is_extension_exist(name, noext[i], false))
+ return;
- for (i = cold_count; i < cold_count + hot_count; i++) {
- if (is_extension_exist(name, extlist[i], false)) {
- f2fs_up_read(&sbi->sb_lock);
+ /* Compress wanting extension. */
+ for (i = 0; i < ext_cnt; i++) {
+ if (is_extension_exist(name, ext[i], false)) {
+ set_compress_context(inode);
return;
}
}
+inherit_comp:
+ /* Inherit the {no-}compression flag in directory */
+ if (F2FS_I(dir)->i_flags & F2FS_NOCOMP_FL) {
+ F2FS_I(inode)->i_flags |= F2FS_NOCOMP_FL;
+ f2fs_mark_inode_dirty_sync(inode, true);
+ } else if (F2FS_I(dir)->i_flags & F2FS_COMPR_FL) {
+ set_compress_context(inode);
+ }
+}
+
+/*
+ * Set file's temperature for hot/cold data separation
+ */
+static void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode,
+ const unsigned char *name)
+{
+ __u8 (*extlist)[F2FS_EXTENSION_LEN] = sbi->raw_super->extension_list;
+ int i, cold_count, hot_count;
+ f2fs_down_read(&sbi->sb_lock);
+ cold_count = le32_to_cpu(sbi->raw_super->extension_count);
+ hot_count = sbi->raw_super->hot_ext_count;
+ for (i = 0; i < cold_count + hot_count; i++)
+ if (is_extension_exist(name, extlist[i], true))
+ break;
f2fs_up_read(&sbi->sb_lock);
- for (i = 0; i < noext_cnt; i++) {
- if (is_extension_exist(name, noext[i], false)) {
- f2fs_disable_compressed_file(inode);
- return;
- }
+ if (i == cold_count + hot_count)
+ return;
+
+ if (i < cold_count)
+ file_set_cold(inode);
+ else
+ file_set_hot(inode);
+}
+
+static struct inode *f2fs_new_inode(struct user_namespace *mnt_userns,
+ struct inode *dir, umode_t mode,
+ const char *name)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+ nid_t ino;
+ struct inode *inode;
+ bool nid_free = false;
+ bool encrypt = false;
+ int xattr_size = 0;
+ int err;
+
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (!f2fs_alloc_nid(sbi, &ino)) {
+ err = -ENOSPC;
+ goto fail;
}
- if (is_inode_flag_set(inode, FI_COMPRESSED_FILE))
- return;
+ nid_free = true;
- for (i = 0; i < ext_cnt; i++) {
- if (!is_extension_exist(name, ext[i], false))
- continue;
+ inode_init_owner(mnt_userns, inode, dir, mode);
- /* Do not use inline_data with compression */
- stat_dec_inline_inode(inode);
- clear_inode_flag(inode, FI_INLINE_DATA);
- set_compress_context(inode);
- return;
+ inode->i_ino = ino;
+ inode->i_blocks = 0;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
+ F2FS_I(inode)->i_crtime = inode->i_mtime;
+ inode->i_generation = get_random_u32();
+
+ if (S_ISDIR(inode->i_mode))
+ F2FS_I(inode)->i_current_depth = 1;
+
+ err = insert_inode_locked(inode);
+ if (err) {
+ err = -EINVAL;
+ goto fail;
+ }
+
+ if (f2fs_sb_has_project_quota(sbi) &&
+ (F2FS_I(dir)->i_flags & F2FS_PROJINHERIT_FL))
+ F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid;
+ else
+ F2FS_I(inode)->i_projid = make_kprojid(mnt_userns,
+ F2FS_DEF_PROJID);
+
+ err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
+ if (err)
+ goto fail_drop;
+
+ err = f2fs_dquot_initialize(inode);
+ if (err)
+ goto fail_drop;
+
+ set_inode_flag(inode, FI_NEW_INODE);
+
+ if (encrypt)
+ f2fs_set_encrypted_inode(inode);
+
+ if (f2fs_sb_has_extra_attr(sbi)) {
+ set_inode_flag(inode, FI_EXTRA_ATTR);
+ F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE;
+ }
+
+ if (test_opt(sbi, INLINE_XATTR))
+ set_inode_flag(inode, FI_INLINE_XATTR);
+
+ if (f2fs_may_inline_dentry(inode))
+ set_inode_flag(inode, FI_INLINE_DENTRY);
+
+ if (f2fs_sb_has_flexible_inline_xattr(sbi)) {
+ f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
+ if (f2fs_has_inline_xattr(inode))
+ xattr_size = F2FS_OPTION(sbi).inline_xattr_size;
+ /* Otherwise, will be 0 */
+ } else if (f2fs_has_inline_xattr(inode) ||
+ f2fs_has_inline_dentry(inode)) {
+ xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
}
+ F2FS_I(inode)->i_inline_xattr_size = xattr_size;
+
+ F2FS_I(inode)->i_flags =
+ f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED);
+
+ if (S_ISDIR(inode->i_mode))
+ F2FS_I(inode)->i_flags |= F2FS_INDEX_FL;
+
+ if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL)
+ set_inode_flag(inode, FI_PROJ_INHERIT);
+
+ /* Check compression first. */
+ set_compress_new_inode(sbi, dir, inode, name);
+
+ /* Should enable inline_data after compression set */
+ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
+ set_inode_flag(inode, FI_INLINE_DATA);
+
+ if (name && !test_opt(sbi, DISABLE_EXT_IDENTIFY))
+ set_file_temperature(sbi, inode, name);
+
+ stat_inc_inline_xattr(inode);
+ stat_inc_inline_inode(inode);
+ stat_inc_inline_dir(inode);
+
+ f2fs_set_inode_flags(inode);
+
+ f2fs_init_extent_tree(inode);
+
+ trace_f2fs_new_inode(inode, 0);
+ return inode;
+
+fail:
+ trace_f2fs_new_inode(inode, err);
+ make_bad_inode(inode);
+ if (nid_free)
+ set_inode_flag(inode, FI_FREE_NID);
+ iput(inode);
+ return ERR_PTR(err);
+fail_drop:
+ trace_f2fs_new_inode(inode, err);
+ dquot_drop(inode);
+ inode->i_flags |= S_NOQUOTA;
+ if (nid_free)
+ set_inode_flag(inode, FI_FREE_NID);
+ clear_nlink(inode);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ERR_PTR(err);
}
static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
@@ -352,15 +350,10 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode, dentry->d_name.name);
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
- set_file_temperature(sbi, inode, dentry->d_name.name);
-
- set_compress_inode(sbi, inode, dentry->d_name.name);
-
inode->i_op = &f2fs_file_inode_operations;
inode->i_fop = &f2fs_file_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
@@ -632,6 +625,8 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
goto fail;
}
f2fs_delete_entry(de, page, dir, inode);
+ f2fs_unlock_op(sbi);
+
#if IS_ENABLED(CONFIG_UNICODE)
/* VFS negative dentries are incompatible with Encoding and
* Case-insensitiveness. Eventually we'll want avoid
@@ -642,8 +637,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
if (IS_CASEFOLDED(dir))
d_invalidate(dentry);
#endif
- f2fs_unlock_op(sbi);
-
if (IS_DIRSYNC(dir))
f2fs_sync_fs(sbi->sb, 1);
fail:
@@ -689,7 +682,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO);
+ inode = f2fs_new_inode(mnt_userns, dir, S_IFLNK | S_IRWXUGO, NULL);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -760,7 +753,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode);
+ inode = f2fs_new_inode(mnt_userns, dir, S_IFDIR | mode, NULL);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -817,7 +810,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode, NULL);
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -856,7 +849,7 @@ static int __f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
if (err)
return err;
- inode = f2fs_new_inode(mnt_userns, dir, mode);
+ inode = f2fs_new_inode(mnt_userns, dir, mode, NULL);
if (IS_ERR(inode))
return PTR_ERR(inode);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 983572f23896..dde4c0458704 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -60,7 +60,7 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
avail_ram = val.totalram - val.totalhigh;
/*
- * give 25%, 25%, 50%, 50%, 50% memory for each components respectively
+ * give 25%, 25%, 50%, 50%, 25%, 25% memory for each components respectively
*/
if (type == FREE_NIDS) {
mem_size = (nm_i->nid_cnt[FREE_NID] *
@@ -85,12 +85,16 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
sizeof(struct ino_entry);
mem_size >>= PAGE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
- } else if (type == EXTENT_CACHE) {
- mem_size = (atomic_read(&sbi->total_ext_tree) *
+ } else if (type == READ_EXTENT_CACHE || type == AGE_EXTENT_CACHE) {
+ enum extent_type etype = type == READ_EXTENT_CACHE ?
+ EX_READ : EX_BLOCK_AGE;
+ struct extent_tree_info *eti = &sbi->extent_tree[etype];
+
+ mem_size = (atomic_read(&eti->total_ext_tree) *
sizeof(struct extent_tree) +
- atomic_read(&sbi->total_ext_node) *
+ atomic_read(&eti->total_ext_node) *
sizeof(struct extent_node)) >> PAGE_SHIFT;
- res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+ res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
} else if (type == DISCARD_CACHE) {
mem_size = (atomic_read(&dcc->discard_cmd_cnt) *
sizeof(struct discard_cmd)) >> PAGE_SHIFT;
@@ -859,7 +863,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
blkaddr = data_blkaddr(dn->inode, dn->node_page,
dn->ofs_in_node + 1);
- f2fs_update_extent_tree_range_compressed(dn->inode,
+ f2fs_update_read_extent_tree_range_compressed(dn->inode,
index, blkaddr,
F2FS_I(dn->inode)->i_cluster_size,
c_len);
@@ -1360,8 +1364,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags)
return err;
/* NEW_ADDR can be seen, after cp_error drops some dirty node pages */
- if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) ||
- is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) {
+ if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR)) {
ClearPageUptodate(page);
return -ENOENT;
}
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 3c09cae058b0..99454d46a939 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -146,7 +146,8 @@ enum mem_type {
NAT_ENTRIES, /* indicates the cached nat entry */
DIRTY_DENTS, /* indicates dirty dentry pages */
INO_ENTRIES, /* indicates inode entries */
- EXTENT_CACHE, /* indicates extent cache */
+ READ_EXTENT_CACHE, /* indicates read extent cache */
+ AGE_EXTENT_CACHE, /* indicates age extent cache */
DISCARD_CACHE, /* indicates memory of cached discard cmds */
COMPRESS_PAGE, /* indicates memory of cached compressed pages */
BASE_CHECK, /* check kernel status */
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index dea95b48b647..77fd453949b1 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -923,9 +923,7 @@ int __init f2fs_create_recovery_cache(void)
{
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
sizeof(struct fsync_inode_entry));
- if (!fsync_entry_slab)
- return -ENOMEM;
- return 0;
+ return fsync_entry_slab ? 0 : -ENOMEM;
}
void f2fs_destroy_recovery_cache(void)
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index b304692c0cf5..ae3c4e5474ef 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -192,14 +192,19 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean)
if (!f2fs_is_atomic_file(inode))
return;
- if (clean)
- truncate_inode_pages_final(inode->i_mapping);
clear_inode_flag(fi->cow_inode, FI_COW_FILE);
iput(fi->cow_inode);
fi->cow_inode = NULL;
release_atomic_write_cnt(inode);
+ clear_inode_flag(inode, FI_ATOMIC_COMMITTED);
+ clear_inode_flag(inode, FI_ATOMIC_REPLACE);
clear_inode_flag(inode, FI_ATOMIC_FILE);
stat_dec_atomic_inode(inode);
+
+ if (clean) {
+ truncate_inode_pages_final(inode->i_mapping);
+ f2fs_i_size_write(inode, fi->original_i_size);
+ }
}
static int __replace_atomic_write_block(struct inode *inode, pgoff_t index,
@@ -257,14 +262,19 @@ static void __complete_revoke_list(struct inode *inode, struct list_head *head,
bool revoke)
{
struct revoke_entry *cur, *tmp;
+ bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE);
list_for_each_entry_safe(cur, tmp, head, list) {
if (revoke)
__replace_atomic_write_block(inode, cur->index,
cur->old_addr, NULL, true);
+
list_del(&cur->list);
kmem_cache_free(revoke_entry_slab, cur);
}
+
+ if (!revoke && truncate)
+ f2fs_do_truncate_blocks(inode, 0, false);
}
static int __f2fs_commit_atomic_write(struct inode *inode)
@@ -335,10 +345,12 @@ next:
}
out:
- if (ret)
+ if (ret) {
sbi->revoked_atomic_block += fi->atomic_write_cnt;
- else
+ } else {
sbi->committed_atomic_block += fi->atomic_write_cnt;
+ set_inode_flag(inode, FI_ATOMIC_COMMITTED);
+ }
__complete_revoke_list(inode, &revoke_list, ret ? true : false);
@@ -437,8 +449,14 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
return;
/* try to shrink extent cache when there is no enough memory */
- if (!f2fs_available_free_memory(sbi, EXTENT_CACHE))
- f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
+ if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE))
+ f2fs_shrink_read_extent_tree(sbi,
+ READ_EXTENT_CACHE_SHRINK_NUMBER);
+
+ /* try to shrink age extent cache when there is no enough memory */
+ if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE))
+ f2fs_shrink_age_extent_tree(sbi,
+ AGE_EXTENT_CACHE_SHRINK_NUMBER);
/* check the # of cached NAT entries */
if (!f2fs_available_free_memory(sbi, NAT_ENTRIES))
@@ -620,12 +638,11 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi)
{
dev_t dev = sbi->sb->s_bdev->bd_dev;
struct flush_cmd_control *fcc;
- int err = 0;
if (SM_I(sbi)->fcc_info) {
fcc = SM_I(sbi)->fcc_info;
if (fcc->f2fs_issue_flush)
- return err;
+ return 0;
goto init_thread;
}
@@ -638,19 +655,19 @@ int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi)
init_llist_head(&fcc->issue_list);
SM_I(sbi)->fcc_info = fcc;
if (!test_opt(sbi, FLUSH_MERGE))
- return err;
+ return 0;
init_thread:
fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
"f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
if (IS_ERR(fcc->f2fs_issue_flush)) {
- err = PTR_ERR(fcc->f2fs_issue_flush);
- kfree(fcc);
- SM_I(sbi)->fcc_info = NULL;
+ int err = PTR_ERR(fcc->f2fs_issue_flush);
+
+ fcc->f2fs_issue_flush = NULL;
return err;
}
- return err;
+ return 0;
}
void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free)
@@ -856,7 +873,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi)
}
mutex_unlock(&dirty_i->seglist_lock);
- unusable = holes[DATA] > holes[NODE] ? holes[DATA] : holes[NODE];
+ unusable = max(holes[DATA], holes[NODE]);
if (unusable > ovp_holes)
return unusable - ovp_holes;
return 0;
@@ -1052,8 +1069,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->io_aware = true;
dpolicy->sync = false;
dpolicy->ordered = true;
- if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) {
- dpolicy->granularity = 1;
+ if (utilization(sbi) > dcc->discard_urgent_util) {
+ dpolicy->granularity = MIN_DISCARD_GRANULARITY;
if (atomic_read(&dcc->discard_cmd_cnt))
dpolicy->max_interval =
dcc->min_discard_issue_time;
@@ -1068,7 +1085,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
} else if (discard_type == DPOLICY_UMOUNT) {
dpolicy->io_aware = false;
/* we need to issue all to keep CP_TRIMMED_FLAG */
- dpolicy->granularity = 1;
+ dpolicy->granularity = MIN_DISCARD_GRANULARITY;
dpolicy->timeout = true;
}
}
@@ -1126,13 +1143,12 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
if (time_to_inject(sbi, FAULT_DISCARD)) {
f2fs_show_injection_info(sbi, FAULT_DISCARD);
err = -EIO;
- goto submit;
- }
- err = __blkdev_issue_discard(bdev,
+ } else {
+ err = __blkdev_issue_discard(bdev,
SECTOR_FROM_BLOCK(start),
SECTOR_FROM_BLOCK(len),
GFP_NOFS, &bio);
-submit:
+ }
if (err) {
spin_lock_irqsave(&dc->lock, flags);
if (dc->state == D_PARTIAL)
@@ -1170,7 +1186,7 @@ submit:
atomic_inc(&dcc->issued_discard);
- f2fs_update_iostat(sbi, NULL, FS_DISCARD, 1);
+ f2fs_update_iostat(sbi, NULL, FS_DISCARD, len * F2FS_BLKSIZE);
lstart += len;
start += len;
@@ -1342,13 +1358,13 @@ static void __update_discard_tree_range(struct f2fs_sb_info *sbi,
}
}
-static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
+static void __queue_discard_cmd(struct f2fs_sb_info *sbi,
struct block_device *bdev, block_t blkstart, block_t blklen)
{
block_t lblkstart = blkstart;
if (!f2fs_bdev_support_discard(bdev))
- return 0;
+ return;
trace_f2fs_queue_discard(bdev, blkstart, blklen);
@@ -1360,7 +1376,6 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock);
__update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen);
mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock);
- return 0;
}
static unsigned int __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi,
@@ -1448,7 +1463,7 @@ retry:
if (i + 1 < dpolicy->granularity)
break;
- if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered)
+ if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered)
return __issue_discard_cmd_orderly(sbi, dpolicy);
pend_list = &dcc->pend_list[i];
@@ -1645,6 +1660,9 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
struct discard_policy dpolicy;
bool dropped;
+ if (!atomic_read(&dcc->discard_cmd_cnt))
+ return false;
+
__init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
dcc->discard_granularity);
__issue_discard_cmd(sbi, &dpolicy);
@@ -1669,6 +1687,11 @@ static int issue_discard_thread(void *data)
set_freezable();
do {
+ wait_event_interruptible_timeout(*q,
+ kthread_should_stop() || freezing(current) ||
+ dcc->discard_wake,
+ msecs_to_jiffies(wait_ms));
+
if (sbi->gc_mode == GC_URGENT_HIGH ||
!f2fs_available_free_memory(sbi, DISCARD_CACHE))
__init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1);
@@ -1676,14 +1699,6 @@ static int issue_discard_thread(void *data)
__init_discard_policy(sbi, &dpolicy, DPOLICY_BG,
dcc->discard_granularity);
- if (!atomic_read(&dcc->discard_cmd_cnt))
- wait_ms = dpolicy.max_interval;
-
- wait_event_interruptible_timeout(*q,
- kthread_should_stop() || freezing(current) ||
- dcc->discard_wake,
- msecs_to_jiffies(wait_ms));
-
if (dcc->discard_wake)
dcc->discard_wake = 0;
@@ -1697,12 +1712,11 @@ static int issue_discard_thread(void *data)
continue;
if (kthread_should_stop())
return 0;
- if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) ||
+ !atomic_read(&dcc->discard_cmd_cnt)) {
wait_ms = dpolicy.max_interval;
continue;
}
- if (!atomic_read(&dcc->discard_cmd_cnt))
- continue;
sb_start_intwrite(sbi->sb);
@@ -1717,6 +1731,8 @@ static int issue_discard_thread(void *data)
} else {
wait_ms = dpolicy.max_interval;
}
+ if (!atomic_read(&dcc->discard_cmd_cnt))
+ wait_ms = dpolicy.max_interval;
sb_end_intwrite(sbi->sb);
@@ -1760,7 +1776,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
}
/* For conventional zones, use regular discard if supported */
- return __queue_discard_cmd(sbi, bdev, lblkstart, blklen);
+ __queue_discard_cmd(sbi, bdev, lblkstart, blklen);
+ return 0;
}
#endif
@@ -1771,7 +1788,8 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi,
if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev))
return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen);
#endif
- return __queue_discard_cmd(sbi, bdev, blkstart, blklen);
+ __queue_discard_cmd(sbi, bdev, blkstart, blklen);
+ return 0;
}
static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
@@ -2025,8 +2043,10 @@ int f2fs_start_discard_thread(struct f2fs_sb_info *sbi)
dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi,
"f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev));
- if (IS_ERR(dcc->f2fs_issue_discard))
+ if (IS_ERR(dcc->f2fs_issue_discard)) {
err = PTR_ERR(dcc->f2fs_issue_discard);
+ dcc->f2fs_issue_discard = NULL;
+ }
return err;
}
@@ -2046,6 +2066,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
return -ENOMEM;
dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
+ dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY;
if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
dcc->discard_granularity = sbi->blocks_per_seg;
else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
@@ -2066,6 +2087,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME;
dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME;
dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME;
+ dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL;
dcc->undiscard_blks = 0;
dcc->next_pos = 0;
dcc->root = RB_ROOT_CACHED;
@@ -2096,8 +2118,7 @@ static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi)
* Recovery can cache discard commands, so in error path of
* fill_super(), it needs to give a chance to handle them.
*/
- if (unlikely(atomic_read(&dcc->discard_cmd_cnt)))
- f2fs_issue_discard_timeout(sbi);
+ f2fs_issue_discard_timeout(sbi);
kfree(dcc);
SM_I(sbi)->dcc_info = NULL;
@@ -2642,7 +2663,7 @@ bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno)
* This function always allocates a used segment(from dirty seglist) by SSR
* manner, so it should recover the existing segment information of valid blocks
*/
-static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush)
+static void change_curseg(struct f2fs_sb_info *sbi, int type)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -2650,9 +2671,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush)
struct f2fs_summary_block *sum_node;
struct page *sum_page;
- if (flush)
- write_sum_page(sbi, curseg->sum_blk,
- GET_SUM_BLOCK(sbi, curseg->segno));
+ write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno));
__set_test_and_inuse(sbi, new_segno);
@@ -2691,7 +2710,7 @@ static void get_atssr_segment(struct f2fs_sb_info *sbi, int type,
struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno);
curseg->seg_type = se->type;
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
} else {
/* allocate cold segment by default */
curseg->seg_type = CURSEG_COLD_DATA;
@@ -2835,31 +2854,20 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type,
return 0;
}
-/*
- * flush out current segment and replace it with new segment
- * This function should be returned with success, otherwise BUG
- */
-static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
- int type, bool force)
+static bool need_new_seg(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
- if (force)
- new_curseg(sbi, type, true);
- else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
- curseg->seg_type == CURSEG_WARM_NODE)
- new_curseg(sbi, type, false);
- else if (curseg->alloc_type == LFS &&
- is_next_segment_free(sbi, curseg, type) &&
- likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
- new_curseg(sbi, type, false);
- else if (f2fs_need_SSR(sbi) &&
- get_ssr_segment(sbi, type, SSR, 0))
- change_curseg(sbi, type, true);
- else
- new_curseg(sbi, type, false);
-
- stat_inc_seg_type(sbi, curseg);
+ if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) &&
+ curseg->seg_type == CURSEG_WARM_NODE)
+ return true;
+ if (curseg->alloc_type == LFS &&
+ is_next_segment_free(sbi, curseg, type) &&
+ likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
+ return true;
+ if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0))
+ return true;
+ return false;
}
void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
@@ -2877,7 +2885,7 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
goto unlock;
if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0))
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
else
new_curseg(sbi, type, true);
@@ -2912,7 +2920,8 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
return;
alloc:
old_segno = curseg->segno;
- SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
+ new_curseg(sbi, type, true);
+ stat_inc_seg_type(sbi, curseg);
locate_dirty_segment(sbi, old_segno);
}
@@ -2943,10 +2952,6 @@ void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
-static const struct segment_allocation default_salloc_ops = {
- .allocate_segment = allocate_segment_by_default,
-};
-
bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
struct cp_control *cpc)
{
@@ -3152,10 +3157,28 @@ static int __get_segment_type_4(struct f2fs_io_info *fio)
}
}
+static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct extent_info ei = {};
+
+ if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) {
+ if (!ei.age)
+ return NO_CHECK_TYPE;
+ if (ei.age <= sbi->hot_data_age_threshold)
+ return CURSEG_HOT_DATA;
+ if (ei.age <= sbi->warm_data_age_threshold)
+ return CURSEG_WARM_DATA;
+ return CURSEG_COLD_DATA;
+ }
+ return NO_CHECK_TYPE;
+}
+
static int __get_segment_type_6(struct f2fs_io_info *fio)
{
if (fio->type == DATA) {
struct inode *inode = fio->page->mapping->host;
+ int type;
if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
return CURSEG_COLD_DATA_PINNED;
@@ -3170,6 +3193,11 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
}
if (file_is_cold(inode) || f2fs_need_compress_data(inode))
return CURSEG_COLD_DATA;
+
+ type = __get_age_segment_type(inode, fio->page->index);
+ if (type != NO_CHECK_TYPE)
+ return type;
+
if (file_is_hot(inode) ||
is_inode_flag_set(inode, FI_HOT_DATA) ||
f2fs_is_cow_file(inode))
@@ -3266,11 +3294,19 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
update_sit_entry(sbi, old_blkaddr, -1);
if (!__has_curseg_space(sbi, curseg)) {
- if (from_gc)
+ /*
+ * Flush out current segment and replace it with new segment.
+ */
+ if (from_gc) {
get_atssr_segment(sbi, type, se->type,
AT_SSR, se->mtime);
- else
- sit_i->s_ops->allocate_segment(sbi, type, false);
+ } else {
+ if (need_new_seg(sbi, type))
+ new_curseg(sbi, type, false);
+ else
+ change_curseg(sbi, type);
+ stat_inc_seg_type(sbi, curseg);
+ }
}
/*
* segment dirty status should be updated after segment allocation,
@@ -3280,6 +3316,9 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr));
+ if (IS_DATASEG(type))
+ atomic64_inc(&sbi->allocated_data_blocks);
+
up_write(&sit_i->sentry_lock);
if (page && IS_NODESEG(type)) {
@@ -3407,6 +3446,8 @@ void f2fs_outplace_write_data(struct dnode_of_data *dn,
struct f2fs_summary sum;
f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
+ if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO)
+ f2fs_update_age_extent_cache(dn);
set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version);
do_write_page(&sum, fio);
f2fs_update_data_blkaddr(dn, fio->new_blkaddr);
@@ -3531,7 +3572,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
/* change the current segment */
if (segno != curseg->segno) {
curseg->next_segno = segno;
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
}
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
@@ -3559,7 +3600,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (recover_curseg) {
if (old_cursegno != curseg->segno) {
curseg->next_segno = old_cursegno;
- change_curseg(sbi, type, true);
+ change_curseg(sbi, type);
}
curseg->next_blkoff = old_blkoff;
curseg->alloc_type = old_alloc_type;
@@ -4256,9 +4297,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
return -ENOMEM;
#endif
- /* init SIT information */
- sit_i->s_ops = &default_salloc_ops;
-
sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
sit_i->written_valid_blocks = 0;
@@ -5099,11 +5137,9 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
init_f2fs_rwsem(&sm_info->curseg_lock);
- if (!f2fs_readonly(sbi->sb)) {
- err = f2fs_create_flush_cmd_control(sbi);
- if (err)
- return err;
- }
+ err = f2fs_create_flush_cmd_control(sbi);
+ if (err)
+ return err;
err = create_discard_cmd_control(sbi);
if (err)
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index be8f2d7d007b..3ad1b7b6fa94 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -222,10 +222,6 @@ struct sec_entry {
unsigned int valid_blocks; /* # of valid blocks in a section */
};
-struct segment_allocation {
- void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
-};
-
#define MAX_SKIP_GC_COUNT 16
struct revoke_entry {
@@ -235,8 +231,6 @@ struct revoke_entry {
};
struct sit_info {
- const struct segment_allocation *s_ops;
-
block_t sit_base_addr; /* start block address of SIT area */
block_t sit_blocks; /* # of blocks used by SIT area */
block_t written_valid_blocks; /* # of valid blocks in main area */
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index dd3c3c7a90ec..83d6fb97dcae 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -28,10 +28,13 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
return count > 0 ? count : 0;
}
-static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi)
+static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi,
+ enum extent_type type)
{
- return atomic_read(&sbi->total_zombie_tree) +
- atomic_read(&sbi->total_ext_node);
+ struct extent_tree_info *eti = &sbi->extent_tree[type];
+
+ return atomic_read(&eti->total_zombie_tree) +
+ atomic_read(&eti->total_ext_node);
}
unsigned long f2fs_shrink_count(struct shrinker *shrink,
@@ -53,8 +56,11 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink,
}
spin_unlock(&f2fs_list_lock);
- /* count extent cache entries */
- count += __count_extent_cache(sbi);
+ /* count read extent cache entries */
+ count += __count_extent_cache(sbi, EX_READ);
+
+ /* count block age extent cache entries */
+ count += __count_extent_cache(sbi, EX_BLOCK_AGE);
/* count clean nat cache entries */
count += __count_nat_entries(sbi);
@@ -100,7 +106,10 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink,
sbi->shrinker_run_no = run_no;
/* shrink extent cache entries */
- freed += f2fs_shrink_extent_tree(sbi, nr >> 1);
+ freed += f2fs_shrink_age_extent_tree(sbi, nr >> 2);
+
+ /* shrink read extent cache entries */
+ freed += f2fs_shrink_read_extent_tree(sbi, nr >> 2);
/* shrink clean nat cache entries */
if (freed < nr)
@@ -130,7 +139,9 @@ void f2fs_join_shrinker(struct f2fs_sb_info *sbi)
void f2fs_leave_shrinker(struct f2fs_sb_info *sbi)
{
- f2fs_shrink_extent_tree(sbi, __count_extent_cache(sbi));
+ f2fs_shrink_read_extent_tree(sbi, __count_extent_cache(sbi, EX_READ));
+ f2fs_shrink_age_extent_tree(sbi,
+ __count_extent_cache(sbi, EX_BLOCK_AGE));
spin_lock(&f2fs_list_lock);
list_del_init(&sbi->s_list);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3834ead04620..1f812b9ce985 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -61,6 +61,7 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_SLAB_ALLOC] = "slab alloc",
[FAULT_DQUOT_INIT] = "dquot initialize",
[FAULT_LOCK_OP] = "lock_op",
+ [FAULT_BLKADDR] = "invalid blkaddr",
};
void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
@@ -110,6 +111,7 @@ enum {
Opt_noinline_dentry,
Opt_flush_merge,
Opt_noflush_merge,
+ Opt_barrier,
Opt_nobarrier,
Opt_fastboot,
Opt_extent_cache,
@@ -161,6 +163,7 @@ enum {
Opt_nogc_merge,
Opt_discard_unit,
Opt_memory_mode,
+ Opt_age_extent_cache,
Opt_err,
};
@@ -186,6 +189,7 @@ static match_table_t f2fs_tokens = {
{Opt_noinline_dentry, "noinline_dentry"},
{Opt_flush_merge, "flush_merge"},
{Opt_noflush_merge, "noflush_merge"},
+ {Opt_barrier, "barrier"},
{Opt_nobarrier, "nobarrier"},
{Opt_fastboot, "fastboot"},
{Opt_extent_cache, "extent_cache"},
@@ -238,6 +242,7 @@ static match_table_t f2fs_tokens = {
{Opt_nogc_merge, "nogc_merge"},
{Opt_discard_unit, "discard_unit=%s"},
{Opt_memory_mode, "memory=%s"},
+ {Opt_age_extent_cache, "age_extent_cache"},
{Opt_err, NULL},
};
@@ -285,9 +290,7 @@ static int __init f2fs_create_casefold_cache(void)
{
f2fs_cf_name_slab = f2fs_kmem_cache_create("f2fs_casefolded_name",
F2FS_NAME_LEN);
- if (!f2fs_cf_name_slab)
- return -ENOMEM;
- return 0;
+ return f2fs_cf_name_slab ? 0 : -ENOMEM;
}
static void f2fs_destroy_casefold_cache(void)
@@ -806,14 +809,17 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
case Opt_nobarrier:
set_opt(sbi, NOBARRIER);
break;
+ case Opt_barrier:
+ clear_opt(sbi, NOBARRIER);
+ break;
case Opt_fastboot:
set_opt(sbi, FASTBOOT);
break;
case Opt_extent_cache:
- set_opt(sbi, EXTENT_CACHE);
+ set_opt(sbi, READ_EXTENT_CACHE);
break;
case Opt_noextent_cache:
- clear_opt(sbi, EXTENT_CACHE);
+ clear_opt(sbi, READ_EXTENT_CACHE);
break;
case Opt_noinline_data:
clear_opt(sbi, INLINE_DATA);
@@ -1253,6 +1259,9 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
}
kfree(name);
break;
+ case Opt_age_extent_cache:
+ set_opt(sbi, AGE_EXTENT_CACHE);
+ break;
default:
f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
p);
@@ -1347,6 +1356,11 @@ default_check:
return -EINVAL;
}
+ if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) {
+ f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode");
+ return -EINVAL;
+ }
+
if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) {
f2fs_err(sbi, "Allow to mount readonly mode only");
return -EROFS;
@@ -1567,8 +1581,7 @@ static void f2fs_put_super(struct super_block *sb)
/* be sure to wait for any on-going discard commands */
dropped = f2fs_issue_discard_timeout(sbi);
- if ((f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) &&
- !sbi->discard_blks && !dropped) {
+ if (f2fs_realtime_discard_enable(sbi) && !sbi->discard_blks && !dropped) {
struct cp_control cpc = {
.reason = CP_UMOUNT | CP_TRIMMED,
};
@@ -1935,16 +1948,22 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",inline_dentry");
else
seq_puts(seq, ",noinline_dentry");
- if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE))
+ if (test_opt(sbi, FLUSH_MERGE))
seq_puts(seq, ",flush_merge");
+ else
+ seq_puts(seq, ",noflush_merge");
if (test_opt(sbi, NOBARRIER))
seq_puts(seq, ",nobarrier");
+ else
+ seq_puts(seq, ",barrier");
if (test_opt(sbi, FASTBOOT))
seq_puts(seq, ",fastboot");
- if (test_opt(sbi, EXTENT_CACHE))
+ if (test_opt(sbi, READ_EXTENT_CACHE))
seq_puts(seq, ",extent_cache");
else
seq_puts(seq, ",noextent_cache");
+ if (test_opt(sbi, AGE_EXTENT_CACHE))
+ seq_puts(seq, ",age_extent_cache");
if (test_opt(sbi, DATA_FLUSH))
seq_puts(seq, ",data_flush");
@@ -2043,7 +2062,11 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).active_logs = NR_CURSEG_PERSIST_TYPE;
F2FS_OPTION(sbi).inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
- F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
+ if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main) <=
+ SMALL_VOLUME_SEGMENTS)
+ F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
+ else
+ F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID);
@@ -2059,13 +2082,14 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, INLINE_XATTR);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
- set_opt(sbi, EXTENT_CACHE);
+ set_opt(sbi, READ_EXTENT_CACHE);
set_opt(sbi, NOHEAP);
clear_opt(sbi, DISABLE_CHECKPOINT);
set_opt(sbi, MERGE_CHECKPOINT);
F2FS_OPTION(sbi).unusable_cap = 0;
sbi->sb->s_flags |= SB_LAZYTIME;
- set_opt(sbi, FLUSH_MERGE);
+ if (!f2fs_is_readonly(sbi))
+ set_opt(sbi, FLUSH_MERGE);
if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi))
set_opt(sbi, DISCARD);
if (f2fs_sb_has_blkzoned(sbi)) {
@@ -2200,14 +2224,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
bool need_restart_ckpt = false, need_stop_ckpt = false;
bool need_restart_flush = false, need_stop_flush = false;
bool need_restart_discard = false, need_stop_discard = false;
- bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
+ bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE);
+ bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE);
bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT);
bool no_io_align = !F2FS_IO_ALIGNED(sbi);
bool no_atgc = !test_opt(sbi, ATGC);
bool no_discard = !test_opt(sbi, DISCARD);
bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE);
bool block_unit_discard = f2fs_block_unit_discard(sbi);
- struct discard_cmd_control *dcc;
#ifdef CONFIG_QUOTA
int i, j;
#endif
@@ -2290,11 +2314,17 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
}
/* disallow enable/disable extent_cache dynamically */
- if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) {
+ if (no_read_extent_cache == !!test_opt(sbi, READ_EXTENT_CACHE)) {
err = -EINVAL;
f2fs_warn(sbi, "switch extent_cache option is not allowed");
goto restore_opts;
}
+ /* disallow enable/disable age extent_cache dynamically */
+ if (no_age_extent_cache == !!test_opt(sbi, AGE_EXTENT_CACHE)) {
+ err = -EINVAL;
+ f2fs_warn(sbi, "switch age_extent_cache option is not allowed");
+ goto restore_opts;
+ }
if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) {
err = -EINVAL;
@@ -2388,10 +2418,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
goto restore_flush;
need_stop_discard = true;
} else {
- dcc = SM_I(sbi)->dcc_info;
f2fs_stop_discard_thread(sbi);
- if (atomic_read(&dcc->discard_cmd_cnt))
- f2fs_issue_discard_timeout(sbi);
+ f2fs_issue_discard_timeout(sbi);
need_restart_discard = true;
}
}
@@ -3616,7 +3644,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->seq_file_ra_mul = MIN_RA_MUL;
sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE;
sbi->max_fragment_hole = DEF_FRAGMENT_SIZE;
- spin_lock_init(&sbi->gc_urgent_high_lock);
+ spin_lock_init(&sbi->gc_remaining_trials_lock);
atomic64_set(&sbi->current_atomic_write, 0);
sbi->dir_level = DEF_DIR_LEVEL;
@@ -4056,18 +4084,16 @@ static int f2fs_setup_casefold(struct f2fs_sb_info *sbi)
static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi)
{
- struct f2fs_sm_info *sm_i = SM_I(sbi);
-
/* adjust parameters according to the volume size */
- if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) {
- F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
+ if (MAIN_SEGS(sbi) <= SMALL_VOLUME_SEGMENTS) {
if (f2fs_block_unit_discard(sbi))
- sm_i->dcc_info->discard_granularity = 1;
- sm_i->ipu_policy = 1 << F2FS_IPU_FORCE |
+ SM_I(sbi)->dcc_info->discard_granularity =
+ MIN_DISCARD_GRANULARITY;
+ SM_I(sbi)->ipu_policy = 1 << F2FS_IPU_FORCE |
1 << F2FS_IPU_HONOR_OPU_WRITE;
}
- sbi->readdir_ra = 1;
+ sbi->readdir_ra = true;
}
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
@@ -4095,6 +4121,24 @@ try_onemore:
sbi->sb = sb;
+ /* initialize locks within allocated memory */
+ init_f2fs_rwsem(&sbi->gc_lock);
+ mutex_init(&sbi->writepages);
+ init_f2fs_rwsem(&sbi->cp_global_sem);
+ init_f2fs_rwsem(&sbi->node_write);
+ init_f2fs_rwsem(&sbi->node_change);
+ spin_lock_init(&sbi->stat_lock);
+ init_f2fs_rwsem(&sbi->cp_rwsem);
+ init_f2fs_rwsem(&sbi->quota_sem);
+ init_waitqueue_head(&sbi->cp_wait);
+ spin_lock_init(&sbi->error_lock);
+
+ for (i = 0; i < NR_INODE_TYPE; i++) {
+ INIT_LIST_HEAD(&sbi->inode_list[i]);
+ spin_lock_init(&sbi->inode_lock[i]);
+ }
+ mutex_init(&sbi->flush_lock);
+
/* Load the checksum driver */
sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0);
if (IS_ERR(sbi->s_chksum_driver)) {
@@ -4118,6 +4162,8 @@ try_onemore:
sb->s_fs_info = sbi;
sbi->raw_super = raw_super;
+ memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS);
+
/* precompute checksum seed for metadata */
if (f2fs_sb_has_inode_chksum(sbi))
sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid,
@@ -4174,23 +4220,14 @@ try_onemore:
/* init f2fs-specific super block info */
sbi->valid_super_block = valid_super_block;
- init_f2fs_rwsem(&sbi->gc_lock);
- mutex_init(&sbi->writepages);
- init_f2fs_rwsem(&sbi->cp_global_sem);
- init_f2fs_rwsem(&sbi->node_write);
- init_f2fs_rwsem(&sbi->node_change);
/* disallow all the data/node/meta page writes */
set_sbi_flag(sbi, SBI_POR_DOING);
- spin_lock_init(&sbi->stat_lock);
err = f2fs_init_write_merge_io(sbi);
if (err)
goto free_bio_info;
- init_f2fs_rwsem(&sbi->cp_rwsem);
- init_f2fs_rwsem(&sbi->quota_sem);
- init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
err = f2fs_init_iostat(sbi);
@@ -4255,9 +4292,6 @@ try_onemore:
goto free_devices;
}
- spin_lock_init(&sbi->error_lock);
- memcpy(sbi->errors, raw_super->s_errors, MAX_F2FS_ERRORS);
-
sbi->total_valid_node_count =
le32_to_cpu(sbi->ckpt->valid_node_count);
percpu_counter_set(&sbi->total_valid_inode_count,
@@ -4271,12 +4305,6 @@ try_onemore:
limit_reserve_root(sbi);
adjust_unusable_cap_perc(sbi);
- for (i = 0; i < NR_INODE_TYPE; i++) {
- INIT_LIST_HEAD(&sbi->inode_list[i]);
- spin_lock_init(&sbi->inode_lock[i]);
- }
- mutex_init(&sbi->flush_lock);
-
f2fs_init_extent_cache_info(sbi);
f2fs_init_ino_entry_info(sbi);
@@ -4523,9 +4551,9 @@ free_nm:
f2fs_destroy_node_manager(sbi);
free_sm:
f2fs_destroy_segment_manager(sbi);
- f2fs_destroy_post_read_wq(sbi);
stop_ckpt_thread:
f2fs_stop_ckpt_thread(sbi);
+ f2fs_destroy_post_read_wq(sbi);
free_devices:
destroy_device_list(sbi);
kvfree(sbi->ckpt);
@@ -4626,9 +4654,7 @@ static int __init init_inodecache(void)
f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache",
sizeof(struct f2fs_inode_info), 0,
SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL);
- if (!f2fs_inode_cachep)
- return -ENOMEM;
- return 0;
+ return f2fs_inode_cachep ? 0 : -ENOMEM;
}
static void destroy_inodecache(void)
@@ -4693,7 +4719,7 @@ static int __init init_f2fs_fs(void)
goto free_iostat;
err = f2fs_init_bioset();
if (err)
- goto free_bio_enrty_cache;
+ goto free_bio_entry_cache;
err = f2fs_init_compress_mempool();
if (err)
goto free_bioset;
@@ -4710,7 +4736,7 @@ free_compress_mempool:
f2fs_destroy_compress_mempool();
free_bioset:
f2fs_destroy_bioset();
-free_bio_enrty_cache:
+free_bio_entry_cache:
f2fs_destroy_bio_entry_cache();
free_iostat:
f2fs_destroy_iostat_processing();
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index df27afd71ef4..83a366f3ee80 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -53,9 +53,9 @@ static const char *gc_mode_names[MAX_GC_MODE] = {
struct f2fs_attr {
struct attribute attr;
- ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
- ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *,
- const char *, size_t);
+ ssize_t (*show)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf);
+ ssize_t (*store)(struct f2fs_attr *a, struct f2fs_sb_info *sbi,
+ const char *buf, size_t len);
int struct_type;
int offset;
int id;
@@ -95,28 +95,28 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
static ssize_t dirty_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(dirty_segments(sbi)));
}
static ssize_t free_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(free_segments(sbi)));
}
static ssize_t ovp_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(overprovision_segments(sbi)));
}
static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(sbi->kbytes_written +
((f2fs_get_sectors_written(sbi) -
sbi->sectors_written_start) >> 1)));
@@ -125,13 +125,13 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
static ssize_t sb_status_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%lx\n", sbi->s_flag);
+ return sysfs_emit(buf, "%lx\n", sbi->s_flag);
}
static ssize_t cp_status_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags));
+ return sysfs_emit(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags));
}
static ssize_t pending_discard_show(struct f2fs_attr *a,
@@ -139,10 +139,16 @@ static ssize_t pending_discard_show(struct f2fs_attr *a,
{
if (!SM_I(sbi)->dcc_info)
return -EINVAL;
- return sprintf(buf, "%llu\n", (unsigned long long)atomic_read(
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read(
&SM_I(sbi)->dcc_info->discard_cmd_cnt));
}
+static ssize_t gc_mode_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", gc_mode_names[sbi->gc_mode]);
+}
+
static ssize_t features_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -199,7 +205,7 @@ static ssize_t features_show(struct f2fs_attr *a,
static ssize_t current_reserved_blocks_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%u\n", sbi->current_reserved_blocks);
+ return sysfs_emit(buf, "%u\n", sbi->current_reserved_blocks);
}
static ssize_t unusable_show(struct f2fs_attr *a,
@@ -211,7 +217,7 @@ static ssize_t unusable_show(struct f2fs_attr *a,
unusable = sbi->unusable_block_count;
else
unusable = f2fs_get_unusable_blocks(sbi);
- return sprintf(buf, "%llu\n", (unsigned long long)unusable);
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)unusable);
}
static ssize_t encoding_show(struct f2fs_attr *a,
@@ -226,13 +232,13 @@ static ssize_t encoding_show(struct f2fs_attr *a,
(sb->s_encoding->version >> 8) & 0xff,
sb->s_encoding->version & 0xff);
#endif
- return sprintf(buf, "(none)");
+ return sysfs_emit(buf, "(none)\n");
}
static ssize_t mounted_time_sec_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time);
+ return sysfs_emit(buf, "%llu\n", SIT_I(sbi)->mounted_time);
}
#ifdef CONFIG_F2FS_STAT_FS
@@ -241,7 +247,7 @@ static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a,
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(si->tot_blks -
(si->bg_data_blks + si->bg_node_blks)));
}
@@ -251,7 +257,7 @@ static ssize_t moved_blocks_background_show(struct f2fs_attr *a,
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
- return sprintf(buf, "%llu\n",
+ return sysfs_emit(buf, "%llu\n",
(unsigned long long)(si->bg_data_blks + si->bg_node_blks));
}
@@ -262,7 +268,7 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a,
si->dirty_count = dirty_segments(sbi);
f2fs_update_sit_info(sbi);
- return sprintf(buf, "%llu\n", (unsigned long long)(si->avg_vblocks));
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->avg_vblocks));
}
#endif
@@ -332,13 +338,8 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
return sysfs_emit(buf, "%u\n", sbi->compr_new_inode);
#endif
- if (!strcmp(a->attr.name, "gc_urgent"))
- return sysfs_emit(buf, "%s\n",
- gc_mode_names[sbi->gc_mode]);
-
if (!strcmp(a->attr.name, "gc_segment_mode"))
- return sysfs_emit(buf, "%s\n",
- gc_mode_names[sbi->gc_segment_mode]);
+ return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode);
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
return sysfs_emit(buf, "%u\n",
@@ -362,7 +363,7 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
ui = (unsigned int *)(ptr + a->offset);
- return sprintf(buf, "%u\n", *ui);
+ return sysfs_emit(buf, "%u\n", *ui);
}
static ssize_t __sbi_store(struct f2fs_attr *a,
@@ -483,14 +484,27 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "max_ordered_discard")) {
+ if (t == 0 || t > MAX_PLIST_NUM)
+ return -EINVAL;
+ if (!f2fs_block_unit_discard(sbi))
+ return -EINVAL;
+ *ui = t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "discard_urgent_util")) {
+ if (t > 100)
+ return -EINVAL;
+ *ui = t;
+ return count;
+ }
+
if (!strcmp(a->attr.name, "migration_granularity")) {
if (t == 0 || t > sbi->segs_per_sec)
return -EINVAL;
}
- if (!strcmp(a->attr.name, "trim_sections"))
- return -EINVAL;
-
if (!strcmp(a->attr.name, "gc_urgent")) {
if (t == 0) {
sbi->gc_mode = GC_NORMAL;
@@ -531,10 +545,10 @@ out:
return count;
}
- if (!strcmp(a->attr.name, "gc_urgent_high_remaining")) {
- spin_lock(&sbi->gc_urgent_high_lock);
- sbi->gc_urgent_high_remaining = t;
- spin_unlock(&sbi->gc_urgent_high_lock);
+ if (!strcmp(a->attr.name, "gc_remaining_trials")) {
+ spin_lock(&sbi->gc_remaining_trials_lock);
+ sbi->gc_remaining_trials = t;
+ spin_unlock(&sbi->gc_remaining_trials_lock);
return count;
}
@@ -649,6 +663,29 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "readdir_ra")) {
+ sbi->readdir_ra = !!t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "hot_data_age_threshold")) {
+ if (t == 0 || t >= sbi->warm_data_age_threshold)
+ return -EINVAL;
+ if (t == *ui)
+ return count;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "warm_data_age_threshold")) {
+ if (t == 0 || t <= sbi->hot_data_age_threshold)
+ return -EINVAL;
+ if (t == *ui)
+ return count;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -721,7 +758,7 @@ static void f2fs_sb_release(struct kobject *kobj)
static ssize_t f2fs_feature_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- return sprintf(buf, "supported\n");
+ return sysfs_emit(buf, "supported\n");
}
#define F2FS_FEATURE_RO_ATTR(_name) \
@@ -734,8 +771,8 @@ static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
if (F2FS_HAS_FEATURE(sbi, a->id))
- return sprintf(buf, "supported\n");
- return sprintf(buf, "unsupported\n");
+ return sysfs_emit(buf, "supported\n");
+ return sysfs_emit(buf, "unsupported\n");
}
#define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \
@@ -788,9 +825,10 @@ F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_request, max_discard_req
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, min_discard_issue_time, min_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, mid_discard_issue_time, mid_discard_issue_time);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_discard_issue_time, max_discard_issue_time);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_urgent_util, discard_urgent_util);
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity);
+F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_ordered_discard, max_ordered_discard);
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks);
-F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
@@ -825,7 +863,7 @@ F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
#endif
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, data_io_flag, data_io_flag);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag);
-F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_urgent_high_remaining, gc_urgent_high_remaining);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_remaining_trials, gc_remaining_trials);
F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio);
F2FS_GENERAL_RO_ATTR(dirty_segments);
F2FS_GENERAL_RO_ATTR(free_segments);
@@ -838,6 +876,7 @@ F2FS_GENERAL_RO_ATTR(encoding);
F2FS_GENERAL_RO_ATTR(mounted_time_sec);
F2FS_GENERAL_RO_ATTR(main_blkaddr);
F2FS_GENERAL_RO_ATTR(pending_discard);
+F2FS_GENERAL_RO_ATTR(gc_mode);
#ifdef CONFIG_F2FS_STAT_FS
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count);
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count);
@@ -902,6 +941,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, peak_atomic_write, peak_atomic_write);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, committed_atomic_block, committed_atomic_block);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, revoked_atomic_block, revoked_atomic_block);
+/* For block age extent cache */
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, hot_data_age_threshold, hot_data_age_threshold);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, warm_data_age_threshold, warm_data_age_threshold);
+
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_urgent_sleep_time),
@@ -917,9 +960,11 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(min_discard_issue_time),
ATTR_LIST(mid_discard_issue_time),
ATTR_LIST(max_discard_issue_time),
+ ATTR_LIST(discard_urgent_util),
ATTR_LIST(discard_granularity),
+ ATTR_LIST(max_ordered_discard),
ATTR_LIST(pending_discard),
- ATTR_LIST(batched_trim_sections),
+ ATTR_LIST(gc_mode),
ATTR_LIST(ipu_policy),
ATTR_LIST(min_ipu_util),
ATTR_LIST(min_fsync_blocks),
@@ -952,7 +997,7 @@ static struct attribute *f2fs_attrs[] = {
#endif
ATTR_LIST(data_io_flag),
ATTR_LIST(node_io_flag),
- ATTR_LIST(gc_urgent_high_remaining),
+ ATTR_LIST(gc_remaining_trials),
ATTR_LIST(ckpt_thread_ioprio),
ATTR_LIST(dirty_segments),
ATTR_LIST(free_segments),
@@ -995,6 +1040,8 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(peak_atomic_write),
ATTR_LIST(committed_atomic_block),
ATTR_LIST(revoked_atomic_block),
+ ATTR_LIST(hot_data_age_threshold),
+ ATTR_LIST(warm_data_age_threshold),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
@@ -1243,6 +1290,44 @@ static int __maybe_unused victim_bits_seq_show(struct seq_file *seq,
return 0;
}
+static int __maybe_unused discard_plist_seq_show(struct seq_file *seq,
+ void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+ int i, count;
+
+ seq_puts(seq, "Discard pend list(Show diacrd_cmd count on each entry, .:not exist):\n");
+ if (!f2fs_realtime_discard_enable(sbi))
+ return 0;
+
+ if (dcc) {
+ mutex_lock(&dcc->cmd_lock);
+ for (i = 0; i < MAX_PLIST_NUM; i++) {
+ struct list_head *pend_list;
+ struct discard_cmd *dc, *tmp;
+
+ if (i % 8 == 0)
+ seq_printf(seq, " %-3d", i);
+ count = 0;
+ pend_list = &dcc->pend_list[i];
+ list_for_each_entry_safe(dc, tmp, pend_list, list)
+ count++;
+ if (count)
+ seq_printf(seq, " %7d", count);
+ else
+ seq_puts(seq, " .");
+ if (i % 8 == 7)
+ seq_putc(seq, '\n');
+ }
+ seq_putc(seq, '\n');
+ mutex_unlock(&dcc->cmd_lock);
+ }
+
+ return 0;
+}
+
int __init f2fs_init_sysfs(void)
{
int ret;
@@ -1313,6 +1398,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
#endif
proc_create_single_data("victim_bits", 0444, sbi->s_proc,
victim_bits_seq_show, sb);
+ proc_create_single_data("discard_plist_info", 0444, sbi->s_proc,
+ discard_plist_seq_show, sb);
}
return 0;
put_feature_list_kobj:
@@ -1336,6 +1423,7 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
remove_proc_entry("segment_info", sbi->s_proc);
remove_proc_entry("segment_bits", sbi->s_proc);
remove_proc_entry("victim_bits", sbi->s_proc);
+ remove_proc_entry("discard_plist_info", sbi->s_proc);
remove_proc_entry(sbi->sb->s_id, f2fs_proc_root);
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9958d4020771..6fba5a52127b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -121,6 +121,7 @@ static bool inode_io_list_move_locked(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
+ WARN_ON_ONCE(inode->i_state & I_FREEING);
list_move(&inode->i_io_list, head);
@@ -280,6 +281,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
+ WARN_ON_ONCE(inode->i_state & I_FREEING);
inode->i_state &= ~I_SYNC_QUEUED;
if (wb != &wb->bdi->wb)
@@ -1129,6 +1131,7 @@ static void inode_cgwb_move_to_attached(struct inode *inode,
{
assert_spin_locked(&wb->list_lock);
assert_spin_locked(&inode->i_lock);
+ WARN_ON_ONCE(inode->i_state & I_FREEING);
inode->i_state &= ~I_SYNC_QUEUED;
list_del_init(&inode->i_io_list);
@@ -1294,6 +1297,17 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
{
assert_spin_locked(&inode->i_lock);
+ inode->i_state &= ~I_SYNC_QUEUED;
+ /*
+ * When the inode is being freed just don't bother with dirty list
+ * tracking. Flush worker will ignore this inode anyway and it will
+ * trigger assertions in inode_io_list_move_locked().
+ */
+ if (inode->i_state & I_FREEING) {
+ list_del_init(&inode->i_io_list);
+ wb_io_lists_depopulated(wb);
+ return;
+ }
if (!list_empty(&wb->b_dirty)) {
struct inode *tail;
@@ -1302,7 +1316,6 @@ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
inode->dirtied_when = jiffies;
}
inode_io_list_move_locked(inode, wb, &wb->b_dirty);
- inode->i_state &= ~I_SYNC_QUEUED;
}
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
@@ -1345,8 +1358,6 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
return ret;
}
-#define EXPIRE_DIRTY_ATIME 0x0001
-
/*
* Move expired (dirtied before dirtied_before) dirty inodes from
* @delaying_queue to @dispatch_queue.
diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c
index a4850aee2639..ad670369955f 100644
--- a/fs/fuse/acl.c
+++ b/fs/fuse/acl.c
@@ -11,9 +11,10 @@
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
-struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu)
+static struct posix_acl *__fuse_get_acl(struct fuse_conn *fc,
+ struct user_namespace *mnt_userns,
+ struct inode *inode, int type, bool rcu)
{
- struct fuse_conn *fc = get_fuse_conn(inode);
int size;
const char *name;
void *value = NULL;
@@ -25,7 +26,7 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu)
if (fuse_is_bad(inode))
return ERR_PTR(-EIO);
- if (!fc->posix_acl || fc->no_getxattr)
+ if (fc->no_getxattr)
return NULL;
if (type == ACL_TYPE_ACCESS)
@@ -53,6 +54,46 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu)
return acl;
}
+static inline bool fuse_no_acl(const struct fuse_conn *fc,
+ const struct inode *inode)
+{
+ /*
+ * Refuse interacting with POSIX ACLs for daemons that
+ * don't support FUSE_POSIX_ACL and are not mounted on
+ * the host to retain backwards compatibility.
+ */
+ return !fc->posix_acl && (i_user_ns(inode) != &init_user_ns);
+}
+
+struct posix_acl *fuse_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type)
+{
+ struct inode *inode = d_inode(dentry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (fuse_no_acl(fc, inode))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ return __fuse_get_acl(fc, mnt_userns, inode, type, false);
+}
+
+struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ /*
+ * FUSE daemons before FUSE_POSIX_ACL was introduced could get and set
+ * POSIX ACLs without them being used for permission checking by the
+ * vfs. Retain that behavior for backwards compatibility as there are
+ * filesystems that do all permission checking for acls in the daemon
+ * and not in the kernel.
+ */
+ if (!fc->posix_acl)
+ return NULL;
+
+ return __fuse_get_acl(fc, &init_user_ns, inode, type, rcu);
+}
+
int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type)
{
@@ -64,7 +105,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
if (fuse_is_bad(inode))
return -EIO;
- if (!fc->posix_acl || fc->no_setxattr)
+ if (fc->no_setxattr || fuse_no_acl(fc, inode))
return -EOPNOTSUPP;
if (type == ACL_TYPE_ACCESS)
@@ -99,7 +140,13 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
return ret;
}
- if (!vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) &&
+ /*
+ * Fuse daemons without FUSE_POSIX_ACL never changed the passed
+ * through POSIX ACLs. Such daemons don't expect setgid bits to
+ * be stripped.
+ */
+ if (fc->posix_acl &&
+ !vfsgid_in_group_p(i_gid_into_vfsgid(&init_user_ns, inode)) &&
!capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID))
extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID;
@@ -108,8 +155,15 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
} else {
ret = fuse_removexattr(inode, name);
}
- forget_all_cached_acls(inode);
- fuse_invalidate_attr(inode);
+
+ if (fc->posix_acl) {
+ /*
+ * Fuse daemons without FUSE_POSIX_ACL never cached POSIX ACLs
+ * and didn't invalidate attributes. Retain that behavior.
+ */
+ forget_all_cached_acls(inode);
+ fuse_invalidate_attr(inode);
+ }
return ret;
}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index cd1a071b625a..2725fb54328e 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1942,7 +1942,8 @@ static const struct inode_operations fuse_dir_inode_operations = {
.permission = fuse_permission,
.getattr = fuse_getattr,
.listxattr = fuse_listxattr,
- .get_inode_acl = fuse_get_acl,
+ .get_inode_acl = fuse_get_inode_acl,
+ .get_acl = fuse_get_acl,
.set_acl = fuse_set_acl,
.fileattr_get = fuse_fileattr_get,
.fileattr_set = fuse_fileattr_set,
@@ -1964,7 +1965,8 @@ static const struct inode_operations fuse_common_inode_operations = {
.permission = fuse_permission,
.getattr = fuse_getattr,
.listxattr = fuse_listxattr,
- .get_inode_acl = fuse_get_acl,
+ .get_inode_acl = fuse_get_inode_acl,
+ .get_acl = fuse_get_acl,
.set_acl = fuse_set_acl,
.fileattr_get = fuse_fileattr_get,
.fileattr_set = fuse_fileattr_set,
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index c673faefdcb9..46797a171a84 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1264,11 +1264,11 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size);
int fuse_removexattr(struct inode *inode, const char *name);
extern const struct xattr_handler *fuse_xattr_handlers[];
-extern const struct xattr_handler *fuse_acl_xattr_handlers[];
-extern const struct xattr_handler *fuse_no_acl_xattr_handlers[];
struct posix_acl;
-struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu);
+struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu);
+struct posix_acl *fuse_get_acl(struct user_namespace *mnt_userns,
+ struct dentry *dentry, int type);
int fuse_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6b3beda16c1b..de9b9ec5ce81 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -311,7 +311,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
fuse_dax_dontcache(inode, attr->flags);
}
-static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
+static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr,
+ struct fuse_conn *fc)
{
inode->i_mode = attr->mode & S_IFMT;
inode->i_size = attr->size;
@@ -333,6 +334,12 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
new_decode_dev(attr->rdev));
} else
BUG();
+ /*
+ * Ensure that we don't cache acls for daemons without FUSE_POSIX_ACL
+ * so they see the exact same behavior as before.
+ */
+ if (!fc->posix_acl)
+ inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
}
static int fuse_inode_eq(struct inode *inode, void *_nodeidp)
@@ -372,7 +379,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
if (!inode)
return NULL;
- fuse_init_inode(inode, attr);
+ fuse_init_inode(inode, attr, fc);
get_fuse_inode(inode)->nodeid = nodeid;
inode->i_flags |= S_AUTOMOUNT;
goto done;
@@ -388,7 +395,7 @@ retry:
if (!fc->writeback_cache || !S_ISREG(attr->mode))
inode->i_flags |= S_NOCMTIME;
inode->i_generation = generation;
- fuse_init_inode(inode, attr);
+ fuse_init_inode(inode, attr, fc);
unlock_new_inode(inode);
} else if (fuse_stale_inode(inode, generation, attr)) {
/* nodeid was reused, any I/O on the old inode should fail */
@@ -1174,7 +1181,6 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
if ((flags & FUSE_POSIX_ACL)) {
fc->default_permissions = 1;
fc->posix_acl = 1;
- fm->sb->s_xattr = fuse_acl_xattr_handlers;
}
if (flags & FUSE_CACHE_SYMLINKS)
fc->cache_symlinks = 1;
@@ -1420,13 +1426,6 @@ static void fuse_sb_defaults(struct super_block *sb)
if (sb->s_user_ns != &init_user_ns)
sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
-
- /*
- * If we are not in the initial user namespace posix
- * acls must be translated.
- */
- if (sb->s_user_ns != &init_user_ns)
- sb->s_xattr = fuse_no_acl_xattr_handlers;
}
static int fuse_fill_super_submount(struct super_block *sb,
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 0d3e7177fce0..9fe571ab569e 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -203,27 +203,6 @@ static int fuse_xattr_set(const struct xattr_handler *handler,
return fuse_setxattr(inode, name, value, size, flags, 0);
}
-static bool no_xattr_list(struct dentry *dentry)
-{
- return false;
-}
-
-static int no_xattr_get(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, void *value, size_t size)
-{
- return -EOPNOTSUPP;
-}
-
-static int no_xattr_set(const struct xattr_handler *handler,
- struct user_namespace *mnt_userns,
- struct dentry *dentry, struct inode *nodee,
- const char *name, const void *value,
- size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
static const struct xattr_handler fuse_xattr_handler = {
.prefix = "",
.get = fuse_xattr_get,
@@ -234,33 +213,3 @@ const struct xattr_handler *fuse_xattr_handlers[] = {
&fuse_xattr_handler,
NULL
};
-
-const struct xattr_handler *fuse_acl_xattr_handlers[] = {
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
- &fuse_xattr_handler,
- NULL
-};
-
-static const struct xattr_handler fuse_no_acl_access_xattr_handler = {
- .name = XATTR_NAME_POSIX_ACL_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .list = no_xattr_list,
- .get = no_xattr_get,
- .set = no_xattr_set,
-};
-
-static const struct xattr_handler fuse_no_acl_default_xattr_handler = {
- .name = XATTR_NAME_POSIX_ACL_DEFAULT,
- .flags = ACL_TYPE_ACCESS,
- .list = no_xattr_list,
- .get = no_xattr_get,
- .set = no_xattr_set,
-};
-
-const struct xattr_handler *fuse_no_acl_xattr_handlers[] = {
- &fuse_no_acl_access_xattr_handler,
- &fuse_no_acl_default_xattr_handler,
- &fuse_xattr_handler,
- NULL
-};
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 05bee80ac7de..e782b4f1d104 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -427,8 +427,6 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
return error;
kaddr = kmap_atomic(page);
- if (dsize > gfs2_max_stuffed_size(ip))
- dsize = gfs2_max_stuffed_size(ip);
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
kunmap_atomic(kaddr);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3bdb2c668a71..e7537fd305dd 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -61,9 +61,6 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
void *kaddr = kmap(page);
u64 dsize = i_size_read(inode);
- if (dsize > gfs2_max_stuffed_size(ip))
- dsize = gfs2_max_stuffed_size(ip);
-
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
kunmap(page);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 60c6fb91fb58..eea5be4fbf0e 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1445,14 +1445,13 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh)
{
- struct gfs2_glock *gl = fl_gh->gh_gl;
+ struct gfs2_glock *gl = gfs2_glock_hold(fl_gh->gh_gl);
/*
* Make sure gfs2_glock_put() won't sleep under the file->f_lock
* spinlock.
*/
- gfs2_glock_hold(gl);
spin_lock(&file->f_lock);
gfs2_holder_uninit(fl_gh);
spin_unlock(&file->f_lock);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index df335c258eb0..524f3c96b9a4 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -186,10 +186,11 @@ void gfs2_glock_free(struct gfs2_glock *gl)
*
*/
-void gfs2_glock_hold(struct gfs2_glock *gl)
+struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl)
{
GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
lockref_get(&gl->gl_lockref);
+ return gl;
}
/**
@@ -205,12 +206,6 @@ static int demote_ok(const struct gfs2_glock *gl)
if (gl->gl_state == LM_ST_UNLOCKED)
return 0;
- /*
- * Note that demote_ok is used for the lru process of disposing of
- * glocks. For this purpose, we don't care if the glock's holders
- * have the HIF_MAY_DEMOTE flag set or not. If someone is using
- * them, don't demote.
- */
if (!list_empty(&gl->gl_holders))
return 0;
if (glops->go_demote_ok)
@@ -393,7 +388,7 @@ static void do_error(struct gfs2_glock *gl, const int ret)
struct gfs2_holder *gh, *tmp;
list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
- if (!test_bit(HIF_WAIT, &gh->gh_iflags))
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
if (ret & LM_OUT_ERROR)
gh->gh_error = -EIO;
@@ -408,45 +403,6 @@ static void do_error(struct gfs2_glock *gl, const int ret)
}
/**
- * demote_incompat_holders - demote incompatible demoteable holders
- * @gl: the glock we want to promote
- * @current_gh: the newly promoted holder
- *
- * We're passing the newly promoted holder in @current_gh, but actually, any of
- * the strong holders would do.
- */
-static void demote_incompat_holders(struct gfs2_glock *gl,
- struct gfs2_holder *current_gh)
-{
- struct gfs2_holder *gh, *tmp;
-
- /*
- * Demote incompatible holders before we make ourselves eligible.
- * (This holder may or may not allow auto-demoting, but we don't want
- * to demote the new holder before it's even granted.)
- */
- list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
- /*
- * Since holders are at the front of the list, we stop when we
- * find the first non-holder.
- */
- if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
- return;
- if (gh == current_gh)
- continue;
- if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
- !may_grant(gl, current_gh, gh)) {
- /*
- * We should not recurse into do_promote because
- * __gfs2_glock_dq only calls handle_callback,
- * gfs2_glock_add_to_lru and __gfs2_glock_queue_work.
- */
- __gfs2_glock_dq(gh);
- }
- }
-}
-
-/**
* find_first_holder - find the first "holder" gh
* @gl: the glock
*/
@@ -464,26 +420,6 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
return NULL;
}
-/**
- * find_first_strong_holder - find the first non-demoteable holder
- * @gl: the glock
- *
- * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set.
- */
-static inline struct gfs2_holder *
-find_first_strong_holder(struct gfs2_glock *gl)
-{
- struct gfs2_holder *gh;
-
- list_for_each_entry(gh, &gl->gl_holders, gh_list) {
- if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
- return NULL;
- if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
- return gh;
- }
- return NULL;
-}
-
/*
* gfs2_instantiate - Call the glops instantiate function
* @gh: The glock holder
@@ -540,9 +476,8 @@ done:
static int do_promote(struct gfs2_glock *gl)
{
struct gfs2_holder *gh, *current_gh;
- bool incompat_holders_demoted = false;
- current_gh = find_first_strong_holder(gl);
+ current_gh = find_first_holder(gl);
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (test_bit(HIF_HOLDER, &gh->gh_iflags))
continue;
@@ -561,11 +496,8 @@ static int do_promote(struct gfs2_glock *gl)
set_bit(HIF_HOLDER, &gh->gh_iflags);
trace_gfs2_promote(gh);
gfs2_holder_wake(gh);
- if (!incompat_holders_demoted) {
+ if (!current_gh)
current_gh = gh;
- demote_incompat_holders(gl, current_gh);
- incompat_holders_demoted = true;
- }
}
return 0;
}
@@ -927,6 +859,48 @@ out_unlock:
return;
}
+/**
+ * glock_set_object - set the gl_object field of a glock
+ * @gl: the glock
+ * @object: the object
+ */
+void glock_set_object(struct gfs2_glock *gl, void *object)
+{
+ void *prev_object;
+
+ spin_lock(&gl->gl_lockref.lock);
+ prev_object = gl->gl_object;
+ gl->gl_object = object;
+ spin_unlock(&gl->gl_lockref.lock);
+ if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) {
+ pr_warn("glock=%u/%llx\n",
+ gl->gl_name.ln_type,
+ (unsigned long long)gl->gl_name.ln_number);
+ gfs2_dump_glock(NULL, gl, true);
+ }
+}
+
+/**
+ * glock_clear_object - clear the gl_object field of a glock
+ * @gl: the glock
+ */
+void glock_clear_object(struct gfs2_glock *gl, void *object)
+{
+ void *prev_object;
+
+ spin_lock(&gl->gl_lockref.lock);
+ prev_object = gl->gl_object;
+ gl->gl_object = NULL;
+ spin_unlock(&gl->gl_lockref.lock);
+ if (gfs2_assert_warn(gl->gl_name.ln_sbd,
+ prev_object == object || prev_object == NULL)) {
+ pr_warn("glock=%u/%llx\n",
+ gl->gl_name.ln_type,
+ (unsigned long long)gl->gl_name.ln_number);
+ gfs2_dump_glock(NULL, gl, true);
+ }
+}
+
void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation)
{
struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr;
@@ -980,8 +954,6 @@ static bool gfs2_try_evict(struct gfs2_glock *gl)
ip = NULL;
spin_unlock(&gl->gl_lockref.lock);
if (ip) {
- struct gfs2_glock *inode_gl = NULL;
-
gl->gl_no_formal_ino = ip->i_no_formal_ino;
set_bit(GIF_DEFERRED_DELETE, &ip->i_flags);
d_prune_aliases(&ip->i_inode);
@@ -991,14 +963,14 @@ static bool gfs2_try_evict(struct gfs2_glock *gl)
spin_lock(&gl->gl_lockref.lock);
ip = gl->gl_object;
if (ip) {
- inode_gl = ip->i_gl;
- lockref_get(&inode_gl->gl_lockref);
clear_bit(GIF_DEFERRED_DELETE, &ip->i_flags);
+ if (!igrab(&ip->i_inode))
+ ip = NULL;
}
spin_unlock(&gl->gl_lockref.lock);
- if (inode_gl) {
- gfs2_glock_poke(inode_gl);
- gfs2_glock_put(inode_gl);
+ if (ip) {
+ gfs2_glock_poke(ip->i_gl);
+ iput(&ip->i_inode);
}
evicted = !ip;
}
@@ -1039,6 +1011,7 @@ static void delete_work_func(struct work_struct *work)
if (gfs2_queue_delete_work(gl, 5 * HZ))
return;
}
+ goto out;
}
inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino,
@@ -1051,6 +1024,7 @@ static void delete_work_func(struct work_struct *work)
d_prune_aliases(inode);
iput(inode);
}
+out:
gfs2_glock_put(gl);
}
@@ -1256,13 +1230,12 @@ void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
struct gfs2_holder *gh, unsigned long ip)
{
INIT_LIST_HEAD(&gh->gh_list);
- gh->gh_gl = gl;
+ gh->gh_gl = gfs2_glock_hold(gl);
gh->gh_ip = ip;
gh->gh_owner_pid = get_pid(task_pid(current));
gh->gh_state = state;
gh->gh_flags = flags;
gh->gh_iflags = 0;
- gfs2_glock_hold(gl);
}
/**
@@ -1496,7 +1469,7 @@ __acquires(&gl->gl_lockref.lock)
if (test_bit(GLF_LOCK, &gl->gl_flags)) {
struct gfs2_holder *current_gh;
- current_gh = find_first_strong_holder(gl);
+ current_gh = find_first_holder(gl);
try_futile = !may_grant(gl, current_gh, gh);
}
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
@@ -1508,8 +1481,6 @@ __acquires(&gl->gl_lockref.lock)
continue;
if (gh->gh_gl->gl_ops->go_type == LM_TYPE_FLOCK)
continue;
- if (test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags))
- continue;
if (!pid_is_meaningful(gh2))
continue;
goto trap_recursive;
@@ -1619,69 +1590,28 @@ static inline bool needs_demote(struct gfs2_glock *gl)
static void __gfs2_glock_dq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
unsigned delay = 0;
int fast_path = 0;
/*
- * This while loop is similar to function demote_incompat_holders:
- * If the glock is due to be demoted (which may be from another node
- * or even if this holder is GL_NOCACHE), the weak holders are
- * demoted as well, allowing the glock to be demoted.
+ * This holder should not be cached, so mark it for demote.
+ * Note: this should be done before the check for needs_demote
+ * below.
*/
- while (gh) {
- /*
- * If we're in the process of file system withdraw, we cannot
- * just dequeue any glocks until our journal is recovered, lest
- * we introduce file system corruption. We need two exceptions
- * to this rule: We need to allow unlocking of nondisk glocks
- * and the glock for our own journal that needs recovery.
- */
- if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
- glock_blocked_by_withdraw(gl) &&
- gh->gh_gl != sdp->sd_jinode_gl) {
- sdp->sd_glock_dqs_held++;
- spin_unlock(&gl->gl_lockref.lock);
- might_sleep();
- wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
- TASK_UNINTERRUPTIBLE);
- spin_lock(&gl->gl_lockref.lock);
- }
+ if (gh->gh_flags & GL_NOCACHE)
+ handle_callback(gl, LM_ST_UNLOCKED, 0, false);
- /*
- * This holder should not be cached, so mark it for demote.
- * Note: this should be done before the check for needs_demote
- * below.
- */
- if (gh->gh_flags & GL_NOCACHE)
- handle_callback(gl, LM_ST_UNLOCKED, 0, false);
-
- list_del_init(&gh->gh_list);
- clear_bit(HIF_HOLDER, &gh->gh_iflags);
- trace_gfs2_glock_queue(gh, 0);
+ list_del_init(&gh->gh_list);
+ clear_bit(HIF_HOLDER, &gh->gh_iflags);
+ trace_gfs2_glock_queue(gh, 0);
- /*
- * If there hasn't been a demote request we are done.
- * (Let the remaining holders, if any, keep holding it.)
- */
- if (!needs_demote(gl)) {
- if (list_empty(&gl->gl_holders))
- fast_path = 1;
- break;
- }
- /*
- * If we have another strong holder (we cannot auto-demote)
- * we are done. It keeps holding it until it is done.
- */
- if (find_first_strong_holder(gl))
- break;
-
- /*
- * If we have a weak holder at the head of the list, it
- * (and all others like it) must be auto-demoted. If there
- * are no more weak holders, we exit the while loop.
- */
- gh = find_first_holder(gl);
+ /*
+ * If there hasn't been a demote request we are done.
+ * (Let the remaining holders, if any, keep holding it.)
+ */
+ if (!needs_demote(gl)) {
+ if (list_empty(&gl->gl_holders))
+ fast_path = 1;
}
if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
@@ -1705,8 +1635,17 @@ static void __gfs2_glock_dq(struct gfs2_holder *gh)
void gfs2_glock_dq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
+ struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
spin_lock(&gl->gl_lockref.lock);
+ if (!gfs2_holder_queued(gh)) {
+ /*
+ * May have already been dequeued because the locking request
+ * was GL_ASYNC and it has failed in the meantime.
+ */
+ goto out;
+ }
+
if (list_is_first(&gh->gh_list, &gl->gl_holders) &&
!test_bit(HIF_HOLDER, &gh->gh_iflags)) {
spin_unlock(&gl->gl_lockref.lock);
@@ -1715,7 +1654,26 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
spin_lock(&gl->gl_lockref.lock);
}
+ /*
+ * If we're in the process of file system withdraw, we cannot just
+ * dequeue any glocks until our journal is recovered, lest we introduce
+ * file system corruption. We need two exceptions to this rule: We need
+ * to allow unlocking of nondisk glocks and the glock for our own
+ * journal that needs recovery.
+ */
+ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
+ glock_blocked_by_withdraw(gl) &&
+ gh->gh_gl != sdp->sd_jinode_gl) {
+ sdp->sd_glock_dqs_held++;
+ spin_unlock(&gl->gl_lockref.lock);
+ might_sleep();
+ wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
+ TASK_UNINTERRUPTIBLE);
+ spin_lock(&gl->gl_lockref.lock);
+ }
+
__gfs2_glock_dq(gh);
+out:
spin_unlock(&gl->gl_lockref.lock);
}
@@ -1888,33 +1846,6 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
delay = gl->gl_hold_time;
}
- /*
- * Note 1: We cannot call demote_incompat_holders from handle_callback
- * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq ->
- * handle_callback -> demote_incompat_holders -> gfs2_glock_dq
- * Plus, we only want to demote the holders if the request comes from
- * a remote cluster node because local holder conflicts are resolved
- * elsewhere.
- *
- * Note 2: if a remote node wants this glock in EX mode, lock_dlm will
- * request that we set our state to UNLOCKED. Here we mock up a holder
- * to make it look like someone wants the lock EX locally. Any SH
- * and DF requests should be able to share the lock without demoting.
- *
- * Note 3: We only want to demote the demoteable holders when there
- * are no more strong holders. The demoteable holders might as well
- * keep the glock until the last strong holder is done with it.
- */
- if (!find_first_strong_holder(gl)) {
- struct gfs2_holder mock_gh = {
- .gh_gl = gl,
- .gh_state = (state == LM_ST_UNLOCKED) ?
- LM_ST_EXCLUSIVE : state,
- .gh_iflags = BIT(HIF_HOLDER)
- };
-
- demote_incompat_holders(gl, &mock_gh);
- }
handle_callback(gl, state, delay, true);
__gfs2_glock_queue_work(gl, delay);
spin_unlock(&gl->gl_lockref.lock);
@@ -2306,8 +2237,6 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
*p++ = 'H';
if (test_bit(HIF_WAIT, &iflags))
*p++ = 'W';
- if (test_bit(HIF_MAY_DEMOTE, &iflags))
- *p++ = 'D';
if (flags & GL_SKIP)
*p++ = 's';
*p = 0;
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 0d068f4fd7d6..f37ac087e2c1 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -156,8 +156,6 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
break;
- if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
- continue;
if (gh->gh_owner_pid == pid)
goto out;
}
@@ -196,7 +194,7 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
const struct gfs2_glock_operations *glops,
int create, struct gfs2_glock **glp);
-extern void gfs2_glock_hold(struct gfs2_glock *gl);
+extern struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl);
extern void gfs2_glock_put(struct gfs2_glock *gl);
extern void gfs2_glock_queue_put(struct gfs2_glock *gl);
@@ -288,6 +286,9 @@ extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
extern void gfs2_register_debugfs(void);
extern void gfs2_unregister_debugfs(void);
+extern void glock_set_object(struct gfs2_glock *gl, void *object);
+extern void glock_clear_object(struct gfs2_glock *gl, void *object);
+
extern const struct lm_lockops gfs2_dlm_ops;
static inline void gfs2_holder_mark_uninitialized(struct gfs2_holder *gh)
@@ -305,64 +306,6 @@ static inline bool gfs2_holder_queued(struct gfs2_holder *gh)
return !list_empty(&gh->gh_list);
}
-/**
- * glock_set_object - set the gl_object field of a glock
- * @gl: the glock
- * @object: the object
- */
-static inline void glock_set_object(struct gfs2_glock *gl, void *object)
-{
- spin_lock(&gl->gl_lockref.lock);
- if (gfs2_assert_warn(gl->gl_name.ln_sbd, gl->gl_object == NULL))
- gfs2_dump_glock(NULL, gl, true);
- gl->gl_object = object;
- spin_unlock(&gl->gl_lockref.lock);
-}
-
-/**
- * glock_clear_object - clear the gl_object field of a glock
- * @gl: the glock
- * @object: the object
- *
- * I'd love to similarly add this:
- * else if (gfs2_assert_warn(gl->gl_sbd, gl->gl_object == object))
- * gfs2_dump_glock(NULL, gl, true);
- * Unfortunately, that's not possible because as soon as gfs2_delete_inode
- * frees the block in the rgrp, another process can reassign it for an I_NEW
- * inode in gfs2_create_inode because that calls new_inode, not gfs2_iget.
- * That means gfs2_delete_inode may subsequently try to call this function
- * for a glock that's already pointing to a brand new inode. If we clear the
- * new inode's gl_object, we'll introduce metadata corruption. Function
- * gfs2_delete_inode calls clear_inode which calls gfs2_clear_inode which also
- * tries to clear gl_object, so it's more than just gfs2_delete_inode.
- *
- */
-static inline void glock_clear_object(struct gfs2_glock *gl, void *object)
-{
- spin_lock(&gl->gl_lockref.lock);
- if (gl->gl_object == object)
- gl->gl_object = NULL;
- spin_unlock(&gl->gl_lockref.lock);
-}
-
-static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
-
- spin_lock(&gl->gl_lockref.lock);
- set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
- spin_unlock(&gl->gl_lockref.lock);
-}
-
-static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
-
- spin_lock(&gl->gl_lockref.lock);
- clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
- spin_unlock(&gl->gl_lockref.lock);
-}
-
extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 49210a2e7ce7..d78b61ecc1cd 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -397,38 +397,39 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
struct timespec64 atime;
u16 height, depth;
umode_t mode = be32_to_cpu(str->di_mode);
- bool is_new = ip->i_inode.i_state & I_NEW;
+ struct inode *inode = &ip->i_inode;
+ bool is_new = inode->i_state & I_NEW;
if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
goto corrupt;
- if (unlikely(!is_new && inode_wrong_type(&ip->i_inode, mode)))
+ if (unlikely(!is_new && inode_wrong_type(inode, mode)))
goto corrupt;
ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
- ip->i_inode.i_mode = mode;
+ inode->i_mode = mode;
if (is_new) {
- ip->i_inode.i_rdev = 0;
+ inode->i_rdev = 0;
switch (mode & S_IFMT) {
case S_IFBLK:
case S_IFCHR:
- ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major),
- be32_to_cpu(str->di_minor));
+ inode->i_rdev = MKDEV(be32_to_cpu(str->di_major),
+ be32_to_cpu(str->di_minor));
break;
}
}
- i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid));
- i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid));
- set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink));
- i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
- gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
+ i_uid_write(inode, be32_to_cpu(str->di_uid));
+ i_gid_write(inode, be32_to_cpu(str->di_gid));
+ set_nlink(inode, be32_to_cpu(str->di_nlink));
+ i_size_write(inode, be64_to_cpu(str->di_size));
+ gfs2_set_inode_blocks(inode, be64_to_cpu(str->di_blocks));
atime.tv_sec = be64_to_cpu(str->di_atime);
atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
- if (timespec64_compare(&ip->i_inode.i_atime, &atime) < 0)
- ip->i_inode.i_atime = atime;
- ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
- ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
- ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
- ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
+ if (timespec64_compare(&inode->i_atime, &atime) < 0)
+ inode->i_atime = atime;
+ inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
+ inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
+ inode->i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
+ inode->i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
ip->i_goal = be64_to_cpu(str->di_goal_meta);
ip->i_generation = be64_to_cpu(str->di_generation);
@@ -436,7 +437,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
ip->i_diskflags = be32_to_cpu(str->di_flags);
ip->i_eattr = be64_to_cpu(str->di_eattr);
/* i_diskflags and i_eattr must be set before gfs2_set_inode_flags() */
- gfs2_set_inode_flags(&ip->i_inode);
+ gfs2_set_inode_flags(inode);
height = be16_to_cpu(str->di_height);
if (unlikely(height > GFS2_MAX_META_HEIGHT))
goto corrupt;
@@ -448,8 +449,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
ip->i_depth = (u8)depth;
ip->i_entries = be32_to_cpu(str->di_entries);
- if (S_ISREG(ip->i_inode.i_mode))
- gfs2_set_aops(&ip->i_inode);
+ if (gfs2_is_stuffed(ip) && inode->i_size > gfs2_max_stuffed_size(ip))
+ goto corrupt;
+
+ if (S_ISREG(inode->i_mode))
+ gfs2_set_aops(inode);
return 0;
corrupt:
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index d09d9892cd05..c26765080f28 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -252,7 +252,6 @@ struct gfs2_lkstats {
enum {
/* States */
- HIF_MAY_DEMOTE = 1,
HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
HIF_WAIT = 10,
};
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 1371e067d2a7..614db3055c02 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -142,6 +142,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
if (unlikely(error))
goto fail;
+ /*
+ * The only caller that sets @blktype to GFS2_BLKST_UNLINKED is
+ * delete_work_func(). Make sure not to cancel the delete work
+ * from within itself here.
+ */
if (blktype == GFS2_BLKST_UNLINKED)
extra_flags |= LM_FLAG_TRY;
else
@@ -403,12 +408,17 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
goto out_ipreserv;
error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation);
+ if (error)
+ goto out_trans_end;
+
ip->i_no_formal_ino = ip->i_generation;
ip->i_inode.i_ino = ip->i_no_addr;
ip->i_goal = ip->i_no_addr;
+ if (*dblocks > 1)
+ ip->i_eattr = ip->i_no_addr + 1;
+out_trans_end:
gfs2_trans_end(sdp);
-
out_ipreserv:
gfs2_inplace_release(ip);
out_quota:
@@ -586,6 +596,12 @@ static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
* @size: The initial size of the inode (ignored for directories)
* @excl: Force fail if inode exists
*
+ * FIXME: Change to allocate the disk blocks and write them out in the same
+ * transaction. That way, we can no longer end up in a situation in which an
+ * inode is allocated, the node crashes, and the block looks like a valid
+ * inode. (With atomic creates in place, we will also no longer need to zero
+ * the link count and dirty the inode here on failure.)
+ *
* Returns: 0 on success, or error code
*/
@@ -596,12 +612,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
{
const struct qstr *name = &dentry->d_name;
struct posix_acl *default_acl, *acl;
- struct gfs2_holder ghs[2];
+ struct gfs2_holder d_gh, gh;
struct inode *inode = NULL;
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_glock *io_gl;
- int error, free_vfs_inode = 1;
+ int error;
u32 aflags = 0;
unsigned blocks = 1;
struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
@@ -617,10 +633,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail;
- error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &d_gh);
if (error)
goto fail;
- gfs2_holder_mark_uninitialized(ghs + 1);
+ gfs2_holder_mark_uninitialized(&gh);
error = create_ok(dip, name, mode);
if (error)
@@ -642,7 +658,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
else
error = finish_no_open(file, NULL);
}
- gfs2_glock_dq_uninit(ghs);
+ gfs2_glock_dq_uninit(&d_gh);
goto fail;
} else if (error != -ENOENT) {
goto fail_gunlock;
@@ -656,12 +672,12 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = -ENOMEM;
if (!inode)
goto fail_gunlock;
+ ip = GFS2_I(inode);
error = posix_acl_create(dir, &mode, &default_acl, &acl);
if (error)
goto fail_gunlock;
- ip = GFS2_I(inode);
error = gfs2_qa_get(ip);
if (error)
goto fail_free_acls;
@@ -723,15 +739,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
goto fail_free_inode;
gfs2_cancel_delete_work(io_gl);
+retry:
error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr);
- BUG_ON(error);
+ if (error == -EBUSY)
+ goto retry;
+ if (error)
+ goto fail_gunlock2;
error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT | GL_NOPID,
&ip->i_iopen_gh);
if (error)
goto fail_gunlock2;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
if (error)
goto fail_gunlock3;
@@ -739,10 +759,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_gunlock3;
- if (blocks > 1) {
- ip->i_eattr = ip->i_no_addr + 1;
+ if (blocks > 1)
gfs2_init_xattr(ip);
- }
init_dinode(dip, ip, symname);
gfs2_trans_end(sdp);
@@ -750,9 +768,6 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
glock_set_object(io_gl, ip);
gfs2_set_iop(inode);
- free_vfs_inode = 0; /* After this point, the inode is no longer
- considered free. Any failures need to undo
- the gfs2 structures. */
if (default_acl) {
error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
if (error)
@@ -785,9 +800,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
file->f_mode |= FMODE_CREATED;
error = finish_open(file, dentry, gfs2_open_common);
}
- gfs2_glock_dq_uninit(ghs);
+ gfs2_glock_dq_uninit(&d_gh);
gfs2_qa_put(ip);
- gfs2_glock_dq_uninit(ghs + 1);
+ gfs2_glock_dq_uninit(&gh);
gfs2_glock_put(io_gl);
gfs2_qa_put(dip);
unlock_new_inode(inode);
@@ -801,10 +816,6 @@ fail_gunlock3:
fail_gunlock2:
gfs2_glock_put(io_gl);
fail_free_inode:
- if (ip->i_gl) {
- if (free_vfs_inode) /* else evict will do the put for us */
- gfs2_glock_put(ip->i_gl);
- }
gfs2_rs_deltree(&ip->i_res);
gfs2_qa_put(ip);
fail_free_acls:
@@ -812,20 +823,19 @@ fail_free_acls:
posix_acl_release(acl);
fail_gunlock:
gfs2_dir_no_add(&da);
- gfs2_glock_dq_uninit(ghs);
+ gfs2_glock_dq_uninit(&d_gh);
if (!IS_ERR_OR_NULL(inode)) {
+ set_bit(GIF_ALLOC_FAILED, &ip->i_flags);
clear_nlink(inode);
- if (!free_vfs_inode)
+ if (ip->i_no_addr)
mark_inode_dirty(inode);
- set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED,
- &GFS2_I(inode)->i_flags);
if (inode->i_state & I_NEW)
iget_failed(inode);
else
iput(inode);
}
- if (gfs2_holder_initialized(ghs + 1))
- gfs2_glock_dq_uninit(ghs + 1);
+ if (gfs2_holder_initialized(&gh))
+ gfs2_glock_dq_uninit(&gh);
fail:
gfs2_qa_put(dip);
return error;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 723639376ae2..61323deb80bc 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -80,6 +80,15 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
brelse(bd->bd_bh);
}
+static int __gfs2_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct address_space *mapping = data;
+ int ret = mapping->a_ops->writepage(page, wbc);
+ mapping_set_error(mapping, ret);
+ return ret;
+}
+
/**
* gfs2_ail1_start_one - Start I/O on a transaction
* @sdp: The superblock
@@ -131,7 +140,7 @@ __acquires(&sdp->sd_ail_lock)
if (!mapping)
continue;
spin_unlock(&sdp->sd_ail_lock);
- ret = filemap_fdatawrite_wbc(mapping, wbc);
+ ret = write_cache_pages(mapping, wbc, __gfs2_writepage, mapping);
if (need_resched()) {
blk_finish_plug(plug);
cond_resched();
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 6ed728aae9a5..3c41b864ee5b 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -442,6 +442,12 @@ void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
struct buffer_head *bh;
int ty;
+ if (!ip->i_gl) {
+ /* This can only happen during incomplete inode creation. */
+ BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
+ return;
+ }
+
gfs2_ail1_wipe(sdp, bstart, blen);
while (blen) {
ty = REMOVE_META;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b018957a1bb2..999cc146d708 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -379,6 +379,7 @@ out:
void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
{
+ const struct inode *inode = &ip->i_inode;
struct gfs2_dinode *str = buf;
str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
@@ -386,15 +387,15 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI);
str->di_num.no_addr = cpu_to_be64(ip->i_no_addr);
str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino);
- str->di_mode = cpu_to_be32(ip->i_inode.i_mode);
- str->di_uid = cpu_to_be32(i_uid_read(&ip->i_inode));
- str->di_gid = cpu_to_be32(i_gid_read(&ip->i_inode));
- str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
- str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
- str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
- str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
- str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
- str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
+ str->di_mode = cpu_to_be32(inode->i_mode);
+ str->di_uid = cpu_to_be32(i_uid_read(inode));
+ str->di_gid = cpu_to_be32(i_gid_read(inode));
+ str->di_nlink = cpu_to_be32(inode->i_nlink);
+ str->di_size = cpu_to_be64(i_size_read(inode));
+ str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode));
+ str->di_atime = cpu_to_be64(inode->i_atime.tv_sec);
+ str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec);
+ str->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec);
str->di_goal_meta = cpu_to_be64(ip->i_goal);
str->di_goal_data = cpu_to_be64(ip->i_goal);
@@ -402,16 +403,16 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
str->di_flags = cpu_to_be32(ip->i_diskflags);
str->di_height = cpu_to_be16(ip->i_height);
- str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
+ str->di_payload_format = cpu_to_be32(S_ISDIR(inode->i_mode) &&
!(ip->i_diskflags & GFS2_DIF_EXHASH) ?
GFS2_FORMAT_DE : 0);
str->di_depth = cpu_to_be16(ip->i_depth);
str->di_entries = cpu_to_be32(ip->i_entries);
str->di_eattr = cpu_to_be64(ip->i_eattr);
- str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
- str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec);
- str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec);
+ str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
+ str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
+ str->di_ctime_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
}
/**
@@ -475,6 +476,12 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
int need_endtrans = 0;
int ret;
+ if (unlikely(!ip->i_gl)) {
+ /* This can only happen during incomplete inode creation. */
+ BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
+ return;
+ }
+
if (unlikely(gfs2_withdrawn(sdp)))
return;
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
@@ -927,8 +934,7 @@ static int gfs2_drop_inode(struct inode *inode)
{
struct gfs2_inode *ip = GFS2_I(inode);
- if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) &&
- inode->i_nlink &&
+ if (inode->i_nlink &&
gfs2_holder_initialized(&ip->i_iopen_gh)) {
struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
if (test_bit(GLF_DEMOTE, &gl->gl_flags))
@@ -1076,7 +1082,13 @@ static void gfs2_final_release_pages(struct gfs2_inode *ip)
struct inode *inode = &ip->i_inode;
struct gfs2_glock *gl = ip->i_gl;
- truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0);
+ if (unlikely(!gl)) {
+ /* This can only happen during incomplete inode creation. */
+ BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
+ return;
+ }
+
+ truncate_inode_pages(gfs2_glock2aspace(gl), 0);
truncate_inode_pages(&inode->i_data, 0);
if (atomic_read(&gl->gl_revokes) == 0) {
@@ -1218,10 +1230,8 @@ static enum dinode_demise evict_should_delete(struct inode *inode,
struct gfs2_sbd *sdp = sb->s_fs_info;
int ret;
- if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
- BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));
+ if (unlikely(test_bit(GIF_ALLOC_FAILED, &ip->i_flags)))
goto should_delete;
- }
if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags))
return SHOULD_DEFER_EVICTION;
@@ -1294,13 +1304,22 @@ static int evict_unlinked_inode(struct inode *inode)
goto out;
}
- /* We're about to clear the bitmap for the dinode, but as soon as we
- do, gfs2_create_inode can create another inode at the same block
- location and try to set gl_object again. We clear gl_object here so
- that subsequent inode creates don't see an old gl_object. */
- glock_clear_object(ip->i_gl, ip);
+ if (ip->i_gl)
+ gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino);
+
+ /*
+ * As soon as we clear the bitmap for the dinode, gfs2_create_inode()
+ * can get called to recreate it, or even gfs2_inode_lookup() if the
+ * inode was recreated on another node in the meantime.
+ *
+ * However, inserting the new inode into the inode hash table will not
+ * succeed until the old inode is removed, and that only happens after
+ * ->evict_inode() returns. The new inode is attached to its inode and
+ * iopen glocks after inserting it into the inode hash table, so at
+ * that point we can be sure that both glocks are unused.
+ */
+
ret = gfs2_dinode_dealloc(ip);
- gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino);
out:
return ret;
}
@@ -1367,12 +1386,7 @@ static void gfs2_evict_inode(struct inode *inode)
struct gfs2_holder gh;
int ret;
- if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
- clear_inode(inode);
- return;
- }
-
- if (inode->i_nlink || sb_rdonly(sb))
+ if (inode->i_nlink || sb_rdonly(sb) || !ip->i_no_addr)
goto out;
gfs2_holder_mark_uninitialized(&gh);
@@ -1405,12 +1419,9 @@ out:
struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
glock_clear_object(gl, ip);
- if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq(&ip->i_iopen_gh);
- }
gfs2_glock_hold(gl);
- gfs2_holder_uninit(&ip->i_iopen_gh);
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
gfs2_glock_put_eventually(gl);
}
if (ip->i_gl) {
@@ -1429,6 +1440,7 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
ip = alloc_inode_sb(sb, gfs2_inode_cachep, GFP_KERNEL);
if (!ip)
return NULL;
+ ip->i_no_addr = 0;
ip->i_flags = 0;
ip->i_gl = NULL;
gfs2_holder_mark_uninitialized(&ip->i_iopen_gh);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index f6a66050380e..518c0677e12a 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1412,11 +1412,13 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
ip->i_eattr = 0;
gfs2_add_inode_blocks(&ip->i_inode, -1);
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (!error) {
- gfs2_trans_add_meta(ip->i_gl, dibh);
- gfs2_dinode_out(ip, dibh->b_data);
- brelse(dibh);
+ if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) {
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (!error) {
+ gfs2_trans_add_meta(ip->i_gl, dibh);
+ gfs2_dinode_out(ip, dibh->b_data);
+ brelse(dibh);
+ }
}
gfs2_trans_end(sdp);
@@ -1445,14 +1447,16 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
if (error)
return error;
- error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
- if (error)
- goto out_quota;
-
- if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
- error = ea_dealloc_indirect(ip);
+ if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) {
+ error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
if (error)
goto out_quota;
+
+ if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
+ error = ea_dealloc_indirect(ip);
+ if (error)
+ goto out_quota;
+ }
}
error = ea_dealloc_block(ip);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 9c329a365e75..3a155c1d810e 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -458,15 +458,16 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
/* panic? */
return -EIO;
+ res = -EIO;
if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN)
- return -EIO;
+ goto out;
fd.search_key->cat = HFS_I(main_inode)->cat_key;
if (hfs_brec_find(&fd))
- /* panic? */
goto out;
if (S_ISDIR(main_inode->i_mode)) {
- WARN_ON(fd.entrylength < sizeof(struct hfs_cat_dir));
+ if (fd.entrylength < sizeof(struct hfs_cat_dir))
+ goto out;
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_dir));
if (rec.type != HFS_CDR_DIR ||
@@ -479,6 +480,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_dir));
} else if (HFS_IS_RSRC(inode)) {
+ if (fd.entrylength < sizeof(struct hfs_cat_file))
+ goto out;
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
hfs_inode_write_fork(inode, rec.file.RExtRec,
@@ -486,7 +489,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
} else {
- WARN_ON(fd.entrylength < sizeof(struct hfs_cat_file));
+ if (fd.entrylength < sizeof(struct hfs_cat_file))
+ goto out;
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
if (rec.type != HFS_CDR_FIL ||
@@ -503,9 +507,10 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
}
+ res = 0;
out:
hfs_find_exit(&fd);
- return 0;
+ return res;
}
static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 91ee0b308e13..356193e44cf0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -584,7 +584,7 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter,
return iomap_read_inline_data(iter, folio);
}
-static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
+static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
size_t len, struct folio **foliop)
{
const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
@@ -618,6 +618,27 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
goto out_no_page;
}
+
+ /*
+ * Now we have a locked folio, before we do anything with it we need to
+ * check that the iomap we have cached is not stale. The inode extent
+ * mapping can change due to concurrent IO in flight (e.g.
+ * IOMAP_UNWRITTEN state can change and memory reclaim could have
+ * reclaimed a previously partially written page at this index after IO
+ * completion before this write reaches this file offset) and hence we
+ * could do the wrong thing here (zero a page range incorrectly or fail
+ * to zero) and corrupt data.
+ */
+ if (page_ops && page_ops->iomap_valid) {
+ bool iomap_valid = page_ops->iomap_valid(iter->inode,
+ &iter->iomap);
+ if (!iomap_valid) {
+ iter->iomap.flags |= IOMAP_F_STALE;
+ status = 0;
+ goto out_unlock;
+ }
+ }
+
if (pos + len > folio_pos(folio) + folio_size(folio))
len = folio_pos(folio) + folio_size(folio) - pos;
@@ -773,6 +794,8 @@ again:
status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
break;
+ if (iter->iomap.flags & IOMAP_F_STALE)
+ break;
page = folio_file_page(folio, pos >> PAGE_SHIFT);
if (mapping_writably_mapped(mapping))
@@ -832,6 +855,231 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+/*
+ * Scan the data range passed to us for dirty page cache folios. If we find a
+ * dirty folio, punch out the preceeding range and update the offset from which
+ * the next punch will start from.
+ *
+ * We can punch out storage reservations under clean pages because they either
+ * contain data that has been written back - in which case the delalloc punch
+ * over that range is a no-op - or they have been read faults in which case they
+ * contain zeroes and we can remove the delalloc backing range and any new
+ * writes to those pages will do the normal hole filling operation...
+ *
+ * This makes the logic simple: we only need to keep the delalloc extents only
+ * over the dirty ranges of the page cache.
+ *
+ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
+ * simplify range iterations.
+ */
+static int iomap_write_delalloc_scan(struct inode *inode,
+ loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
+ int (*punch)(struct inode *inode, loff_t offset, loff_t length))
+{
+ while (start_byte < end_byte) {
+ struct folio *folio;
+
+ /* grab locked page */
+ folio = filemap_lock_folio(inode->i_mapping,
+ start_byte >> PAGE_SHIFT);
+ if (!folio) {
+ start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
+ PAGE_SIZE;
+ continue;
+ }
+
+ /* if dirty, punch up to offset */
+ if (folio_test_dirty(folio)) {
+ if (start_byte > *punch_start_byte) {
+ int error;
+
+ error = punch(inode, *punch_start_byte,
+ start_byte - *punch_start_byte);
+ if (error) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return error;
+ }
+ }
+
+ /*
+ * Make sure the next punch start is correctly bound to
+ * the end of this data range, not the end of the folio.
+ */
+ *punch_start_byte = min_t(loff_t, end_byte,
+ folio_next_index(folio) << PAGE_SHIFT);
+ }
+
+ /* move offset to start of next folio in range */
+ start_byte = folio_next_index(folio) << PAGE_SHIFT;
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ return 0;
+}
+
+/*
+ * Punch out all the delalloc blocks in the range given except for those that
+ * have dirty data still pending in the page cache - those are going to be
+ * written and so must still retain the delalloc backing for writeback.
+ *
+ * As we are scanning the page cache for data, we don't need to reimplement the
+ * wheel - mapping_seek_hole_data() does exactly what we need to identify the
+ * start and end of data ranges correctly even for sub-folio block sizes. This
+ * byte range based iteration is especially convenient because it means we
+ * don't have to care about variable size folios, nor where the start or end of
+ * the data range lies within a folio, if they lie within the same folio or even
+ * if there are multiple discontiguous data ranges within the folio.
+ *
+ * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
+ * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
+ * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
+ * date. A write page fault can then mark it dirty. If we then fail a write()
+ * beyond EOF into that up to date cached range, we allocate a delalloc block
+ * beyond EOF and then have to punch it out. Because the range is up to date,
+ * mapping_seek_hole_data() will return it, and we will skip the punch because
+ * the folio is dirty. THis is incorrect - we always need to punch out delalloc
+ * beyond EOF in this case as writeback will never write back and covert that
+ * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
+ * resulting in always punching out the range from the EOF to the end of the
+ * range the iomap spans.
+ *
+ * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it
+ * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA
+ * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte)
+ * returns the end of the data range (data_end). Using closed intervals would
+ * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
+ * the code to subtle off-by-one bugs....
+ */
+static int iomap_write_delalloc_release(struct inode *inode,
+ loff_t start_byte, loff_t end_byte,
+ int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+{
+ loff_t punch_start_byte = start_byte;
+ loff_t scan_end_byte = min(i_size_read(inode), end_byte);
+ int error = 0;
+
+ /*
+ * Lock the mapping to avoid races with page faults re-instantiating
+ * folios and dirtying them via ->page_mkwrite whilst we walk the
+ * cache and perform delalloc extent removal. Failing to do this can
+ * leave dirty pages with no space reservation in the cache.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ while (start_byte < scan_end_byte) {
+ loff_t data_end;
+
+ start_byte = mapping_seek_hole_data(inode->i_mapping,
+ start_byte, scan_end_byte, SEEK_DATA);
+ /*
+ * If there is no more data to scan, all that is left is to
+ * punch out the remaining range.
+ */
+ if (start_byte == -ENXIO || start_byte == scan_end_byte)
+ break;
+ if (start_byte < 0) {
+ error = start_byte;
+ goto out_unlock;
+ }
+ WARN_ON_ONCE(start_byte < punch_start_byte);
+ WARN_ON_ONCE(start_byte > scan_end_byte);
+
+ /*
+ * We find the end of this contiguous cached data range by
+ * seeking from start_byte to the beginning of the next hole.
+ */
+ data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
+ scan_end_byte, SEEK_HOLE);
+ if (data_end < 0) {
+ error = data_end;
+ goto out_unlock;
+ }
+ WARN_ON_ONCE(data_end <= start_byte);
+ WARN_ON_ONCE(data_end > scan_end_byte);
+
+ error = iomap_write_delalloc_scan(inode, &punch_start_byte,
+ start_byte, data_end, punch);
+ if (error)
+ goto out_unlock;
+
+ /* The next data search starts at the end of this one. */
+ start_byte = data_end;
+ }
+
+ if (punch_start_byte < end_byte)
+ error = punch(inode, punch_start_byte,
+ end_byte - punch_start_byte);
+out_unlock:
+ filemap_invalidate_unlock(inode->i_mapping);
+ return error;
+}
+
+/*
+ * When a short write occurs, the filesystem may need to remove reserved space
+ * that was allocated in ->iomap_begin from it's ->iomap_end method. For
+ * filesystems that use delayed allocation, we need to punch out delalloc
+ * extents from the range that are not dirty in the page cache. As the write can
+ * race with page faults, there can be dirty pages over the delalloc extent
+ * outside the range of a short write but still within the delalloc extent
+ * allocated for this iomap.
+ *
+ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
+ * simplify range iterations.
+ *
+ * The punch() callback *must* only punch delalloc extents in the range passed
+ * to it. It must skip over all other types of extents in the range and leave
+ * them completely unchanged. It must do this punch atomically with respect to
+ * other extent modifications.
+ *
+ * The punch() callback may be called with a folio locked to prevent writeback
+ * extent allocation racing at the edge of the range we are currently punching.
+ * The locked folio may or may not cover the range being punched, so it is not
+ * safe for the punch() callback to lock folios itself.
+ *
+ * Lock order is:
+ *
+ * inode->i_rwsem (shared or exclusive)
+ * inode->i_mapping->invalidate_lock (exclusive)
+ * folio_lock()
+ * ->punch
+ * internal filesystem allocation lock
+ */
+int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
+ struct iomap *iomap, loff_t pos, loff_t length,
+ ssize_t written,
+ int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+{
+ loff_t start_byte;
+ loff_t end_byte;
+ int blocksize = i_blocksize(inode);
+
+ if (iomap->type != IOMAP_DELALLOC)
+ return 0;
+
+ /* If we didn't reserve the blocks, we're not allowed to punch them. */
+ if (!(iomap->flags & IOMAP_F_NEW))
+ return 0;
+
+ /*
+ * start_byte refers to the first unused block after a short write. If
+ * nothing was written, round offset down to point at the first block in
+ * the range.
+ */
+ if (unlikely(!written))
+ start_byte = round_down(pos, blocksize);
+ else
+ start_byte = round_up(pos + written, blocksize);
+ end_byte = round_up(pos + length, blocksize);
+
+ /* Nothing to do if we've written the entire delalloc extent */
+ if (start_byte >= end_byte)
+ return 0;
+
+ return iomap_write_delalloc_release(inode, start_byte, end_byte,
+ punch);
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
+
static loff_t iomap_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
@@ -856,6 +1104,8 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
status = iomap_write_begin(iter, pos, bytes, &folio);
if (unlikely(status))
return status;
+ if (iter->iomap.flags & IOMAP_F_STALE)
+ break;
status = iomap_write_end(iter, pos, bytes, bytes, folio);
if (WARN_ON_ONCE(status == 0))
@@ -911,6 +1161,8 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
status = iomap_write_begin(iter, pos, bytes, &folio);
if (status)
return status;
+ if (iter->iomap.flags & IOMAP_F_STALE)
+ break;
offset = offset_in_folio(folio, pos);
if (bytes > folio_size(folio) - offset)
diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
index a1c7592d2ade..79a0614eaab7 100644
--- a/fs/iomap/iter.c
+++ b/fs/iomap/iter.c
@@ -7,12 +7,28 @@
#include <linux/iomap.h>
#include "trace.h"
+/*
+ * Advance to the next range we need to map.
+ *
+ * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully
+ * processed - it was aborted because the extent the iomap spanned may have been
+ * changed during the operation. In this case, the iteration behaviour is to
+ * remap the unprocessed range of the iter, and that means we may need to remap
+ * even when we've made no progress (i.e. iter->processed = 0). Hence the
+ * "finished iterating" case needs to distinguish between
+ * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we
+ * need to remap the entire remaining range.
+ */
static inline int iomap_iter_advance(struct iomap_iter *iter)
{
+ bool stale = iter->iomap.flags & IOMAP_F_STALE;
+
/* handle the previous iteration (if any) */
if (iter->iomap.length) {
- if (iter->processed <= 0)
+ if (iter->processed < 0)
return iter->processed;
+ if (!iter->processed && !stale)
+ return 0;
if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
return -EIO;
iter->pos += iter->processed;
@@ -33,6 +49,7 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
WARN_ON_ONCE(iter->iomap.offset > iter->pos);
WARN_ON_ONCE(iter->iomap.length == 0);
WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
+ WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE);
trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
if (iter->srcmap.type != IOMAP_HOLE)
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index f33b3baad07c..935ef8cb02b2 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -125,9 +125,9 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
* kn_to: /n1/n2/n3 [depth=3]
* result: /../..
*
- * [3] when @kn_to is NULL result will be "(null)"
+ * [3] when @kn_to is %NULL result will be "(null)"
*
- * Returns the length of the full path. If the full length is equal to or
+ * Return: the length of the full path. If the full length is equal to or
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
@@ -185,10 +185,12 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
* @buflen: size of @buf
*
* Copies the name of @kn into @buf of @buflen bytes. The behavior is
- * similar to strlcpy(). It returns the length of @kn's name and if @buf
- * isn't long enough, it's filled upto @buflen-1 and nul terminated.
+ * similar to strlcpy().
*
- * Fills buffer with "(null)" if @kn is NULL.
+ * Fills buffer with "(null)" if @kn is %NULL.
+ *
+ * Return: the length of @kn's name and if @buf isn't long enough,
+ * it's filled up to @buflen-1 and nul terminated.
*
* This function can be called from any context.
*/
@@ -215,7 +217,7 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
* path (which includes '..'s) as needed to reach from @from to @to is
* returned.
*
- * Returns the length of the full path. If the full length is equal to or
+ * Return: the length of the full path. If the full length is equal to or
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
@@ -287,6 +289,8 @@ out:
*
* Determines @kn's parent, pins and returns it. This function can be
* called from any context.
+ *
+ * Return: parent node of @kn
*/
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{
@@ -302,11 +306,11 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
}
/**
- * kernfs_name_hash
+ * kernfs_name_hash - calculate hash of @ns + @name
* @name: Null terminated string to hash
* @ns: Namespace tag to hash
*
- * Returns 31 bit hash of ns + name (so it fits in an off_t )
+ * Return: 31-bit hash of ns + name (so it fits in an off_t)
*/
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
@@ -354,8 +358,8 @@ static int kernfs_sd_compare(const struct kernfs_node *left,
* Locking:
* kernfs_rwsem held exclusive
*
- * RETURNS:
- * 0 on susccess -EEXIST on failure.
+ * Return:
+ * %0 on success, -EEXIST on failure.
*/
static int kernfs_link_sibling(struct kernfs_node *kn)
{
@@ -394,8 +398,10 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
* @kn: kernfs_node of interest
*
* Try to unlink @kn from its sibling rbtree which starts from
- * kn->parent->dir.children. Returns %true if @kn was actually
- * removed, %false if @kn wasn't on the rbtree.
+ * kn->parent->dir.children.
+ *
+ * Return: %true if @kn was actually removed,
+ * %false if @kn wasn't on the rbtree.
*
* Locking:
* kernfs_rwsem held exclusive
@@ -419,10 +425,10 @@ static bool kernfs_unlink_sibling(struct kernfs_node *kn)
* @kn: kernfs_node to get an active reference to
*
* Get an active reference of @kn. This function is noop if @kn
- * is NULL.
+ * is %NULL.
*
- * RETURNS:
- * Pointer to @kn on success, NULL on failure.
+ * Return:
+ * Pointer to @kn on success, %NULL on failure.
*/
struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
{
@@ -442,7 +448,7 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
* @kn: kernfs_node to put an active reference to
*
* Put an active reference to @kn. This function is noop if @kn
- * is NULL.
+ * is %NULL.
*/
void kernfs_put_active(struct kernfs_node *kn)
{
@@ -464,7 +470,7 @@ void kernfs_put_active(struct kernfs_node *kn)
* kernfs_drain - drain kernfs_node
* @kn: kernfs_node to drain
*
- * Drain existing usages and nuke all existing mmaps of @kn. Mutiple
+ * Drain existing usages and nuke all existing mmaps of @kn. Multiple
* removers may invoke this function concurrently on @kn and all will
* return after draining is complete.
*/
@@ -577,7 +583,7 @@ EXPORT_SYMBOL_GPL(kernfs_put);
* kernfs_node_from_dentry - determine kernfs_node associated with a dentry
* @dentry: the dentry in question
*
- * Return the kernfs_node associated with @dentry. If @dentry is not a
+ * Return: the kernfs_node associated with @dentry. If @dentry is not a
* kernfs one, %NULL is returned.
*
* While the returned kernfs_node will stay accessible as long as @dentry
@@ -684,8 +690,8 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
* @id's lower 32bits encode ino and upper gen. If the gen portion is
* zero, all generations are matched.
*
- * RETURNS:
- * NULL on failure. Return a kernfs node with reference counter incremented
+ * Return: %NULL on failure,
+ * otherwise a kernfs node with reference counter incremented.
*/
struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
u64 id)
@@ -733,8 +739,8 @@ err_unlock:
* function increments nlink of the parent's inode if @kn is a
* directory and link into the children list of the parent.
*
- * RETURNS:
- * 0 on success, -EEXIST if entry with the given name already
+ * Return:
+ * %0 on success, -EEXIST if entry with the given name already
* exists.
*/
int kernfs_add_one(struct kernfs_node *kn)
@@ -797,8 +803,9 @@ out_unlock:
* @name: name to look for
* @ns: the namespace tag to use
*
- * Look for kernfs_node with name @name under @parent. Returns pointer to
- * the found kernfs_node on success, %NULL on failure.
+ * Look for kernfs_node with name @name under @parent.
+ *
+ * Return: pointer to the found kernfs_node on success, %NULL on failure.
*/
static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
const unsigned char *name,
@@ -871,8 +878,9 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
* @ns: the namespace tag to use
*
* Look for kernfs_node with name @name under @parent and get a reference
- * if found. This function may sleep and returns pointer to the found
- * kernfs_node on success, %NULL on failure.
+ * if found. This function may sleep.
+ *
+ * Return: pointer to the found kernfs_node on success, %NULL on failure.
*/
struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
const char *name, const void *ns)
@@ -896,8 +904,9 @@ EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
* @ns: the namespace tag to use
*
* Look for kernfs_node with path @path under @parent and get a reference
- * if found. This function may sleep and returns pointer to the found
- * kernfs_node on success, %NULL on failure.
+ * if found. This function may sleep.
+ *
+ * Return: pointer to the found kernfs_node on success, %NULL on failure.
*/
struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
const char *path, const void *ns)
@@ -919,7 +928,7 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
* @flags: KERNFS_ROOT_* flags
* @priv: opaque data associated with the new directory
*
- * Returns the root of the new hierarchy on success, ERR_PTR() value on
+ * Return: the root of the new hierarchy on success, ERR_PTR() value on
* failure.
*/
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
@@ -991,6 +1000,8 @@ void kernfs_destroy_root(struct kernfs_root *root)
/**
* kernfs_root_to_node - return the kernfs_node associated with a kernfs_root
* @root: root to use to lookup
+ *
+ * Return: @root's kernfs_node
*/
struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root)
{
@@ -1007,7 +1018,7 @@ struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root)
* @priv: opaque data associated with the new directory
* @ns: optional namespace tag of the directory
*
- * Returns the created node on success, ERR_PTR() value on failure.
+ * Return: the created node on success, ERR_PTR() value on failure.
*/
struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
const char *name, umode_t mode,
@@ -1041,7 +1052,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
* @parent: parent in which to create a new directory
* @name: name of the new directory
*
- * Returns the created node on success, ERR_PTR() value on failure.
+ * Return: the created node on success, ERR_PTR() value on failure.
*/
struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
const char *name)
@@ -1083,20 +1094,30 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
/* If the kernfs parent node has changed discard and
* proceed to ->lookup.
+ *
+ * There's nothing special needed here when getting the
+ * dentry parent, even if a concurrent rename is in
+ * progress. That's because the dentry is negative so
+ * it can only be the target of the rename and it will
+ * be doing a d_move() not a replace. Consequently the
+ * dentry d_parent won't change over the d_move().
+ *
+ * Also kernfs negative dentries transitioning from
+ * negative to positive during revalidate won't happen
+ * because they are invalidated on containing directory
+ * changes and the lookup re-done so that a new positive
+ * dentry can be properly created.
*/
- spin_lock(&dentry->d_lock);
+ root = kernfs_root_from_sb(dentry->d_sb);
+ down_read(&root->kernfs_rwsem);
parent = kernfs_dentry_node(dentry->d_parent);
if (parent) {
- spin_unlock(&dentry->d_lock);
- root = kernfs_root(parent);
- down_read(&root->kernfs_rwsem);
if (kernfs_dir_changed(parent, dentry)) {
up_read(&root->kernfs_rwsem);
return 0;
}
- up_read(&root->kernfs_rwsem);
- } else
- spin_unlock(&dentry->d_lock);
+ }
+ up_read(&root->kernfs_rwsem);
/* The kernfs parent node hasn't changed, leave the
* dentry negative and return success.
@@ -1290,6 +1311,8 @@ static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
* Find the next descendant to visit for post-order traversal of @root's
* descendants. @root is included in the iteration and the last node to be
* visited.
+ *
+ * Return: the next descendant to visit or %NULL when done.
*/
static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
struct kernfs_node *root)
@@ -1553,6 +1576,8 @@ void kernfs_unbreak_active_protection(struct kernfs_node *kn)
* the whole kernfs_ops which won the arbitration. This can be used to
* guarantee, for example, all concurrent writes to a "delete" file to
* finish only after the whole operation is complete.
+ *
+ * Return: %true if @kn is removed by this call, otherwise %false.
*/
bool kernfs_remove_self(struct kernfs_node *kn)
{
@@ -1613,7 +1638,8 @@ bool kernfs_remove_self(struct kernfs_node *kn)
* @ns: namespace tag of the kernfs_node to remove
*
* Look for the kernfs_node with @name and @ns under @parent and remove it.
- * Returns 0 on success, -ENOENT if such entry doesn't exist.
+ *
+ * Return: %0 on success, -ENOENT if such entry doesn't exist.
*/
int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
const void *ns)
@@ -1651,6 +1677,8 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
* @new_parent: new parent to put @sd under
* @new_name: new name
* @new_ns: new namespace tag
+ *
+ * Return: %0 on success, -errno on failure.
*/
int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
const char *new_name, const void *new_ns)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 9ab6c92e02da..e4a50e4ff0d2 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -33,7 +33,7 @@ struct kernfs_open_node {
* pending queue is implemented as a singly linked list of kernfs_nodes.
* The list is terminated with the self pointer so that whether a
* kernfs_node is on the list or not can be determined by testing the next
- * pointer for NULL.
+ * pointer for %NULL.
*/
#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list)
@@ -59,8 +59,10 @@ static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn)
}
/**
- * of_on - Return the kernfs_open_node of the specified kernfs_open_file
- * @of: taret kernfs_open_file
+ * of_on - Get the kernfs_open_node of the specified kernfs_open_file
+ * @of: target kernfs_open_file
+ *
+ * Return: the kernfs_open_node of the kernfs_open_file
*/
static struct kernfs_open_node *of_on(struct kernfs_open_file *of)
{
@@ -82,6 +84,8 @@ static struct kernfs_open_node *of_on(struct kernfs_open_file *of)
* outside RCU read-side critical section.
*
* The caller needs to make sure that kernfs_open_file_mutex is held.
+ *
+ * Return: @kn->attr.open when kernfs_open_file_mutex is held.
*/
static struct kernfs_open_node *
kernfs_deref_open_node_locked(struct kernfs_node *kn)
@@ -548,11 +552,11 @@ out_unlock:
* If @kn->attr.open exists, increment its reference count; otherwise,
* create one. @of is chained to the files list.
*
- * LOCKING:
+ * Locking:
* Kernel thread context (may sleep).
*
- * RETURNS:
- * 0 on success, -errno on failure.
+ * Return:
+ * %0 on success, -errno on failure.
*/
static int kernfs_get_open_node(struct kernfs_node *kn,
struct kernfs_open_file *of)
@@ -1024,7 +1028,7 @@ const struct file_operations kernfs_file_fops = {
* @ns: optional namespace tag of the file
* @key: lockdep key for the file's active_ref, %NULL to disable lockdep
*
- * Returns the created node on success, ERR_PTR() value on error.
+ * Return: the created node on success, ERR_PTR() value on error.
*/
struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
const char *name,
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 3d783d80f5da..eac0f210299a 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -94,7 +94,7 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
* @kn: target node
* @iattr: iattr to set
*
- * Returns 0 on success, -errno on failure.
+ * Return: %0 on success, -errno on failure.
*/
int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
{
@@ -190,10 +190,8 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns,
struct kernfs_root *root = kernfs_root(kn);
down_read(&root->kernfs_rwsem);
- spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
generic_fillattr(&init_user_ns, inode, stat);
- spin_unlock(&inode->i_lock);
up_read(&root->kernfs_rwsem);
return 0;
@@ -241,11 +239,11 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
* allocated and basics are initialized. New inode is returned
* locked.
*
- * LOCKING:
+ * Locking:
* Kernel thread context (may sleep).
*
- * RETURNS:
- * Pointer to allocated inode on success, NULL on failure.
+ * Return:
+ * Pointer to allocated inode on success, %NULL on failure.
*/
struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
{
@@ -288,10 +286,8 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns,
root = kernfs_root(kn);
down_read(&root->kernfs_rwsem);
- spin_lock(&inode->i_lock);
kernfs_refresh_inode(kn, inode);
ret = generic_permission(&init_user_ns, inode, mask);
- spin_unlock(&inode->i_lock);
up_read(&root->kernfs_rwsem);
return ret;
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index fc5821effd97..9046d9f39e63 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -58,7 +58,7 @@ struct kernfs_root {
* kernfs_root - find out the kernfs_root a kernfs_node belongs to
* @kn: kernfs_node of interest
*
- * Return the kernfs_root @kn belongs to.
+ * Return: the kernfs_root @kn belongs to.
*/
static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
{
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index d0859f72d2d6..e08e8d999807 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -153,7 +153,7 @@ static const struct export_operations kernfs_export_ops = {
* kernfs_root_from_sb - determine kernfs_root associated with a super_block
* @sb: the super_block in question
*
- * Return the kernfs_root associated with @sb. If @sb is not a kernfs one,
+ * Return: the kernfs_root associated with @sb. If @sb is not a kernfs one,
* %NULL is returned.
*/
struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
@@ -167,7 +167,7 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
* find the next ancestor in the path down to @child, where @parent was the
* ancestor whose descendant we want to find.
*
- * Say the path is /a/b/c/d. @child is d, @parent is NULL. We return the root
+ * Say the path is /a/b/c/d. @child is d, @parent is %NULL. We return the root
* node. If @parent is b, then we return the node for c.
* Passing in d as @parent is not ok.
*/
@@ -192,6 +192,8 @@ static struct kernfs_node *find_next_ancestor(struct kernfs_node *child,
* kernfs_node_dentry - get a dentry for the given kernfs_node
* @kn: kernfs_node for which a dentry is needed
* @sb: the kernfs super_block
+ *
+ * Return: the dentry pointer
*/
struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
struct super_block *sb)
@@ -296,7 +298,7 @@ static int kernfs_set_super(struct super_block *sb, struct fs_context *fc)
* kernfs_super_ns - determine the namespace tag of a kernfs super_block
* @sb: super_block of interest
*
- * Return the namespace tag associated with kernfs super_block @sb.
+ * Return: the namespace tag associated with kernfs super_block @sb.
*/
const void *kernfs_super_ns(struct super_block *sb)
{
@@ -313,6 +315,8 @@ const void *kernfs_super_ns(struct super_block *sb)
* implementation, which should set the specified ->@fs_type and ->@flags, and
* specify the hierarchy and namespace tag to mount via ->@root and ->@ns,
* respectively.
+ *
+ * Return: %0 on success, -errno on failure.
*/
int kernfs_get_tree(struct fs_context *fc)
{
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 0ab13824822f..45371a70caa7 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -19,7 +19,7 @@
* @name: name of the symlink
* @target: target node for the symlink to point to
*
- * Returns the created node on success, ERR_PTR() value on error.
+ * Return: the created node on success, ERR_PTR() value on error.
* Ownership of the link matches ownership of the target.
*/
struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c
index 2a39ffb8423b..6e61b5bc7d86 100644
--- a/fs/ksmbd/auth.c
+++ b/fs/ksmbd/auth.c
@@ -322,7 +322,8 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
dn_off = le32_to_cpu(authblob->DomainName.BufferOffset);
dn_len = le16_to_cpu(authblob->DomainName.Length);
- if (blob_len < (u64)dn_off + dn_len || blob_len < (u64)nt_off + nt_len)
+ if (blob_len < (u64)dn_off + dn_len || blob_len < (u64)nt_off + nt_len ||
+ nt_len < CIFS_ENCPWD_SIZE)
return -EINVAL;
/* TODO : use domain name that imported from configuration file */
diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c
index 12be8386446a..fd0a288af299 100644
--- a/fs/ksmbd/connection.c
+++ b/fs/ksmbd/connection.c
@@ -316,9 +316,12 @@ int ksmbd_conn_handler_loop(void *p)
/* 4 for rfc1002 length field */
size = pdu_size + 4;
- conn->request_buf = kvmalloc(size, GFP_KERNEL);
+ conn->request_buf = kvmalloc(size,
+ GFP_KERNEL |
+ __GFP_NOWARN |
+ __GFP_NORETRY);
if (!conn->request_buf)
- continue;
+ break;
memcpy(conn->request_buf, hdr_buf, sizeof(hdr_buf));
if (!ksmbd_smb_request(conn))
diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h
index ff07c67f4565..b6bd8311e6b4 100644
--- a/fs/ksmbd/ksmbd_netlink.h
+++ b/fs/ksmbd/ksmbd_netlink.h
@@ -74,6 +74,7 @@ struct ksmbd_heartbeat {
#define KSMBD_GLOBAL_FLAG_SMB2_LEASES BIT(0)
#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION BIT(1)
#define KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL BIT(2)
+#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF BIT(3)
/*
* IPC request for ksmbd server startup
diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c
index 3fa2139a0b30..92b1603b5abe 100644
--- a/fs/ksmbd/mgmt/user_session.c
+++ b/fs/ksmbd/mgmt/user_session.c
@@ -108,15 +108,17 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name)
entry->method = method;
entry->id = ksmbd_ipc_id_alloc();
if (entry->id < 0)
- goto error;
+ goto free_entry;
resp = ksmbd_rpc_open(sess, entry->id);
if (!resp)
- goto error;
+ goto free_id;
kvfree(resp);
return entry->id;
-error:
+free_id:
+ ksmbd_rpc_id_free(entry->id);
+free_entry:
list_del(&entry->list);
kfree(entry);
return -EINVAL;
diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c
index a0d635304754..394b6ceac431 100644
--- a/fs/ksmbd/server.c
+++ b/fs/ksmbd/server.c
@@ -432,11 +432,9 @@ static ssize_t stats_show(struct class *class, struct class_attribute *attr,
"reset",
"shutdown"
};
-
- ssize_t sz = scnprintf(buf, PAGE_SIZE, "%d %s %d %lu\n", stats_version,
- state[server_conf.state], server_conf.tcp_port,
- server_conf.ipc_last_active / HZ);
- return sz;
+ return sysfs_emit(buf, "%d %s %d %lu\n", stats_version,
+ state[server_conf.state], server_conf.tcp_port,
+ server_conf.ipc_last_active / HZ);
}
static ssize_t kill_server_store(struct class *class,
@@ -468,19 +466,13 @@ static ssize_t debug_show(struct class *class, struct class_attribute *attr,
for (i = 0; i < ARRAY_SIZE(debug_type_strings); i++) {
if ((ksmbd_debug_types >> i) & 1) {
- pos = scnprintf(buf + sz,
- PAGE_SIZE - sz,
- "[%s] ",
- debug_type_strings[i]);
+ pos = sysfs_emit_at(buf, sz, "[%s] ", debug_type_strings[i]);
} else {
- pos = scnprintf(buf + sz,
- PAGE_SIZE - sz,
- "%s ",
- debug_type_strings[i]);
+ pos = sysfs_emit_at(buf, sz, "%s ", debug_type_strings[i]);
}
sz += pos;
}
- sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n");
+ sz += sysfs_emit_at(buf, sz, "\n");
return sz;
}
diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c
index ab23da2120b9..e401302478c3 100644
--- a/fs/ksmbd/smb2ops.c
+++ b/fs/ksmbd/smb2ops.c
@@ -247,8 +247,9 @@ void init_smb3_02_server(struct ksmbd_conn *conn)
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
- if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION &&
- conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION ||
+ (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) &&
+ conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION))
conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
@@ -271,6 +272,11 @@ int init_smb3_11_server(struct ksmbd_conn *conn)
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING;
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION ||
+ (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) &&
+ conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION))
+ conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
+
if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c
index 9306e10753f9..38fbda52e06f 100644
--- a/fs/ksmbd/smb2pdu.c
+++ b/fs/ksmbd/smb2pdu.c
@@ -903,7 +903,7 @@ static void decode_encrypt_ctxt(struct ksmbd_conn *conn,
return;
}
- if (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION))
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF)
return;
for (i = 0; i < cph_cnt; i++) {
@@ -1508,7 +1508,8 @@ static int ntlm_authenticate(struct ksmbd_work *work)
return -EINVAL;
}
sess->enc = true;
- rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION)
+ rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
/*
* signing is disable if encryption is enable
* on this session
@@ -1599,7 +1600,8 @@ static int krb5_authenticate(struct ksmbd_work *work)
return -EINVAL;
}
sess->enc = true;
- rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
+ if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION)
+ rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE;
sess->sign = false;
}
@@ -1926,13 +1928,13 @@ int smb2_tree_connect(struct ksmbd_work *work)
if (conn->posix_ext_supported)
status.tree_conn->posix_extensions = true;
-out_err1:
rsp->StructureSize = cpu_to_le16(16);
+ inc_rfc1001_len(work->response_buf, 16);
+out_err1:
rsp->Capabilities = 0;
rsp->Reserved = 0;
/* default manual caching */
rsp->ShareFlags = SMB2_SHAREFLAG_MANUAL_CACHING;
- inc_rfc1001_len(work->response_buf, 16);
if (!IS_ERR(treename))
kfree(treename);
@@ -1965,6 +1967,9 @@ out_err1:
rsp->hdr.Status = STATUS_ACCESS_DENIED;
}
+ if (status.ret != KSMBD_TREE_CONN_STATUS_OK)
+ smb2_set_err_rsp(work);
+
return rc;
}
@@ -3438,7 +3443,7 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
goto free_conv_name;
}
- struct_sz = readdir_info_level_struct_sz(info_level) - 1 + conv_len;
+ struct_sz = readdir_info_level_struct_sz(info_level) + conv_len;
next_entry_offset = ALIGN(struct_sz, KSMBD_DIR_INFO_ALIGNMENT);
d_info->last_entry_off_align = next_entry_offset - struct_sz;
@@ -3690,7 +3695,7 @@ static int reserve_populate_dentry(struct ksmbd_dir_info *d_info,
return -EOPNOTSUPP;
conv_len = (d_info->name_len + 1) * 2;
- next_entry_offset = ALIGN(struct_sz - 1 + conv_len,
+ next_entry_offset = ALIGN(struct_sz + conv_len,
KSMBD_DIR_INFO_ALIGNMENT);
if (next_entry_offset > d_info->out_buf_len) {
@@ -6751,7 +6756,7 @@ static int smb2_set_flock_flags(struct file_lock *flock, int flags)
case SMB2_LOCKFLAG_UNLOCK:
ksmbd_debug(SMB, "received unlock request\n");
flock->fl_type = F_UNLCK;
- cmd = 0;
+ cmd = F_SETLK;
break;
}
@@ -6855,6 +6860,7 @@ int smb2_lock(struct ksmbd_work *work)
if (lock_start > U64_MAX - lock_length) {
pr_err("Invalid lock range requested\n");
rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE;
+ locks_free_lock(flock);
goto out;
}
@@ -6874,6 +6880,7 @@ int smb2_lock(struct ksmbd_work *work)
"the end offset(%llx) is smaller than the start offset(%llx)\n",
flock->fl_end, flock->fl_start);
rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE;
+ locks_free_lock(flock);
goto out;
}
@@ -6885,6 +6892,7 @@ int smb2_lock(struct ksmbd_work *work)
flock->fl_type != F_UNLCK) {
pr_err("conflict two locks in one request\n");
err = -EINVAL;
+ locks_free_lock(flock);
goto out;
}
}
@@ -6893,6 +6901,7 @@ int smb2_lock(struct ksmbd_work *work)
smb_lock = smb2_lock_init(flock, cmd, flags, &lock_list);
if (!smb_lock) {
err = -EINVAL;
+ locks_free_lock(flock);
goto out;
}
}
@@ -7129,7 +7138,7 @@ out:
rlock->fl_start = smb_lock->start;
rlock->fl_end = smb_lock->end;
- rc = vfs_lock_file(filp, 0, rlock, NULL);
+ rc = vfs_lock_file(filp, F_SETLK, rlock, NULL);
if (rc)
pr_err("rollback unlock fail : %d\n", rc);
diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h
index 092fdd3f8750..aa5dbe54f5a1 100644
--- a/fs/ksmbd/smb2pdu.h
+++ b/fs/ksmbd/smb2pdu.h
@@ -443,7 +443,7 @@ struct smb2_posix_info {
/* SidBuffer contain two sids (UNIX user sid(16), UNIX group sid(16)) */
u8 SidBuffer[32];
__le32 name_len;
- u8 name[1];
+ u8 name[];
/*
* var sized owner SID
* var sized group SID
diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c
index d96da872d70a..2a4fbbd55b91 100644
--- a/fs/ksmbd/smb_common.c
+++ b/fs/ksmbd/smb_common.c
@@ -623,7 +623,7 @@ int ksmbd_override_fsids(struct ksmbd_work *work)
if (share->force_gid != KSMBD_SHARE_INVALID_GID)
gid = share->force_gid;
- cred = prepare_kernel_cred(NULL);
+ cred = prepare_kernel_cred(&init_task);
if (!cred)
return -ENOMEM;
diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h
index 318c16fa81da..e663ab9ea759 100644
--- a/fs/ksmbd/smb_common.h
+++ b/fs/ksmbd/smb_common.h
@@ -277,14 +277,14 @@ struct file_directory_info {
__le64 AllocationSize;
__le32 ExtFileAttributes;
__le32 FileNameLength;
- char FileName[1];
+ char FileName[];
} __packed; /* level 0x101 FF resp data */
struct file_names_info {
__le32 NextEntryOffset;
__u32 FileIndex;
__le32 FileNameLength;
- char FileName[1];
+ char FileName[];
} __packed; /* level 0xc FF resp data */
struct file_full_directory_info {
@@ -299,7 +299,7 @@ struct file_full_directory_info {
__le32 ExtFileAttributes;
__le32 FileNameLength;
__le32 EaSize;
- char FileName[1];
+ char FileName[];
} __packed; /* level 0x102 FF resp */
struct file_both_directory_info {
@@ -317,7 +317,7 @@ struct file_both_directory_info {
__u8 ShortNameLength;
__u8 Reserved;
__u8 ShortName[24];
- char FileName[1];
+ char FileName[];
} __packed; /* level 0x104 FFrsp data */
struct file_id_both_directory_info {
@@ -337,7 +337,7 @@ struct file_id_both_directory_info {
__u8 ShortName[24];
__le16 Reserved2;
__le64 UniqueId;
- char FileName[1];
+ char FileName[];
} __packed;
struct file_id_full_dir_info {
@@ -354,7 +354,7 @@ struct file_id_full_dir_info {
__le32 EaSize; /* EA size */
__le32 Reserved;
__le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/
- char FileName[1];
+ char FileName[];
} __packed; /* level 0x105 FF rsp data */
struct smb_version_values {
diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c
index 63d55f543bd2..4c6bd0b69979 100644
--- a/fs/ksmbd/transport_tcp.c
+++ b/fs/ksmbd/transport_tcp.c
@@ -295,6 +295,7 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig,
struct msghdr ksmbd_msg;
struct kvec *iov;
struct ksmbd_conn *conn = KSMBD_TRANS(t)->conn;
+ int max_retry = 2;
iov = get_conn_iovec(t, nr_segs);
if (!iov)
@@ -321,9 +322,11 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig,
} else if (conn->status == KSMBD_SESS_NEED_RECONNECT) {
total_read = -EAGAIN;
break;
- } else if (length == -ERESTARTSYS || length == -EAGAIN) {
+ } else if ((length == -ERESTARTSYS || length == -EAGAIN) &&
+ max_retry) {
usleep_range(1000, 2000);
length = 0;
+ max_retry--;
continue;
} else if (length <= 0) {
total_read = -EAGAIN;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ea1ceffa1d3a..f7e4a88d5d92 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2957,12 +2957,14 @@ static u64 nfs_access_login_time(const struct task_struct *task,
const struct cred *cred)
{
const struct task_struct *parent;
+ const struct cred *pcred;
u64 ret;
rcu_read_lock();
for (;;) {
parent = rcu_dereference(task->real_parent);
- if (parent == task || cred_fscmp(parent->cred, cred) != 0)
+ pcred = rcu_dereference(parent->cred);
+ if (parent == task || cred_fscmp(pcred, cred) != 0)
break;
task = parent;
}
@@ -3023,6 +3025,7 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre
* but do it without locking.
*/
struct nfs_inode *nfsi = NFS_I(inode);
+ u64 login_time = nfs_access_login_time(current, cred);
struct nfs_access_entry *cache;
int err = -ECHILD;
struct list_head *lh;
@@ -3037,6 +3040,8 @@ static int nfs_access_get_cached_rcu(struct inode *inode, const struct cred *cre
cache = NULL;
if (cache == NULL)
goto out;
+ if ((s64)(login_time - cache->timestamp) > 0)
+ goto out;
if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS))
goto out;
*mask = cache->mask;
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index ad34a33b0737..4974cd18ca46 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -783,6 +783,12 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
return &fl->generic_hdr;
}
+static bool
+filelayout_lseg_is_striped(const struct nfs4_filelayout_segment *flseg)
+{
+ return flseg->num_fh > 1;
+}
+
/*
* filelayout_pg_test(). Called by nfs_can_coalesce_requests()
*
@@ -803,6 +809,8 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
size = pnfs_generic_pg_test(pgio, prev, req);
if (!size)
return 0;
+ else if (!filelayout_lseg_is_striped(FILELAYOUT_LSEG(pgio->pg_lseg)))
+ return size;
/* see if req and prev are in the same stripe */
if (prev) {
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 1ec79ccf89ad..7deb3cd76abe 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -493,10 +493,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
gid = make_kgid(&init_user_ns, id);
if (gfp_flags & __GFP_FS)
- kcred = prepare_kernel_cred(NULL);
+ kcred = prepare_kernel_cred(&init_task);
else {
unsigned int nofs_flags = memalloc_nofs_save();
- kcred = prepare_kernel_cred(NULL);
+ kcred = prepare_kernel_cred(&init_task);
memalloc_nofs_restore(nofs_flags);
}
rc = -ENOMEM;
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index e3fdd2f45b01..25a7c771cfd8 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -203,7 +203,7 @@ int nfs_idmap_init(void)
printk(KERN_NOTICE "NFS: Registering the %s key type\n",
key_type_id_resolver.name);
- cred = prepare_kernel_cred(NULL);
+ cred = prepare_kernel_cred(&init_task);
if (!cred)
return -ENOMEM;
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index 26f8ece2a997..0cbcd2dfa732 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -26,7 +26,7 @@ static void nfs_netns_object_release(struct kobject *kobj)
}
static const struct kobj_ns_type_operations *nfs_netns_object_child_ns_type(
- struct kobject *kobj)
+ const struct kobject *kobj)
{
return &net_ns_type_operations;
}
@@ -130,7 +130,7 @@ static void nfs_netns_client_release(struct kobject *kobj)
kfree(c);
}
-static const void *nfs_netns_client_namespace(struct kobject *kobj)
+static const void *nfs_netns_client_namespace(const struct kobject *kobj)
{
return container_of(kobj, struct nfs_netns_client, kobject)->net;
}
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 1998b4d5f692..c0950edb26b0 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -324,8 +324,7 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may)
if (key->gc)
__set_bit(NFSD_FILE_GC, &nf->nf_flags);
nf->nf_inode = key->inode;
- /* nf_ref is pre-incremented for hash table */
- refcount_set(&nf->nf_ref, 2);
+ refcount_set(&nf->nf_ref, 1);
nf->nf_may = key->need;
nf->nf_mark = NULL;
}
@@ -377,24 +376,35 @@ nfsd_file_unhash(struct nfsd_file *nf)
return false;
}
-static bool
+static void
nfsd_file_free(struct nfsd_file *nf)
{
s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
- bool flush = false;
trace_nfsd_file_free(nf);
this_cpu_inc(nfsd_file_releases);
this_cpu_add(nfsd_file_total_age, age);
+ nfsd_file_unhash(nf);
+
+ /*
+ * We call fsync here in order to catch writeback errors. It's not
+ * strictly required by the protocol, but an nfsd_file could get
+ * evicted from the cache before a COMMIT comes in. If another
+ * task were to open that file in the interim and scrape the error,
+ * then the client may never see it. By calling fsync here, we ensure
+ * that writeback happens before the entry is freed, and that any
+ * errors reported result in the write verifier changing.
+ */
+ nfsd_file_fsync(nf);
+
if (nf->nf_mark)
nfsd_file_mark_put(nf->nf_mark);
if (nf->nf_file) {
get_file(nf->nf_file);
filp_close(nf->nf_file, NULL);
fput(nf->nf_file);
- flush = true;
}
/*
@@ -402,10 +412,9 @@ nfsd_file_free(struct nfsd_file *nf)
* WARN and leak it to preserve system stability.
*/
if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
- return flush;
+ return;
call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
- return flush;
}
static bool
@@ -421,17 +430,23 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
}
-static void nfsd_file_lru_add(struct nfsd_file *nf)
+static bool nfsd_file_lru_add(struct nfsd_file *nf)
{
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (list_lru_add(&nfsd_file_lru, &nf->nf_lru))
+ if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
trace_nfsd_file_lru_add(nf);
+ return true;
+ }
+ return false;
}
-static void nfsd_file_lru_remove(struct nfsd_file *nf)
+static bool nfsd_file_lru_remove(struct nfsd_file *nf)
{
- if (list_lru_del(&nfsd_file_lru, &nf->nf_lru))
+ if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
trace_nfsd_file_lru_del(nf);
+ return true;
+ }
+ return false;
}
struct nfsd_file *
@@ -442,86 +457,60 @@ nfsd_file_get(struct nfsd_file *nf)
return NULL;
}
-static void
-nfsd_file_unhash_and_queue(struct nfsd_file *nf, struct list_head *dispose)
-{
- trace_nfsd_file_unhash_and_queue(nf);
- if (nfsd_file_unhash(nf)) {
- /* caller must call nfsd_file_dispose_list() later */
- nfsd_file_lru_remove(nf);
- list_add(&nf->nf_lru, dispose);
- }
-}
-
-static void
-nfsd_file_put_noref(struct nfsd_file *nf)
-{
- trace_nfsd_file_put(nf);
-
- if (refcount_dec_and_test(&nf->nf_ref)) {
- WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
- nfsd_file_lru_remove(nf);
- nfsd_file_free(nf);
- }
-}
-
-static void
-nfsd_file_unhash_and_put(struct nfsd_file *nf)
-{
- if (nfsd_file_unhash(nf))
- nfsd_file_put_noref(nf);
-}
-
+/**
+ * nfsd_file_put - put the reference to a nfsd_file
+ * @nf: nfsd_file of which to put the reference
+ *
+ * Put a reference to a nfsd_file. In the non-GC case, we just put the
+ * reference immediately. In the GC case, if the reference would be
+ * the last one, the put it on the LRU instead to be cleaned up later.
+ */
void
nfsd_file_put(struct nfsd_file *nf)
{
might_sleep();
+ trace_nfsd_file_put(nf);
- if (test_bit(NFSD_FILE_GC, &nf->nf_flags))
- nfsd_file_lru_add(nf);
- else if (refcount_read(&nf->nf_ref) == 2)
- nfsd_file_unhash_and_put(nf);
-
- if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- nfsd_file_fsync(nf);
- nfsd_file_put_noref(nf);
- } else if (nf->nf_file && test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
- nfsd_file_put_noref(nf);
- nfsd_file_schedule_laundrette();
- } else
- nfsd_file_put_noref(nf);
-}
-
-static void
-nfsd_file_dispose_list(struct list_head *dispose)
-{
- struct nfsd_file *nf;
+ if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
+ test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ /*
+ * If this is the last reference (nf_ref == 1), then try to
+ * transfer it to the LRU.
+ */
+ if (refcount_dec_not_one(&nf->nf_ref))
+ return;
+
+ /* Try to add it to the LRU. If that fails, decrement. */
+ if (nfsd_file_lru_add(nf)) {
+ /* If it's still hashed, we're done */
+ if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+ nfsd_file_schedule_laundrette();
+ return;
+ }
- while(!list_empty(dispose)) {
- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
- list_del_init(&nf->nf_lru);
- nfsd_file_fsync(nf);
- nfsd_file_put_noref(nf);
+ /*
+ * We're racing with unhashing, so try to remove it from
+ * the LRU. If removal fails, then someone else already
+ * has our reference.
+ */
+ if (!nfsd_file_lru_remove(nf))
+ return;
+ }
}
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
}
static void
-nfsd_file_dispose_list_sync(struct list_head *dispose)
+nfsd_file_dispose_list(struct list_head *dispose)
{
- bool flush = false;
struct nfsd_file *nf;
- while(!list_empty(dispose)) {
+ while (!list_empty(dispose)) {
nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
list_del_init(&nf->nf_lru);
- nfsd_file_fsync(nf);
- if (!refcount_dec_and_test(&nf->nf_ref))
- continue;
- if (nfsd_file_free(nf))
- flush = true;
+ nfsd_file_free(nf);
}
- if (flush)
- flush_delayed_fput();
}
static void
@@ -591,21 +580,8 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
struct list_head *head = arg;
struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
- /*
- * Do a lockless refcount check. The hashtable holds one reference, so
- * we look to see if anything else has a reference, or if any have
- * been put since the shrinker last ran. Those don't get unhashed and
- * released.
- *
- * Note that in the put path, we set the flag and then decrement the
- * counter. Here we check the counter and then test and clear the flag.
- * That order is deliberate to ensure that we can do this locklessly.
- */
- if (refcount_read(&nf->nf_ref) > 1) {
- list_lru_isolate(lru, &nf->nf_lru);
- trace_nfsd_file_gc_in_use(nf);
- return LRU_REMOVED;
- }
+ /* We should only be dealing with GC entries here */
+ WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
/*
* Don't throw out files that are still undergoing I/O or
@@ -616,40 +592,30 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
return LRU_SKIP;
}
+ /* If it was recently added to the list, skip it */
if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
trace_nfsd_file_gc_referenced(nf);
return LRU_ROTATE;
}
- if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- trace_nfsd_file_gc_hashed(nf);
- return LRU_SKIP;
+ /*
+ * Put the reference held on behalf of the LRU. If it wasn't the last
+ * one, then just remove it from the LRU and ignore it.
+ */
+ if (!refcount_dec_and_test(&nf->nf_ref)) {
+ trace_nfsd_file_gc_in_use(nf);
+ list_lru_isolate(lru, &nf->nf_lru);
+ return LRU_REMOVED;
}
+ /* Refcount went to zero. Unhash it and queue it to the dispose list */
+ nfsd_file_unhash(nf);
list_lru_isolate_move(lru, &nf->nf_lru, head);
this_cpu_inc(nfsd_file_evictions);
trace_nfsd_file_gc_disposed(nf);
return LRU_REMOVED;
}
-/*
- * Unhash items on @dispose immediately, then queue them on the
- * disposal workqueue to finish releasing them in the background.
- *
- * cel: Note that between the time list_lru_shrink_walk runs and
- * now, these items are in the hash table but marked unhashed.
- * Why release these outside of lru_cb ? There's no lock ordering
- * problem since lru_cb currently takes no lock.
- */
-static void nfsd_file_gc_dispose_list(struct list_head *dispose)
-{
- struct nfsd_file *nf;
-
- list_for_each_entry(nf, dispose, nf_lru)
- nfsd_file_hash_remove(nf);
- nfsd_file_dispose_list_delayed(dispose);
-}
-
static void
nfsd_file_gc(void)
{
@@ -659,7 +625,7 @@ nfsd_file_gc(void)
ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
&dispose, list_lru_count(&nfsd_file_lru));
trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
- nfsd_file_gc_dispose_list(&dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
}
static void
@@ -685,7 +651,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
nfsd_file_lru_cb, &dispose);
trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
- nfsd_file_gc_dispose_list(&dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
return ret;
}
@@ -695,18 +661,62 @@ static struct shrinker nfsd_file_shrinker = {
.seeks = 1,
};
-/*
- * Find all cache items across all net namespaces that match @inode and
- * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire().
+/**
+ * nfsd_file_cond_queue - conditionally unhash and queue a nfsd_file
+ * @nf: nfsd_file to attempt to queue
+ * @dispose: private list to queue successfully-put objects
+ *
+ * Unhash an nfsd_file, try to get a reference to it, and then put that
+ * reference. If it's the last reference, queue it to the dispose list.
+ */
+static void
+nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose)
+ __must_hold(RCU)
+{
+ int decrement = 1;
+
+ /* If we raced with someone else unhashing, ignore it */
+ if (!nfsd_file_unhash(nf))
+ return;
+
+ /* If we can't get a reference, ignore it */
+ if (!nfsd_file_get(nf))
+ return;
+
+ /* Extra decrement if we remove from the LRU */
+ if (nfsd_file_lru_remove(nf))
+ ++decrement;
+
+ /* If refcount goes to 0, then put on the dispose list */
+ if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
+ list_add(&nf->nf_lru, dispose);
+ trace_nfsd_file_closing(nf);
+ }
+}
+
+/**
+ * nfsd_file_queue_for_close: try to close out any open nfsd_files for an inode
+ * @inode: inode on which to close out nfsd_files
+ * @dispose: list on which to gather nfsd_files to close out
+ *
+ * An nfsd_file represents a struct file being held open on behalf of nfsd. An
+ * open file however can block other activity (such as leases), or cause
+ * undesirable behavior (e.g. spurious silly-renames when reexporting NFS).
+ *
+ * This function is intended to find open nfsd_files when this sort of
+ * conflicting access occurs and then attempt to close those files out.
+ *
+ * Populates the dispose list with entries that have already had their
+ * refcounts go to zero. The actual free of an nfsd_file can be expensive,
+ * so we leave it up to the caller whether it wants to wait or not.
*/
-static unsigned int
-__nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
+static void
+nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose)
{
struct nfsd_file_lookup_key key = {
.type = NFSD_FILE_KEY_INODE,
.inode = inode,
};
- unsigned int count = 0;
struct nfsd_file *nf;
rcu_read_lock();
@@ -715,52 +725,61 @@ __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose)
nfsd_file_rhash_params);
if (!nf)
break;
- nfsd_file_unhash_and_queue(nf, dispose);
- count++;
+ nfsd_file_cond_queue(nf, dispose);
} while (1);
rcu_read_unlock();
- return count;
}
/**
- * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Unhash and put, then flush and fput all cache items associated with @inode.
+ * Close out any open nfsd_files that can be reaped for @inode. The
+ * actual freeing is deferred to the dispose_list_delayed infrastructure.
+ *
+ * This is used by the fsnotify callbacks and setlease notifier.
*/
-void
-nfsd_file_close_inode_sync(struct inode *inode)
+static void
+nfsd_file_close_inode(struct inode *inode)
{
LIST_HEAD(dispose);
- unsigned int count;
- count = __nfsd_file_close_inode(inode, &dispose);
- trace_nfsd_file_close_inode_sync(inode, count);
- nfsd_file_dispose_list_sync(&dispose);
+ nfsd_file_queue_for_close(inode, &dispose);
+ nfsd_file_dispose_list_delayed(&dispose);
}
/**
- * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
+ * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
* @inode: inode of the file to attempt to remove
*
- * Unhash and put all cache item associated with @inode.
+ * Close out any open nfsd_files that can be reaped for @inode. The
+ * nfsd_files are closed out synchronously.
+ *
+ * This is called from nfsd_rename and nfsd_unlink to avoid silly-renames
+ * when reexporting NFS.
*/
-static void
-nfsd_file_close_inode(struct inode *inode)
+void
+nfsd_file_close_inode_sync(struct inode *inode)
{
+ struct nfsd_file *nf;
LIST_HEAD(dispose);
- unsigned int count;
- count = __nfsd_file_close_inode(inode, &dispose);
- trace_nfsd_file_close_inode(inode, count);
- nfsd_file_dispose_list_delayed(&dispose);
+ trace_nfsd_file_close(inode);
+
+ nfsd_file_queue_for_close(inode, &dispose);
+ while (!list_empty(&dispose)) {
+ nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
+ list_del_init(&nf->nf_lru);
+ nfsd_file_free(nf);
+ }
+ flush_delayed_fput();
}
/**
* nfsd_file_delayed_close - close unused nfsd_files
* @work: dummy
*
- * Walk the LRU list and close any entries that have not been used since
+ * Walk the LRU list and destroy any entries that have not been used since
* the last scan.
*/
static void
@@ -782,7 +801,7 @@ nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
/* Only close files for F_SETLEASE leases */
if (fl->fl_flags & FL_LEASE)
- nfsd_file_close_inode_sync(file_inode(fl->fl_file));
+ nfsd_file_close_inode(file_inode(fl->fl_file));
return 0;
}
@@ -903,6 +922,13 @@ out_err:
goto out;
}
+/**
+ * __nfsd_file_cache_purge: clean out the cache for shutdown
+ * @net: net-namespace to shut down the cache (may be NULL)
+ *
+ * Walk the nfsd_file cache and close out any that match @net. If @net is NULL,
+ * then close out everything. Called when an nfsd instance is being shut down.
+ */
static void
__nfsd_file_cache_purge(struct net *net)
{
@@ -917,7 +943,7 @@ __nfsd_file_cache_purge(struct net *net)
nf = rhashtable_walk_next(&iter);
while (!IS_ERR_OR_NULL(nf)) {
if (!net || nf->nf_net == net)
- nfsd_file_unhash_and_queue(nf, &dispose);
+ nfsd_file_cond_queue(nf, &dispose);
nf = rhashtable_walk_next(&iter);
}
@@ -1056,8 +1082,8 @@ nfsd_file_is_cached(struct inode *inode)
static __be32
nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
- unsigned int may_flags, struct nfsd_file **pnf,
- bool open, bool want_gc)
+ unsigned int may_flags, struct file *file,
+ struct nfsd_file **pnf, bool want_gc)
{
struct nfsd_file_lookup_key key = {
.type = NFSD_FILE_KEY_FULL,
@@ -1084,8 +1110,12 @@ retry:
if (nf)
nf = nfsd_file_get(nf);
rcu_read_unlock();
- if (nf)
+
+ if (nf) {
+ if (nfsd_file_lru_remove(nf))
+ WARN_ON_ONCE(refcount_dec_and_test(&nf->nf_ref));
goto wait_for_construction;
+ }
nf = nfsd_file_alloc(&key, may_flags);
if (!nf) {
@@ -1118,48 +1148,53 @@ wait_for_construction:
goto out;
}
open_retry = false;
- nfsd_file_put_noref(nf);
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
goto retry;
}
- nfsd_file_lru_remove(nf);
this_cpu_inc(nfsd_file_cache_hits);
status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
out:
if (status == nfs_ok) {
- if (open)
- this_cpu_inc(nfsd_file_acquisitions);
+ this_cpu_inc(nfsd_file_acquisitions);
*pnf = nf;
} else {
- nfsd_file_put(nf);
+ if (refcount_dec_and_test(&nf->nf_ref))
+ nfsd_file_free(nf);
nf = NULL;
}
out_status:
put_cred(key.cred);
- if (open)
- trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status);
+ trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status);
return status;
open_file:
trace_nfsd_file_alloc(nf);
nf->nf_mark = nfsd_file_mark_find_or_create(nf, key.inode);
if (nf->nf_mark) {
- if (open) {
+ if (file) {
+ get_file(file);
+ nf->nf_file = file;
+ status = nfs_ok;
+ trace_nfsd_file_opened(nf, status);
+ } else {
status = nfsd_open_verified(rqstp, fhp, may_flags,
&nf->nf_file);
trace_nfsd_file_open(nf, status);
- } else
- status = nfs_ok;
+ }
} else
status = nfserr_jukebox;
/*
* If construction failed, or we raced with a call to unlink()
* then unhash.
*/
- if (status != nfs_ok || key.inode->i_nlink == 0)
- nfsd_file_unhash_and_put(nf);
+ if (status == nfs_ok && key.inode->i_nlink == 0)
+ status = nfserr_jukebox;
+ if (status != nfs_ok)
+ nfsd_file_unhash(nf);
clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
smp_mb__after_atomic();
wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
@@ -1185,7 +1220,7 @@ __be32
nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
- return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, true);
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, NULL, pnf, true);
}
/**
@@ -1206,28 +1241,30 @@ __be32
nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **pnf)
{
- return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true, false);
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, NULL, pnf, false);
}
/**
- * nfsd_file_create - Get a struct nfsd_file, do not open
+ * nfsd_file_acquire_opened - Get a struct nfsd_file using existing open file
* @rqstp: the RPC transaction being executed
* @fhp: the NFS filehandle of the file just created
* @may_flags: NFSD_MAY_ settings for the file
+ * @file: cached, already-open file (may be NULL)
* @pnf: OUT: new or found "struct nfsd_file" object
*
- * The nfsd_file_object returned by this API is reference-counted
- * but not garbage-collected. The object is released immediately
- * one RCU grace period after the final nfsd_file_put().
+ * Acquire a nfsd_file object that is not GC'ed. If one doesn't already exist,
+ * and @file is non-NULL, use it to instantiate a new nfsd_file instead of
+ * opening a new one.
*
* Returns nfs_ok and sets @pnf on success; otherwise an nfsstat in
* network byte order is returned.
*/
__be32
-nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
- unsigned int may_flags, struct nfsd_file **pnf)
+nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct file *file,
+ struct nfsd_file **pnf)
{
- return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false, false);
+ return nfsd_file_do_acquire(rqstp, fhp, may_flags, file, pnf, false);
}
/*
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index b7efb2c3ddb1..41516a4263ea 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -60,7 +60,8 @@ __be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **nfp);
__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int may_flags, struct nfsd_file **nfp);
-__be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
- unsigned int may_flags, struct nfsd_file **nfp);
+__be32 nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct file *file,
+ struct nfsd_file **nfp);
int nfsd_file_cache_stats_show(struct seq_file *m, void *v);
#endif /* _FS_NFSD_FILECACHE_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 8c854ba3285b..51a4b7885cae 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -195,7 +195,7 @@ struct nfsd_net {
atomic_t nfsd_courtesy_clients;
struct shrinker nfsd_client_shrinker;
- struct delayed_work nfsd_shrinker_work;
+ struct work_struct nfsd_shrinker_work;
};
/* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 1b57f2c2f0bb..2a815f5a52c4 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -942,7 +942,7 @@ static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct r
} else {
struct cred *kcred;
- kcred = prepare_kernel_cred(NULL);
+ kcred = prepare_kernel_cred(&init_task);
if (!kcred)
return NULL;
@@ -988,7 +988,6 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
} else {
if (!conn->cb_xprt)
return -EINVAL;
- clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
clp->cl_cb_session = ses;
args.bc_xprt = conn->cb_xprt;
args.prognumber = clp->cl_cb_session->se_cb_prog;
@@ -1008,6 +1007,9 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
rpc_shutdown_client(client);
return -ENOMEM;
}
+
+ if (clp->cl_minorversion != 0)
+ clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
clp->cl_cb_client = client;
clp->cl_cb_cred = cred;
rcu_read_lock();
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 73ed32ad23a2..f189ba7995f5 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -937,7 +937,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* the client wants us to do more in this compound:
*/
if (!nfsd4_last_compound_op(rqstp))
- __clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
+ clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
/* check stateid */
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
@@ -1318,6 +1318,7 @@ try_again:
/* allow 20secs for mount/unmount for now - revisit */
if (signal_pending(current) ||
(schedule_timeout(20*HZ) == 0)) {
+ finish_wait(&nn->nfsd_ssc_waitq, &wait);
kfree(work);
return nfserr_eagain;
}
@@ -1461,13 +1462,6 @@ out_err:
return status;
}
-static void
-nfsd4_interssc_disconnect(struct vfsmount *ss_mnt)
-{
- nfs_do_sb_deactive(ss_mnt->mnt_sb);
- mntput(ss_mnt);
-}
-
/*
* Verify COPY destination stateid.
*
@@ -1570,11 +1564,6 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct file *filp,
{
}
-static void
-nfsd4_interssc_disconnect(struct vfsmount *ss_mnt)
-{
-}
-
static struct file *nfs42_ssc_open(struct vfsmount *ss_mnt,
struct nfs_fh *src_fh,
nfs4_stateid *stateid)
@@ -1770,7 +1759,7 @@ static int nfsd4_do_async_copy(void *data)
default:
nfserr = nfserr_offload_denied;
}
- nfsd4_interssc_disconnect(copy->ss_mnt);
+ /* ss_mnt will be unmounted by the laundromat */
goto do_callback;
}
nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file,
@@ -1851,8 +1840,10 @@ out_err:
if (async_copy)
cleanup_async_copy(async_copy);
status = nfserrno(-ENOMEM);
- if (nfsd4_ssc_is_inter(copy))
- nfsd4_interssc_disconnect(copy->ss_mnt);
+ /*
+ * source's vfsmount of inter-copy will be unmounted
+ * by the laundromat
+ */
goto out;
}
@@ -2617,12 +2608,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
cstate->minorversion = args->minorversion;
fh_init(current_fh, NFS4_FHSIZE);
fh_init(save_fh, NFS4_FHSIZE);
-
/*
* Don't use the deferral mechanism for NFSv4; compounds make it
* too hard to avoid non-idempotency problems.
*/
- __clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+ clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
/*
* According to RFC3010, this takes precedence over all other errors.
@@ -2744,7 +2734,7 @@ encode_op:
out:
cstate->status = status;
/* Reset deferral mechanism for RPC deferrals */
- __set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
+ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags);
return rpc_success;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 7b2ee535ade8..4ef529379065 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4411,7 +4411,7 @@ nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
if (!count)
count = atomic_long_read(&num_delegations);
if (count)
- mod_delayed_work(laundry_wq, &nn->nfsd_shrinker_work, 0);
+ queue_work(laundry_wq, &nn->nfsd_shrinker_work);
return (unsigned long)count;
}
@@ -4421,7 +4421,7 @@ nfsd4_state_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
return SHRINK_STOP;
}
-int
+void
nfsd4_init_leases_net(struct nfsd_net *nn)
{
struct sysinfo si;
@@ -4443,16 +4443,6 @@ nfsd4_init_leases_net(struct nfsd_net *nn)
nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB);
atomic_set(&nn->nfsd_courtesy_clients, 0);
- nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan;
- nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count;
- nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
- return register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client");
-}
-
-void
-nfsd4_leases_net_shutdown(struct nfsd_net *nn)
-{
- unregister_shrinker(&nn->nfsd_client_shrinker);
}
static void init_nfs4_replay(struct nfs4_replay *rp)
@@ -5262,18 +5252,10 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
if (!fp->fi_fds[oflag]) {
spin_unlock(&fp->fi_lock);
- if (!open->op_filp) {
- status = nfsd_file_acquire(rqstp, cur_fh, access, &nf);
- if (status != nfs_ok)
- goto out_put_access;
- } else {
- status = nfsd_file_create(rqstp, cur_fh, access, &nf);
- if (status != nfs_ok)
- goto out_put_access;
- nf->nf_file = open->op_filp;
- open->op_filp = NULL;
- trace_nfsd_file_create(rqstp, access, nf);
- }
+ status = nfsd_file_acquire_opened(rqstp, cur_fh, access,
+ open->op_filp, &nf);
+ if (status != nfs_ok)
+ goto out_put_access;
spin_lock(&fp->fi_lock);
if (!fp->fi_fds[oflag]) {
@@ -6243,8 +6225,7 @@ deleg_reaper(struct nfsd_net *nn)
static void
nfsd4_state_shrinker_worker(struct work_struct *work)
{
- struct delayed_work *dwork = to_delayed_work(work);
- struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
+ struct nfsd_net *nn = container_of(work, struct nfsd_net,
nfsd_shrinker_work);
courtesy_client_reaper(nn);
@@ -8074,11 +8055,20 @@ static int nfs4_state_create_net(struct net *net)
INIT_LIST_HEAD(&nn->blocked_locks_lru);
INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
- INIT_DELAYED_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker);
+ INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker);
get_net(net);
+ nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan;
+ nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count;
+ nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
+
+ if (register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client"))
+ goto err_shrinker;
return 0;
+err_shrinker:
+ put_net(net);
+ kfree(nn->sessionid_hashtbl);
err_sessionid:
kfree(nn->unconf_id_hashtbl);
err_unconf_id:
@@ -8171,6 +8161,8 @@ nfs4_state_shutdown_net(struct net *net)
struct list_head *pos, *next, reaplist;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ unregister_shrinker(&nn->nfsd_client_shrinker);
+ cancel_work(&nn->nfsd_shrinker_work);
cancel_delayed_work_sync(&nn->laundromat_work);
locks_end_grace(&nn->nfsd4_manager);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 2b4ae858c89b..97edb32be77f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2523,7 +2523,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
- __clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
+ clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
return true;
}
@@ -3629,6 +3629,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
case nfserr_noent:
xdr_truncate_encode(xdr, start_offset);
goto skip_entry;
+ case nfserr_jukebox:
+ /*
+ * The pseudoroot should only display dentries that lead to
+ * exports. If we get EJUKEBOX here, then we can't tell whether
+ * this entry should be included. Just fail the whole READDIR
+ * with NFS4ERR_DELAY in that case, and hope that the situation
+ * will resolve itself by the client's next attempt.
+ */
+ if (cd->rd_fhp->fh_export->ex_flags & NFSEXP_V4ROOT)
+ goto fail;
+ fallthrough;
default:
/*
* If the client requested the RDATTR_ERROR attribute,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index d1e581a60480..c2577ee7ffb2 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1457,9 +1457,7 @@ static __net_init int nfsd_init_net(struct net *net)
goto out_idmap_error;
nn->nfsd_versions = NULL;
nn->nfsd4_minorversions = NULL;
- retval = nfsd4_init_leases_net(nn);
- if (retval)
- goto out_drc_error;
+ nfsd4_init_leases_net(nn);
retval = nfsd_reply_cache_init(nn);
if (retval)
goto out_cache_error;
@@ -1469,8 +1467,6 @@ static __net_init int nfsd_init_net(struct net *net)
return 0;
out_cache_error:
- nfsd4_leases_net_shutdown(nn);
-out_drc_error:
nfsd_idmap_shutdown(net);
out_idmap_error:
nfsd_export_shutdown(net);
@@ -1486,7 +1482,6 @@ static __net_exit void nfsd_exit_net(struct net *net)
nfsd_idmap_shutdown(net);
nfsd_export_shutdown(net);
nfsd_netns_free_versions(net_generic(net, nfsd_net_id));
- nfsd4_leases_net_shutdown(nn);
}
static struct pernet_operations nfsd_net_ops = {
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 93b42ef9ed91..fa0144a74267 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -504,8 +504,7 @@ extern void unregister_cld_notifier(void);
extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn);
#endif
-extern int nfsd4_init_leases_net(struct nfsd_net *nn);
-extern void nfsd4_leases_net_shutdown(struct nfsd_net *nn);
+extern void nfsd4_init_leases_net(struct nfsd_net *nn);
#else /* CONFIG_NFSD_V4 */
static inline int nfsd4_is_junction(struct dentry *dentry)
@@ -513,8 +512,7 @@ static inline int nfsd4_is_junction(struct dentry *dentry)
return 0;
}
-static inline int nfsd4_init_leases_net(struct nfsd_net *nn) { return 0; };
-static inline void nfsd4_leases_net_shutdown(struct nfsd_net *nn) {};
+static inline void nfsd4_init_leases_net(struct nfsd_net *nn) { };
#define register_cld_notifier() 0
#define unregister_cld_notifier() do { } while(0)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index a5570cf75f3f..9744443c3965 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -211,7 +211,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
if (resp->status == nfs_ok)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
- __set_bit(RQ_DROPME, &rqstp->rq_flags);
+ set_bit(RQ_DROPME, &rqstp->rq_flags);
return rpc_success;
}
@@ -246,7 +246,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
if (resp->status == nfs_ok)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
- __set_bit(RQ_DROPME, &rqstp->rq_flags);
+ set_bit(RQ_DROPME, &rqstp->rq_flags);
return rpc_success;
}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 56fba1cba3af..325d3d3f1211 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -453,8 +453,8 @@ static void nfsd_shutdown_net(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- nfsd_file_cache_shutdown_net(net);
nfs4_state_shutdown_net(net);
+ nfsd_file_cache_shutdown_net(net);
if (nn->lockd_up) {
lockd_down(net);
nn->lockd_up = false;
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 46b8f68a2497..8f9c82d9e075 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -876,8 +876,8 @@ DEFINE_CLID_EVENT(confirmed_r);
__print_flags(val, "|", \
{ 1 << NFSD_FILE_HASHED, "HASHED" }, \
{ 1 << NFSD_FILE_PENDING, "PENDING" }, \
- { 1 << NFSD_FILE_REFERENCED, "REFERENCED"}, \
- { 1 << NFSD_FILE_GC, "GC"})
+ { 1 << NFSD_FILE_REFERENCED, "REFERENCED" }, \
+ { 1 << NFSD_FILE_GC, "GC" })
DECLARE_EVENT_CLASS(nfsd_file_class,
TP_PROTO(struct nfsd_file *nf),
@@ -912,6 +912,7 @@ DEFINE_EVENT(nfsd_file_class, name, \
DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
+DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
TRACE_EVENT(nfsd_file_alloc,
@@ -980,43 +981,6 @@ TRACE_EVENT(nfsd_file_acquire,
)
);
-TRACE_EVENT(nfsd_file_create,
- TP_PROTO(
- const struct svc_rqst *rqstp,
- unsigned int may_flags,
- const struct nfsd_file *nf
- ),
-
- TP_ARGS(rqstp, may_flags, nf),
-
- TP_STRUCT__entry(
- __field(const void *, nf_inode)
- __field(const void *, nf_file)
- __field(unsigned long, may_flags)
- __field(unsigned long, nf_flags)
- __field(unsigned long, nf_may)
- __field(unsigned int, nf_ref)
- __field(u32, xid)
- ),
-
- TP_fast_assign(
- __entry->nf_inode = nf->nf_inode;
- __entry->nf_file = nf->nf_file;
- __entry->may_flags = may_flags;
- __entry->nf_flags = nf->nf_flags;
- __entry->nf_may = nf->nf_may;
- __entry->nf_ref = refcount_read(&nf->nf_ref);
- __entry->xid = be32_to_cpu(rqstp->rq_xid);
- ),
-
- TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p",
- __entry->xid, __entry->nf_inode,
- show_nfsd_may_flags(__entry->may_flags),
- __entry->nf_ref, show_nf_flags(__entry->nf_flags),
- show_nfsd_may_flags(__entry->nf_may), __entry->nf_file
- )
-);
-
TRACE_EVENT(nfsd_file_insert_err,
TP_PROTO(
const struct svc_rqst *rqstp,
@@ -1078,8 +1042,8 @@ TRACE_EVENT(nfsd_file_cons_err,
)
);
-TRACE_EVENT(nfsd_file_open,
- TP_PROTO(struct nfsd_file *nf, __be32 status),
+DECLARE_EVENT_CLASS(nfsd_file_open_class,
+ TP_PROTO(const struct nfsd_file *nf, __be32 status),
TP_ARGS(nf, status),
TP_STRUCT__entry(
__field(void *, nf_inode) /* cannot be dereferenced */
@@ -1103,34 +1067,16 @@ TRACE_EVENT(nfsd_file_open,
__entry->nf_file)
)
-DECLARE_EVENT_CLASS(nfsd_file_search_class,
- TP_PROTO(
- const struct inode *inode,
- unsigned int count
- ),
- TP_ARGS(inode, count),
- TP_STRUCT__entry(
- __field(const struct inode *, inode)
- __field(unsigned int, count)
- ),
- TP_fast_assign(
- __entry->inode = inode;
- __entry->count = count;
- ),
- TP_printk("inode=%p count=%u",
- __entry->inode, __entry->count)
-);
-
-#define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \
-DEFINE_EVENT(nfsd_file_search_class, name, \
+#define DEFINE_NFSD_FILE_OPEN_EVENT(name) \
+DEFINE_EVENT(nfsd_file_open_class, name, \
TP_PROTO( \
- const struct inode *inode, \
- unsigned int count \
+ const struct nfsd_file *nf, \
+ __be32 status \
), \
- TP_ARGS(inode, count))
+ TP_ARGS(nf, status))
-DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync);
-DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode);
+DEFINE_NFSD_FILE_OPEN_EVENT(nfsd_file_open);
+DEFINE_NFSD_FILE_OPEN_EVENT(nfsd_file_opened);
TRACE_EVENT(nfsd_file_is_cached,
TP_PROTO(
@@ -1209,7 +1155,6 @@ DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced);
-DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_hashed);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_disposed);
DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class,
@@ -1241,6 +1186,22 @@ DEFINE_EVENT(nfsd_file_lruwalk_class, name, \
DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed);
DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed);
+TRACE_EVENT(nfsd_file_close,
+ TP_PROTO(
+ const struct inode *inode
+ ),
+ TP_ARGS(inode),
+ TP_STRUCT__entry(
+ __field(const void *, inode)
+ ),
+ TP_fast_assign(
+ __entry->inode = inode;
+ ),
+ TP_printk("inode=%p",
+ __entry->inode
+ )
+);
+
TRACE_EVENT(nfsd_file_fsync,
TP_PROTO(
const struct nfsd_file *nf,
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index b9d15c3df3cc..40ce92a332fe 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -480,9 +480,18 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, &bh,
&submit_ptr);
if (ret) {
- if (ret != -EEXIST)
- return ret;
- goto out_check;
+ if (likely(ret == -EEXIST))
+ goto out_check;
+ if (ret == -ENOENT) {
+ /*
+ * Block address translation failed due to invalid
+ * value of 'ptr'. In this case, return internal code
+ * -EINVAL (broken bmap) to notify bmap layer of fatal
+ * metadata corruption.
+ */
+ ret = -EINVAL;
+ }
+ return ret;
}
if (ra) {
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 3335ef352915..76c3bd88b858 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2752,7 +2752,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
down_write(&nilfs->ns_segctor_sem);
- del_timer_sync(&sci->sc_timer);
+ timer_shutdown_sync(&sci->sc_timer);
kfree(sci);
}
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index 71f870d497ae..5e6bafb10f42 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -55,33 +55,6 @@ static inline u64 get_pre_allocated(u64 size)
}
/*
- * attr_must_be_resident
- *
- * Return: True if attribute must be resident.
- */
-static inline bool attr_must_be_resident(struct ntfs_sb_info *sbi,
- enum ATTR_TYPE type)
-{
- const struct ATTR_DEF_ENTRY *de;
-
- switch (type) {
- case ATTR_STD:
- case ATTR_NAME:
- case ATTR_ID:
- case ATTR_LABEL:
- case ATTR_VOL_INFO:
- case ATTR_ROOT:
- case ATTR_EA_INFO:
- return true;
- default:
- de = ntfs_query_def(sbi, type);
- if (de && (de->flags & NTFS_ATTR_MUST_BE_RESIDENT))
- return true;
- return false;
- }
-}
-
-/*
* attr_load_runs - Load all runs stored in @attr.
*/
static int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni,
@@ -101,6 +74,10 @@ static int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni,
asize = le32_to_cpu(attr->size);
run_off = le16_to_cpu(attr->nres.run_off);
+
+ if (run_off > asize)
+ return -EINVAL;
+
err = run_unpack_ex(run, ni->mi.sbi, ni->mi.rno, svcn, evcn,
vcn ? *vcn : svcn, Add2Ptr(attr, run_off),
asize - run_off);
@@ -172,7 +149,7 @@ out:
int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
CLST vcn, CLST lcn, CLST len, CLST *pre_alloc,
enum ALLOCATE_OPT opt, CLST *alen, const size_t fr,
- CLST *new_lcn)
+ CLST *new_lcn, CLST *new_len)
{
int err;
CLST flen, vcn0 = vcn, pre = pre_alloc ? *pre_alloc : 0;
@@ -192,20 +169,36 @@ int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
if (err)
goto out;
- if (new_lcn && vcn == vcn0)
- *new_lcn = lcn;
+ if (vcn == vcn0) {
+ /* Return the first fragment. */
+ if (new_lcn)
+ *new_lcn = lcn;
+ if (new_len)
+ *new_len = flen;
+ }
/* Add new fragment into run storage. */
- if (!run_add_entry(run, vcn, lcn, flen, opt == ALLOCATE_MFT)) {
+ if (!run_add_entry(run, vcn, lcn, flen, opt & ALLOCATE_MFT)) {
/* Undo last 'ntfs_look_for_free_space' */
mark_as_free_ex(sbi, lcn, len, false);
err = -ENOMEM;
goto out;
}
+ if (opt & ALLOCATE_ZERO) {
+ u8 shift = sbi->cluster_bits - SECTOR_SHIFT;
+
+ err = blkdev_issue_zeroout(sbi->sb->s_bdev,
+ (sector_t)lcn << shift,
+ (sector_t)flen << shift,
+ GFP_NOFS, 0);
+ if (err)
+ goto out;
+ }
+
vcn += flen;
- if (flen >= len || opt == ALLOCATE_MFT ||
+ if (flen >= len || (opt & ALLOCATE_MFT) ||
(fr && run->count - cnt >= fr)) {
*alen = vcn - vcn0;
return 0;
@@ -280,7 +273,8 @@ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr,
const char *data = resident_data(attr);
err = attr_allocate_clusters(sbi, run, 0, 0, len, NULL,
- ALLOCATE_DEF, &alen, 0, NULL);
+ ALLOCATE_DEF, &alen, 0, NULL,
+ NULL);
if (err)
goto out1;
@@ -420,6 +414,7 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type,
CLST alen, vcn, lcn, new_alen, old_alen, svcn, evcn;
CLST next_svcn, pre_alloc = -1, done = 0;
bool is_ext, is_bad = false;
+ bool dirty = false;
u32 align;
struct MFT_REC *rec;
@@ -440,8 +435,10 @@ again:
return err;
/* Return if file is still resident. */
- if (!attr_b->non_res)
+ if (!attr_b->non_res) {
+ dirty = true;
goto ok1;
+ }
/* Layout of records may be changed, so do a full search. */
goto again;
@@ -464,7 +461,7 @@ again_1:
if (keep_prealloc && new_size < old_size) {
attr_b->nres.data_size = cpu_to_le64(new_size);
- mi_b->dirty = true;
+ mi_b->dirty = dirty = true;
goto ok;
}
@@ -510,7 +507,7 @@ next_le:
if (new_alloc <= old_alloc) {
attr_b->nres.data_size = cpu_to_le64(new_size);
- mi_b->dirty = true;
+ mi_b->dirty = dirty = true;
goto ok;
}
@@ -575,13 +572,13 @@ add_alloc_in_same_attr_seg:
/* ~3 bytes per fragment. */
err = attr_allocate_clusters(
sbi, run, vcn, lcn, to_allocate, &pre_alloc,
- is_mft ? ALLOCATE_MFT : 0, &alen,
+ is_mft ? ALLOCATE_MFT : ALLOCATE_DEF, &alen,
is_mft ? 0
: (sbi->record_size -
le32_to_cpu(rec->used) + 8) /
3 +
1,
- NULL);
+ NULL, NULL);
if (err)
goto out;
}
@@ -601,7 +598,7 @@ pack_runs:
next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
new_alloc_tmp = (u64)next_svcn << cluster_bits;
attr_b->nres.alloc_size = cpu_to_le64(new_alloc_tmp);
- mi_b->dirty = true;
+ mi_b->dirty = dirty = true;
if (next_svcn >= vcn && !to_allocate) {
/* Normal way. Update attribute and exit. */
@@ -687,7 +684,7 @@ pack_runs:
old_valid = old_size = old_alloc = (u64)vcn << cluster_bits;
attr_b->nres.valid_size = attr_b->nres.data_size =
attr_b->nres.alloc_size = cpu_to_le64(old_size);
- mi_b->dirty = true;
+ mi_b->dirty = dirty = true;
goto again_1;
}
@@ -749,7 +746,7 @@ pack_runs:
attr_b->nres.valid_size =
attr_b->nres.alloc_size;
}
- mi_b->dirty = true;
+ mi_b->dirty = dirty = true;
err = run_deallocate_ex(sbi, run, vcn, evcn - vcn + 1, &dlen,
true);
@@ -810,16 +807,9 @@ ok1:
if (ret)
*ret = attr_b;
- /* Update inode_set_bytes. */
if (((type == ATTR_DATA && !name_len) ||
(type == ATTR_ALLOC && name == I30_NAME))) {
- bool dirty = false;
-
- if (ni->vfs_inode.i_size != new_size) {
- ni->vfs_inode.i_size = new_size;
- dirty = true;
- }
-
+ /* Update inode_set_bytes. */
if (attr_b->non_res) {
new_alloc = le64_to_cpu(attr_b->nres.alloc_size);
if (inode_get_bytes(&ni->vfs_inode) != new_alloc) {
@@ -828,6 +818,7 @@ ok1:
}
}
+ /* Don't forget to update duplicate information in parent. */
if (dirty) {
ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
mark_inode_dirty(&ni->vfs_inode);
@@ -878,8 +869,19 @@ bad_inode:
return err;
}
+/*
+ * attr_data_get_block - Returns 'lcn' and 'len' for given 'vcn'.
+ *
+ * @new == NULL means just to get current mapping for 'vcn'
+ * @new != NULL means allocate real cluster if 'vcn' maps to hole
+ * @zero - zeroout new allocated clusters
+ *
+ * NOTE:
+ * - @new != NULL is called only for sparsed or compressed attributes.
+ * - new allocated clusters are zeroed via blkdev_issue_zeroout.
+ */
int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
- CLST *len, bool *new)
+ CLST *len, bool *new, bool zero)
{
int err = 0;
struct runs_tree *run = &ni->file.run;
@@ -888,29 +890,29 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
struct ATTRIB *attr = NULL, *attr_b;
struct ATTR_LIST_ENTRY *le, *le_b;
struct mft_inode *mi, *mi_b;
- CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end;
- u64 total_size;
- u32 clst_per_frame;
- bool ok;
+ CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end, vcn0, alen;
+ CLST alloc, evcn;
+ unsigned fr;
+ u64 total_size, total_size0;
+ int step = 0;
if (new)
*new = false;
+ /* Try to find in cache. */
down_read(&ni->file.run_lock);
- ok = run_lookup_entry(run, vcn, lcn, len, NULL);
+ if (!run_lookup_entry(run, vcn, lcn, len, NULL))
+ *len = 0;
up_read(&ni->file.run_lock);
- if (ok && (*lcn != SPARSE_LCN || !new)) {
- /* Normal way. */
- return 0;
+ if (*len) {
+ if (*lcn != SPARSE_LCN || !new)
+ return 0; /* Fast normal way without allocation. */
+ else if (clen > *len)
+ clen = *len;
}
- if (!clen)
- clen = 1;
-
- if (ok && clen > *len)
- clen = *len;
-
+ /* No cluster in cache or we need to allocate cluster in hole. */
sbi = ni->mi.sbi;
cluster_bits = sbi->cluster_bits;
@@ -932,16 +934,15 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
asize = le64_to_cpu(attr_b->nres.alloc_size) >> cluster_bits;
if (vcn >= asize) {
- err = -EINVAL;
+ if (new) {
+ err = -EINVAL;
+ } else {
+ *len = 1;
+ *lcn = SPARSE_LCN;
+ }
goto out;
}
- clst_per_frame = 1u << attr_b->nres.c_unit;
- to_alloc = (clen + clst_per_frame - 1) & ~(clst_per_frame - 1);
-
- if (vcn + to_alloc > asize)
- to_alloc = asize - vcn;
-
svcn = le64_to_cpu(attr_b->nres.svcn);
evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
@@ -960,36 +961,68 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
}
+ /* Load in cache actual information. */
err = attr_load_runs(attr, ni, run, NULL);
if (err)
goto out;
- if (!ok) {
- ok = run_lookup_entry(run, vcn, lcn, len, NULL);
- if (ok && (*lcn != SPARSE_LCN || !new)) {
- /* Normal way. */
- err = 0;
- goto ok;
- }
+ if (!*len) {
+ if (run_lookup_entry(run, vcn, lcn, len, NULL)) {
+ if (*lcn != SPARSE_LCN || !new)
+ goto ok; /* Slow normal way without allocation. */
- if (!ok && !new) {
- *len = 0;
- err = 0;
+ if (clen > *len)
+ clen = *len;
+ } else if (!new) {
+ /* Here we may return -ENOENT.
+ * In any case caller gets zero length. */
goto ok;
}
-
- if (ok && clen > *len) {
- clen = *len;
- to_alloc = (clen + clst_per_frame - 1) &
- ~(clst_per_frame - 1);
- }
}
if (!is_attr_ext(attr_b)) {
+ /* The code below only for sparsed or compressed attributes. */
err = -EINVAL;
goto out;
}
+ vcn0 = vcn;
+ to_alloc = clen;
+ fr = (sbi->record_size - le32_to_cpu(mi->mrec->used) + 8) / 3 + 1;
+ /* Allocate frame aligned clusters.
+ * ntfs.sys usually uses 16 clusters per frame for sparsed or compressed.
+ * ntfs3 uses 1 cluster per frame for new created sparsed files. */
+ if (attr_b->nres.c_unit) {
+ CLST clst_per_frame = 1u << attr_b->nres.c_unit;
+ CLST cmask = ~(clst_per_frame - 1);
+
+ /* Get frame aligned vcn and to_alloc. */
+ vcn = vcn0 & cmask;
+ to_alloc = ((vcn0 + clen + clst_per_frame - 1) & cmask) - vcn;
+ if (fr < clst_per_frame)
+ fr = clst_per_frame;
+ zero = true;
+
+ /* Check if 'vcn' and 'vcn0' in different attribute segments. */
+ if (vcn < svcn || evcn1 <= vcn) {
+ /* Load attribute for truncated vcn. */
+ attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0,
+ &vcn, &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ err = attr_load_runs(attr, ni, run, NULL);
+ if (err)
+ goto out;
+ }
+ }
+
+ if (vcn + to_alloc > asize)
+ to_alloc = asize - vcn;
+
/* Get the last LCN to allocate from. */
hint = 0;
@@ -1003,18 +1036,35 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
hint = -1;
}
- err = attr_allocate_clusters(
- sbi, run, vcn, hint + 1, to_alloc, NULL, 0, len,
- (sbi->record_size - le32_to_cpu(mi->mrec->used) + 8) / 3 + 1,
- lcn);
+ /* Allocate and zeroout new clusters. */
+ err = attr_allocate_clusters(sbi, run, vcn, hint + 1, to_alloc, NULL,
+ zero ? ALLOCATE_ZERO : ALLOCATE_DEF, &alen,
+ fr, lcn, len);
if (err)
goto out;
*new = true;
+ step = 1;
- end = vcn + *len;
+ end = vcn + alen;
+ /* Save 'total_size0' to restore if error. */
+ total_size0 = le64_to_cpu(attr_b->nres.total_size);
+ total_size = total_size0 + ((u64)alen << cluster_bits);
- total_size = le64_to_cpu(attr_b->nres.total_size) +
- ((u64)*len << cluster_bits);
+ if (vcn != vcn0) {
+ if (!run_lookup_entry(run, vcn0, lcn, len, NULL)) {
+ err = -EINVAL;
+ goto out;
+ }
+ if (*lcn == SPARSE_LCN) {
+ /* Internal error. Should not happened. */
+ WARN_ON(1);
+ err = -EINVAL;
+ goto out;
+ }
+ /* Check case when vcn0 + len overlaps new allocated clusters. */
+ if (vcn0 + *len > end)
+ *len = end - vcn0;
+ }
repack:
err = mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn);
@@ -1040,7 +1090,7 @@ repack:
if (!ni->attr_list.size) {
err = ni_create_attr_list(ni);
if (err)
- goto out;
+ goto undo1;
/* Layout of records is changed. */
le_b = NULL;
attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL,
@@ -1057,67 +1107,83 @@ repack:
}
}
+ /*
+ * The code below may require additional cluster (to extend attribute list)
+ * and / or one MFT record
+ * It is too complex to undo operations if -ENOSPC occurs deep inside
+ * in 'ni_insert_nonresident'.
+ * Return in advance -ENOSPC here if there are no free cluster and no free MFT.
+ */
+ if (!ntfs_check_for_free_space(sbi, 1, 1)) {
+ /* Undo step 1. */
+ err = -ENOSPC;
+ goto undo1;
+ }
+
+ step = 2;
svcn = evcn1;
/* Estimate next attribute. */
attr = ni_find_attr(ni, attr, &le, ATTR_DATA, NULL, 0, &svcn, &mi);
- if (attr) {
- CLST alloc = bytes_to_cluster(
- sbi, le64_to_cpu(attr_b->nres.alloc_size));
- CLST evcn = le64_to_cpu(attr->nres.evcn);
-
- if (end < next_svcn)
- end = next_svcn;
- while (end > evcn) {
- /* Remove segment [svcn : evcn). */
- mi_remove_attr(NULL, mi, attr);
-
- if (!al_remove_le(ni, le)) {
- err = -EINVAL;
- goto out;
- }
+ if (!attr) {
+ /* Insert new attribute segment. */
+ goto ins_ext;
+ }
- if (evcn + 1 >= alloc) {
- /* Last attribute segment. */
- evcn1 = evcn + 1;
- goto ins_ext;
- }
+ /* Try to update existed attribute segment. */
+ alloc = bytes_to_cluster(sbi, le64_to_cpu(attr_b->nres.alloc_size));
+ evcn = le64_to_cpu(attr->nres.evcn);
- if (ni_load_mi(ni, le, &mi)) {
- attr = NULL;
- goto out;
- }
+ if (end < next_svcn)
+ end = next_svcn;
+ while (end > evcn) {
+ /* Remove segment [svcn : evcn). */
+ mi_remove_attr(NULL, mi, attr);
- attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0,
- &le->id);
- if (!attr) {
- err = -EINVAL;
- goto out;
- }
- svcn = le64_to_cpu(attr->nres.svcn);
- evcn = le64_to_cpu(attr->nres.evcn);
+ if (!al_remove_le(ni, le)) {
+ err = -EINVAL;
+ goto out;
}
- if (end < svcn)
- end = svcn;
+ if (evcn + 1 >= alloc) {
+ /* Last attribute segment. */
+ evcn1 = evcn + 1;
+ goto ins_ext;
+ }
- err = attr_load_runs(attr, ni, run, &end);
- if (err)
+ if (ni_load_mi(ni, le, &mi)) {
+ attr = NULL;
goto out;
+ }
- evcn1 = evcn + 1;
- attr->nres.svcn = cpu_to_le64(next_svcn);
- err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn);
- if (err)
+ attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, &le->id);
+ if (!attr) {
+ err = -EINVAL;
goto out;
+ }
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn = le64_to_cpu(attr->nres.evcn);
+ }
- le->vcn = cpu_to_le64(next_svcn);
- ni->attr_list.dirty = true;
- mi->dirty = true;
+ if (end < svcn)
+ end = svcn;
+
+ err = attr_load_runs(attr, ni, run, &end);
+ if (err)
+ goto out;
+
+ evcn1 = evcn + 1;
+ attr->nres.svcn = cpu_to_le64(next_svcn);
+ err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn);
+ if (err)
+ goto out;
+
+ le->vcn = cpu_to_le64(next_svcn);
+ ni->attr_list.dirty = true;
+ mi->dirty = true;
+ next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
- next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
- }
ins_ext:
if (evcn1 > next_svcn) {
err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run,
@@ -1129,10 +1195,26 @@ ins_ext:
ok:
run_truncate_around(run, vcn);
out:
+ if (err && step > 1) {
+ /* Too complex to restore. */
+ _ntfs_bad_inode(&ni->vfs_inode);
+ }
up_write(&ni->file.run_lock);
ni_unlock(ni);
return err;
+
+undo1:
+ /* Undo step1. */
+ attr_b->nres.total_size = cpu_to_le64(total_size0);
+ inode_set_bytes(&ni->vfs_inode, total_size0);
+
+ if (run_deallocate_ex(sbi, run, vcn, alen, NULL, false) ||
+ !run_add_entry(run, vcn, SPARSE_LCN, alen, false) ||
+ mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn)) {
+ _ntfs_bad_inode(&ni->vfs_inode);
+ }
+ goto out;
}
int attr_data_read_resident(struct ntfs_inode *ni, struct page *page)
@@ -1217,6 +1299,11 @@ int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type,
CLST svcn, evcn;
u16 ro;
+ if (!ni) {
+ /* Is record corrupted? */
+ return -ENOENT;
+ }
+
attr = ni_find_attr(ni, NULL, NULL, type, name, name_len, &vcn, NULL);
if (!attr) {
/* Is record corrupted? */
@@ -1232,6 +1319,10 @@ int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type,
}
ro = le16_to_cpu(attr->nres.run_off);
+
+ if (ro > le32_to_cpu(attr->size))
+ return -EINVAL;
+
err = run_unpack_ex(run, ni->mi.sbi, ni->mi.rno, svcn, evcn, svcn,
Add2Ptr(attr, ro), le32_to_cpu(attr->size) - ro);
if (err < 0)
@@ -1530,7 +1621,7 @@ int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size,
struct ATTRIB *attr = NULL, *attr_b;
struct ATTR_LIST_ENTRY *le, *le_b;
struct mft_inode *mi, *mi_b;
- CLST svcn, evcn1, next_svcn, lcn, len;
+ CLST svcn, evcn1, next_svcn, len;
CLST vcn, end, clst_data;
u64 total_size, valid_size, data_size;
@@ -1606,8 +1697,9 @@ int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size,
}
err = attr_allocate_clusters(sbi, run, vcn + clst_data,
- hint + 1, len - clst_data, NULL, 0,
- &alen, 0, &lcn);
+ hint + 1, len - clst_data, NULL,
+ ALLOCATE_DEF, &alen, 0, NULL,
+ NULL);
if (err)
goto out;
@@ -1901,6 +1993,11 @@ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
u16 le_sz;
u16 roff = le16_to_cpu(attr->nres.run_off);
+ if (roff > le32_to_cpu(attr->size)) {
+ err = -EINVAL;
+ goto out;
+ }
+
run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn,
evcn1 - 1, svcn, Add2Ptr(attr, roff),
le32_to_cpu(attr->size) - roff);
@@ -2020,7 +2117,7 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
return -ENOENT;
if (!attr_b->non_res) {
- u32 data_size = le32_to_cpu(attr->res.data_size);
+ u32 data_size = le32_to_cpu(attr_b->res.data_size);
u32 from, to;
if (vbo > data_size)
@@ -2290,7 +2387,8 @@ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
if (!attr_b->non_res) {
/* Still resident. */
- char *data = Add2Ptr(attr_b, attr_b->res.data_off);
+ char *data = Add2Ptr(attr_b,
+ le16_to_cpu(attr_b->res.data_off));
memmove(data + bytes, data, bytes);
memset(data, 0, bytes);
@@ -2382,8 +2480,8 @@ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
if (vbo <= ni->i_valid)
ni->i_valid += bytes;
- attr_b->nres.data_size = le64_to_cpu(data_size + bytes);
- attr_b->nres.alloc_size = le64_to_cpu(alloc_size + bytes);
+ attr_b->nres.data_size = cpu_to_le64(data_size + bytes);
+ attr_b->nres.alloc_size = cpu_to_le64(alloc_size + bytes);
/* ni->valid may be not equal valid_size (temporary). */
if (ni->i_valid > data_size + bytes)
diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c
index bad6d8a849a2..c0c6bcbc8c05 100644
--- a/fs/ntfs3/attrlist.c
+++ b/fs/ntfs3/attrlist.c
@@ -68,6 +68,11 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr)
run_init(&ni->attr_list.run);
+ if (run_off > le32_to_cpu(attr->size)) {
+ err = -EINVAL;
+ goto out;
+ }
+
err = run_unpack_ex(&ni->attr_list.run, ni->mi.sbi, ni->mi.rno,
0, le64_to_cpu(attr->nres.evcn), 0,
Add2Ptr(attr, run_off),
diff --git a/fs/ntfs3/bitfunc.c b/fs/ntfs3/bitfunc.c
index 50d838093790..25a4d4896aa9 100644
--- a/fs/ntfs3/bitfunc.c
+++ b/fs/ntfs3/bitfunc.c
@@ -30,7 +30,7 @@ static const u8 zero_mask[] = { 0xFF, 0xFE, 0xFC, 0xF8, 0xF0,
*
* Return: True if all bits [bit, bit+nbits) are zeros "0".
*/
-bool are_bits_clear(const ulong *lmap, size_t bit, size_t nbits)
+bool are_bits_clear(const void *lmap, size_t bit, size_t nbits)
{
size_t pos = bit & 7;
const u8 *map = (u8 *)lmap + (bit >> 3);
@@ -78,7 +78,7 @@ bool are_bits_clear(const ulong *lmap, size_t bit, size_t nbits)
*
* Return: True if all bits [bit, bit+nbits) are ones "1".
*/
-bool are_bits_set(const ulong *lmap, size_t bit, size_t nbits)
+bool are_bits_set(const void *lmap, size_t bit, size_t nbits)
{
u8 mask;
size_t pos = bit & 7;
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index e92bbd754365..723fb64e6531 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -59,14 +59,14 @@ void ntfs3_exit_bitmap(void)
*
* Return: -1 if not found.
*/
-static size_t wnd_scan(const ulong *buf, size_t wbit, u32 wpos, u32 wend,
+static size_t wnd_scan(const void *buf, size_t wbit, u32 wpos, u32 wend,
size_t to_alloc, size_t *prev_tail, size_t *b_pos,
size_t *b_len)
{
while (wpos < wend) {
size_t free_len;
u32 free_bits, end;
- u32 used = find_next_zero_bit(buf, wend, wpos);
+ u32 used = find_next_zero_bit_le(buf, wend, wpos);
if (used >= wend) {
if (*b_len < *prev_tail) {
@@ -92,7 +92,7 @@ static size_t wnd_scan(const ulong *buf, size_t wbit, u32 wpos, u32 wend,
* Now we have a fragment [wpos, wend) staring with 0.
*/
end = wpos + to_alloc - *prev_tail;
- free_bits = find_next_bit(buf, min(end, wend), wpos);
+ free_bits = find_next_bit_le(buf, min(end, wend), wpos);
free_len = *prev_tail + free_bits - wpos;
@@ -504,7 +504,6 @@ static int wnd_rescan(struct wnd_bitmap *wnd)
u8 cluster_bits = sbi->cluster_bits;
u32 wbits = 8 * sb->s_blocksize;
u32 used, frb;
- const ulong *buf;
size_t wpos, wbit, iw, vbo;
struct buffer_head *bh = NULL;
CLST lcn, clen;
@@ -558,9 +557,7 @@ static int wnd_rescan(struct wnd_bitmap *wnd)
goto out;
}
- buf = (ulong *)bh->b_data;
-
- used = bitmap_weight(buf, wbits);
+ used = ntfs_bitmap_weight_le(bh->b_data, wbits);
if (used < wbits) {
frb = wbits - used;
wnd->free_bits[iw] = frb;
@@ -574,7 +571,7 @@ static int wnd_rescan(struct wnd_bitmap *wnd)
wbits = wnd->nbits - wbit;
do {
- used = find_next_zero_bit(buf, wbits, wpos);
+ used = find_next_zero_bit_le(bh->b_data, wbits, wpos);
if (used > wpos && prev_tail) {
wnd_add_free_ext(wnd, wbit + wpos - prev_tail,
@@ -590,7 +587,7 @@ static int wnd_rescan(struct wnd_bitmap *wnd)
break;
}
- frb = find_next_bit(buf, wbits, wpos);
+ frb = find_next_bit_le(bh->b_data, wbits, wpos);
if (frb >= wbits) {
/* Keep last free block. */
prev_tail += frb - wpos;
@@ -661,7 +658,7 @@ int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits)
if (!wnd->bits_last)
wnd->bits_last = wbits;
- wnd->free_bits = kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS);
+ wnd->free_bits = kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN);
if (!wnd->free_bits)
return -ENOMEM;
@@ -718,7 +715,6 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits)
while (iw < wnd->nwnd && bits) {
u32 tail, op;
- ulong *buf;
if (iw + 1 == wnd->nwnd)
wbits = wnd->bits_last;
@@ -732,11 +728,9 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits)
break;
}
- buf = (ulong *)bh->b_data;
-
lock_buffer(bh);
- __bitmap_clear(buf, wbit, op);
+ ntfs_bitmap_clear_le(bh->b_data, wbit, op);
wnd->free_bits[iw] += op;
@@ -771,7 +765,6 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits)
while (iw < wnd->nwnd && bits) {
u32 tail, op;
- ulong *buf;
if (unlikely(iw + 1 == wnd->nwnd))
wbits = wnd->bits_last;
@@ -784,11 +777,10 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits)
err = PTR_ERR(bh);
break;
}
- buf = (ulong *)bh->b_data;
lock_buffer(bh);
- __bitmap_set(buf, wbit, op);
+ ntfs_bitmap_set_le(bh->b_data, wbit, op);
wnd->free_bits[iw] -= op;
set_buffer_uptodate(bh);
@@ -809,6 +801,44 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits)
}
/*
+ * wnd_set_used_safe - Mark the bits range from bit to bit + bits as used.
+ *
+ * Unlikely wnd_set_used/wnd_set_free this function is not full trusted.
+ * It scans every bit in bitmap and marks free bit as used.
+ * @done - how many bits were marked as used.
+ *
+ * NOTE: normally *done should be 0.
+ */
+int wnd_set_used_safe(struct wnd_bitmap *wnd, size_t bit, size_t bits,
+ size_t *done)
+{
+ size_t i, from = 0, len = 0;
+ int err = 0;
+
+ *done = 0;
+ for (i = 0; i < bits; i++) {
+ if (wnd_is_free(wnd, bit + i, 1)) {
+ if (!len)
+ from = bit + i;
+ len += 1;
+ } else if (len) {
+ err = wnd_set_used(wnd, from, len);
+ *done += len;
+ len = 0;
+ if (err)
+ break;
+ }
+ }
+
+ if (len) {
+ /* last fragment. */
+ err = wnd_set_used(wnd, from, len);
+ *done += len;
+ }
+ return err;
+}
+
+/*
* wnd_is_free_hlp
*
* Return: True if all clusters [bit, bit+bits) are free (bitmap only).
@@ -836,7 +866,7 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits)
if (IS_ERR(bh))
return false;
- ret = are_bits_clear((ulong *)bh->b_data, wbit, op);
+ ret = are_bits_clear(bh->b_data, wbit, op);
put_bh(bh);
if (!ret)
@@ -928,7 +958,7 @@ use_wnd:
if (IS_ERR(bh))
goto out;
- ret = are_bits_set((ulong *)bh->b_data, wbit, op);
+ ret = are_bits_set(bh->b_data, wbit, op);
put_bh(bh);
if (!ret)
goto out;
@@ -959,7 +989,6 @@ size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint,
size_t fnd, max_alloc, b_len, b_pos;
size_t iw, prev_tail, nwnd, wbit, ebit, zbit, zend;
size_t to_alloc0 = to_alloc;
- const ulong *buf;
const struct e_node *e;
const struct rb_node *pr, *cr;
u8 log2_bits;
@@ -1185,14 +1214,13 @@ Again:
continue;
}
- buf = (ulong *)bh->b_data;
-
/* Scan range [wbit, zbit). */
if (wpos < wzbit) {
/* Scan range [wpos, zbit). */
- fnd = wnd_scan(buf, wbit, wpos, wzbit,
- to_alloc, &prev_tail,
- &b_pos, &b_len);
+ fnd = wnd_scan(bh->b_data, wbit, wpos,
+ wzbit, to_alloc,
+ &prev_tail, &b_pos,
+ &b_len);
if (fnd != MINUS_ONE_T) {
put_bh(bh);
goto found;
@@ -1203,7 +1231,7 @@ Again:
/* Scan range [zend, ebit). */
if (wzend < wbits) {
- fnd = wnd_scan(buf, wbit,
+ fnd = wnd_scan(bh->b_data, wbit,
max(wzend, wpos), wbits,
to_alloc, &prev_tail,
&b_pos, &b_len);
@@ -1242,11 +1270,9 @@ Again:
continue;
}
- buf = (ulong *)bh->b_data;
-
/* Scan range [wpos, eBits). */
- fnd = wnd_scan(buf, wbit, wpos, wbits, to_alloc, &prev_tail,
- &b_pos, &b_len);
+ fnd = wnd_scan(bh->b_data, wbit, wpos, wbits, to_alloc,
+ &prev_tail, &b_pos, &b_len);
put_bh(bh);
if (fnd != MINUS_ONE_T)
goto found;
@@ -1324,7 +1350,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
new_last = wbits;
if (new_wnd != wnd->nwnd) {
- new_free = kmalloc(new_wnd * sizeof(u16), GFP_NOFS);
+ new_free = kmalloc_array(new_wnd, sizeof(u16), GFP_NOFS);
if (!new_free)
return -ENOMEM;
@@ -1344,7 +1370,6 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
size_t frb;
u64 vbo, lbo, bytes;
struct buffer_head *bh;
- ulong *buf;
if (iw + 1 == new_wnd)
wbits = new_last;
@@ -1361,10 +1386,9 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
return -EIO;
lock_buffer(bh);
- buf = (ulong *)bh->b_data;
- __bitmap_clear(buf, b0, blocksize * 8 - b0);
- frb = wbits - bitmap_weight(buf, wbits);
+ ntfs_bitmap_clear_le(bh->b_data, b0, blocksize * 8 - b0);
+ frb = wbits - ntfs_bitmap_weight_le(bh->b_data, wbits);
wnd->total_zeroes += frb - wnd->free_bits[iw];
wnd->free_bits[iw] = frb;
@@ -1411,7 +1435,6 @@ int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range)
CLST lcn_from = bytes_to_cluster(sbi, range->start);
size_t iw = lcn_from >> (sb->s_blocksize_bits + 3);
u32 wbit = lcn_from & (wbits - 1);
- const ulong *buf;
CLST lcn_to;
if (!minlen)
@@ -1424,7 +1447,7 @@ int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range)
down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
- for (; iw < wnd->nbits; iw++, wbit = 0) {
+ for (; iw < wnd->nwnd; iw++, wbit = 0) {
CLST lcn_wnd = iw * wbits;
struct buffer_head *bh;
@@ -1446,10 +1469,8 @@ int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range)
break;
}
- buf = (ulong *)bh->b_data;
-
for (; wbit < wbits; wbit++) {
- if (!test_bit(wbit, buf)) {
+ if (!test_bit_le(wbit, bh->b_data)) {
if (!len)
lcn = lcn_wnd + wbit;
len += 1;
@@ -1481,3 +1502,70 @@ out:
return err;
}
+
+#if BITS_PER_LONG == 64
+typedef __le64 bitmap_ulong;
+#define cpu_to_ul(x) cpu_to_le64(x)
+#define ul_to_cpu(x) le64_to_cpu(x)
+#else
+typedef __le32 bitmap_ulong;
+#define cpu_to_ul(x) cpu_to_le32(x)
+#define ul_to_cpu(x) le32_to_cpu(x)
+#endif
+
+void ntfs_bitmap_set_le(void *map, unsigned int start, int len)
+{
+ bitmap_ulong *p = (bitmap_ulong *)map + BIT_WORD(start);
+ const unsigned int size = start + len;
+ int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+ bitmap_ulong mask_to_set = cpu_to_ul(BITMAP_FIRST_WORD_MASK(start));
+
+ while (len - bits_to_set >= 0) {
+ *p |= mask_to_set;
+ len -= bits_to_set;
+ bits_to_set = BITS_PER_LONG;
+ mask_to_set = cpu_to_ul(~0UL);
+ p++;
+ }
+ if (len) {
+ mask_to_set &= cpu_to_ul(BITMAP_LAST_WORD_MASK(size));
+ *p |= mask_to_set;
+ }
+}
+
+void ntfs_bitmap_clear_le(void *map, unsigned int start, int len)
+{
+ bitmap_ulong *p = (bitmap_ulong *)map + BIT_WORD(start);
+ const unsigned int size = start + len;
+ int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+ bitmap_ulong mask_to_clear = cpu_to_ul(BITMAP_FIRST_WORD_MASK(start));
+
+ while (len - bits_to_clear >= 0) {
+ *p &= ~mask_to_clear;
+ len -= bits_to_clear;
+ bits_to_clear = BITS_PER_LONG;
+ mask_to_clear = cpu_to_ul(~0UL);
+ p++;
+ }
+ if (len) {
+ mask_to_clear &= cpu_to_ul(BITMAP_LAST_WORD_MASK(size));
+ *p &= ~mask_to_clear;
+ }
+}
+
+unsigned int ntfs_bitmap_weight_le(const void *bitmap, int bits)
+{
+ const ulong *bmp = bitmap;
+ unsigned int k, lim = bits / BITS_PER_LONG;
+ unsigned int w = 0;
+
+ for (k = 0; k < lim; k++)
+ w += hweight_long(bmp[k]);
+
+ if (bits % BITS_PER_LONG) {
+ w += hweight_long(ul_to_cpu(((bitmap_ulong *)bitmap)[k]) &
+ BITMAP_LAST_WORD_MASK(bits));
+ }
+
+ return w;
+}
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index fb438d604040..063a6654199b 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -26,8 +26,8 @@ int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len,
if (!nls) {
/* UTF-16 -> UTF-8 */
- ret = utf16s_to_utf8s(name, len, UTF16_LITTLE_ENDIAN, buf,
- buf_len);
+ ret = utf16s_to_utf8s((wchar_t *)name, len, UTF16_LITTLE_ENDIAN,
+ buf, buf_len);
buf[ret] = '\0';
return ret;
}
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index c5e4a886593d..d294cd975688 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -122,31 +122,15 @@ static int ntfs_extend_initialized_size(struct file *file,
bits = sbi->cluster_bits;
vcn = pos >> bits;
- err = attr_data_get_block(ni, vcn, 0, &lcn, &clen,
- NULL);
+ err = attr_data_get_block(ni, vcn, 1, &lcn, &clen, NULL,
+ false);
if (err)
goto out;
if (lcn == SPARSE_LCN) {
- loff_t vbo = (loff_t)vcn << bits;
- loff_t to = vbo + ((loff_t)clen << bits);
-
- if (to <= new_valid) {
- ni->i_valid = to;
- pos = to;
- goto next;
- }
-
- if (vbo < pos) {
- pos = vbo;
- } else {
- to = (new_valid >> bits) << bits;
- if (pos < to) {
- ni->i_valid = to;
- pos = to;
- goto next;
- }
- }
+ pos = ((loff_t)clen + vcn) << bits;
+ ni->i_valid = pos;
+ goto next;
}
}
@@ -196,18 +180,18 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
struct address_space *mapping = inode->i_mapping;
u32 blocksize = 1 << inode->i_blkbits;
pgoff_t idx = vbo >> PAGE_SHIFT;
- u32 z_start = vbo & (PAGE_SIZE - 1);
+ u32 from = vbo & (PAGE_SIZE - 1);
pgoff_t idx_end = (vbo_to + PAGE_SIZE - 1) >> PAGE_SHIFT;
loff_t page_off;
struct buffer_head *head, *bh;
- u32 bh_next, bh_off, z_end;
+ u32 bh_next, bh_off, to;
sector_t iblock;
struct page *page;
- for (; idx < idx_end; idx += 1, z_start = 0) {
+ for (; idx < idx_end; idx += 1, from = 0) {
page_off = (loff_t)idx << PAGE_SHIFT;
- z_end = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off)
- : PAGE_SIZE;
+ to = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off)
+ : PAGE_SIZE;
iblock = page_off >> inode->i_blkbits;
page = find_or_create_page(mapping, idx,
@@ -224,7 +208,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
do {
bh_next = bh_off + blocksize;
- if (bh_next <= z_start || bh_off >= z_end)
+ if (bh_next <= from || bh_off >= to)
continue;
if (!buffer_mapped(bh)) {
@@ -258,7 +242,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
} while (bh_off = bh_next, iblock += 1,
head != (bh = bh->b_this_page));
- zero_user_segment(page, z_start, z_end);
+ zero_user_segment(page, from, to);
unlock_page(page);
put_page(page);
@@ -270,81 +254,6 @@ out:
}
/*
- * ntfs_sparse_cluster - Helper function to zero a new allocated clusters.
- *
- * NOTE: 512 <= cluster size <= 2M
- */
-void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn,
- CLST len)
-{
- struct address_space *mapping = inode->i_mapping;
- struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
- u64 vbo = (u64)vcn << sbi->cluster_bits;
- u64 bytes = (u64)len << sbi->cluster_bits;
- u32 blocksize = 1 << inode->i_blkbits;
- pgoff_t idx0 = page0 ? page0->index : -1;
- loff_t vbo_clst = vbo & sbi->cluster_mask_inv;
- loff_t end = ntfs_up_cluster(sbi, vbo + bytes);
- pgoff_t idx = vbo_clst >> PAGE_SHIFT;
- u32 from = vbo_clst & (PAGE_SIZE - 1);
- pgoff_t idx_end = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
- loff_t page_off;
- u32 to;
- bool partial;
- struct page *page;
-
- for (; idx < idx_end; idx += 1, from = 0) {
- page = idx == idx0 ? page0 : grab_cache_page(mapping, idx);
-
- if (!page)
- continue;
-
- page_off = (loff_t)idx << PAGE_SHIFT;
- to = (page_off + PAGE_SIZE) > end ? (end - page_off)
- : PAGE_SIZE;
- partial = false;
-
- if ((from || PAGE_SIZE != to) &&
- likely(!page_has_buffers(page))) {
- create_empty_buffers(page, blocksize, 0);
- }
-
- if (page_has_buffers(page)) {
- struct buffer_head *head, *bh;
- u32 bh_off = 0;
-
- bh = head = page_buffers(page);
- do {
- u32 bh_next = bh_off + blocksize;
-
- if (from <= bh_off && bh_next <= to) {
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- } else if (!buffer_uptodate(bh)) {
- partial = true;
- }
- bh_off = bh_next;
- } while (head != (bh = bh->b_this_page));
- }
-
- zero_user_segment(page, from, to);
-
- if (!partial) {
- if (!PageUptodate(page))
- SetPageUptodate(page);
- set_page_dirty(page);
- }
-
- if (idx != idx0) {
- unlock_page(page);
- put_page(page);
- }
- cond_resched();
- }
- mark_inode_dirty(inode);
-}
-
-/*
* ntfs_file_mmap - file_operations::mmap
*/
static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -385,13 +294,9 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
for (; vcn < end; vcn += len) {
err = attr_data_get_block(ni, vcn, 1, &lcn,
- &len, &new);
+ &len, &new, true);
if (err)
goto out;
-
- if (!new)
- continue;
- ntfs_sparse_cluster(inode, NULL, vcn, 1);
}
}
@@ -432,7 +337,6 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count,
err = ntfs_set_size(inode, end);
if (err)
goto out;
- inode->i_size = end;
}
if (extend_init && !is_compressed(ni)) {
@@ -486,10 +390,10 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size)
new_valid = ntfs_up_block(sb, min_t(u64, ni->i_valid, new_size));
- ni_lock(ni);
-
truncate_setsize(inode, new_size);
+ ni_lock(ni);
+
down_write(&ni->file.run_lock);
err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size,
&new_valid, ni->mi.sbi->options->prealloc, NULL);
@@ -535,7 +439,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
struct ntfs_sb_info *sbi = sb->s_fs_info;
struct ntfs_inode *ni = ntfs_i(inode);
loff_t end = vbo + len;
- loff_t vbo_down = round_down(vbo, PAGE_SIZE);
+ loff_t vbo_down = round_down(vbo, max_t(unsigned long,
+ sbi->cluster_size, PAGE_SIZE));
bool is_supported_holes = is_sparsed(ni) || is_compressed(ni);
loff_t i_size, new_size;
bool map_locked;
@@ -588,11 +493,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
u32 frame_size;
loff_t mask, vbo_a, end_a, tmp;
- err = filemap_write_and_wait_range(mapping, vbo, end - 1);
- if (err)
- goto out;
-
- err = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
+ err = filemap_write_and_wait_range(mapping, vbo_down,
+ LLONG_MAX);
if (err)
goto out;
@@ -685,47 +587,45 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
if (err)
goto out;
- /*
- * Allocate clusters, do not change 'valid' size.
- */
- err = ntfs_set_size(inode, new_size);
- if (err)
- goto out;
+ if (new_size > i_size) {
+ /*
+ * Allocate clusters, do not change 'valid' size.
+ */
+ err = ntfs_set_size(inode, new_size);
+ if (err)
+ goto out;
+ }
if (is_supported_holes) {
- CLST vcn_v = ni->i_valid >> sbi->cluster_bits;
CLST vcn = vbo >> sbi->cluster_bits;
CLST cend = bytes_to_cluster(sbi, end);
+ CLST cend_v = bytes_to_cluster(sbi, ni->i_valid);
CLST lcn, clen;
bool new;
+ if (cend_v > cend)
+ cend_v = cend;
+
/*
- * Allocate but do not zero new clusters. (see below comments)
- * This breaks security: One can read unused on-disk areas.
+ * Allocate and zero new clusters.
* Zeroing these clusters may be too long.
- * Maybe we should check here for root rights?
+ */
+ for (; vcn < cend_v; vcn += clen) {
+ err = attr_data_get_block(ni, vcn, cend_v - vcn,
+ &lcn, &clen, &new,
+ true);
+ if (err)
+ goto out;
+ }
+ /*
+ * Allocate but not zero new clusters.
*/
for (; vcn < cend; vcn += clen) {
err = attr_data_get_block(ni, vcn, cend - vcn,
- &lcn, &clen, &new);
+ &lcn, &clen, &new,
+ false);
if (err)
goto out;
- if (!new || vcn >= vcn_v)
- continue;
-
- /*
- * Unwritten area.
- * NTFS is not able to store several unwritten areas.
- * Activate 'ntfs_sparse_cluster' to zero new allocated clusters.
- *
- * Dangerous in case:
- * 1G of sparsed clusters + 1 cluster of data =>
- * valid_size == 1G + 1 cluster
- * fallocate(1G) will zero 1G and this can be very long
- * xfstest 016/086 will fail without 'ntfs_sparse_cluster'.
- */
- ntfs_sparse_cluster(inode, NULL, vcn,
- min(vcn_v - vcn, clen));
}
}
@@ -736,6 +636,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
&ni->file.run, i_size, &ni->i_valid,
true, NULL);
ni_unlock(ni);
+ } else if (new_size > i_size) {
+ inode->i_size = new_size;
}
}
@@ -779,7 +681,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
goto out;
if (ia_valid & ATTR_SIZE) {
- loff_t oldsize = inode->i_size;
+ loff_t newsize, oldsize;
if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
/* Should never be here, see ntfs_file_open(). */
@@ -787,16 +689,19 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
goto out;
}
inode_dio_wait(inode);
+ oldsize = inode->i_size;
+ newsize = attr->ia_size;
- if (attr->ia_size <= oldsize)
- err = ntfs_truncate(inode, attr->ia_size);
- else if (attr->ia_size > oldsize)
- err = ntfs_extend(inode, attr->ia_size, 0, NULL);
+ if (newsize <= oldsize)
+ err = ntfs_truncate(inode, newsize);
+ else
+ err = ntfs_extend(inode, newsize, 0, NULL);
if (err)
goto out;
ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+ inode->i_size = newsize;
}
setattr_copy(mnt_userns, inode, attr);
@@ -946,8 +851,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
frame_vbo = valid & ~(frame_size - 1);
off = valid & (frame_size - 1);
- err = attr_data_get_block(ni, frame << NTFS_LZNT_CUNIT, 0, &lcn,
- &clen, NULL);
+ err = attr_data_get_block(ni, frame << NTFS_LZNT_CUNIT, 1, &lcn,
+ &clen, NULL, false);
if (err)
goto out;
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 381a38a06ec2..f1df52dfab74 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -557,7 +557,7 @@ static int ni_repack(struct ntfs_inode *ni)
}
if (!mi_p) {
- /* Do not try if not enogh free space. */
+ /* Do not try if not enough free space. */
if (le32_to_cpu(mi->mrec->used) + 8 >= rs)
continue;
@@ -568,6 +568,12 @@ static int ni_repack(struct ntfs_inode *ni)
}
roff = le16_to_cpu(attr->nres.run_off);
+
+ if (roff > le32_to_cpu(attr->size)) {
+ err = -EINVAL;
+ break;
+ }
+
err = run_unpack(&run, sbi, ni->mi.rno, svcn, evcn, svcn,
Add2Ptr(attr, roff),
le32_to_cpu(attr->size) - roff);
@@ -1589,6 +1595,9 @@ int ni_delete_all(struct ntfs_inode *ni)
asize = le32_to_cpu(attr->size);
roff = le16_to_cpu(attr->nres.run_off);
+ if (roff > asize)
+ return -EINVAL;
+
/* run==1 means unpack and deallocate. */
run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn, svcn,
Add2Ptr(attr, roff), asize - roff);
@@ -1636,6 +1645,7 @@ struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni,
{
struct ATTRIB *attr = NULL;
struct ATTR_FILE_NAME *fname;
+ struct le_str *fns;
if (le)
*le = NULL;
@@ -1659,8 +1669,8 @@ next:
if (uni->len != fname->name_len)
goto next;
- if (ntfs_cmp_names_cpu(uni, (struct le_str *)&fname->name_len, NULL,
- false))
+ fns = (struct le_str *)&fname->name_len;
+ if (ntfs_cmp_names_cpu(uni, fns, NULL, false))
goto next;
return fname;
@@ -2214,7 +2224,7 @@ int ni_decompress_file(struct ntfs_inode *ni)
for (vcn = vbo >> sbi->cluster_bits; vcn < end; vcn += clen) {
err = attr_data_get_block(ni, vcn, cend - vcn, &lcn,
- &clen, &new);
+ &clen, &new, false);
if (err)
goto out;
}
@@ -2291,6 +2301,11 @@ remove_wof:
asize = le32_to_cpu(attr->size);
roff = le16_to_cpu(attr->nres.run_off);
+ if (roff > asize) {
+ err = -EINVAL;
+ goto out;
+ }
+
/*run==1 Means unpack and deallocate. */
run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn, svcn,
Add2Ptr(attr, roff), asize - roff);
@@ -2997,6 +3012,7 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
struct NTFS_DE *de)
{
int err;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
struct ATTRIB *attr;
struct ATTR_LIST_ENTRY *le;
struct mft_inode *mi;
@@ -3004,6 +3020,19 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1);
u16 de_key_size = le16_to_cpu(de->key_size);
+ if (sbi->options->windows_names &&
+ !valid_windows_name(sbi, (struct le_str *)&de_name->name_len))
+ return -EINVAL;
+
+ /* If option "hide_dot_files" then set hidden attribute for dot files. */
+ if (ni->mi.sbi->options->hide_dot_files) {
+ if (de_name->name_len > 0 &&
+ le16_to_cpu(de_name->name[0]) == '.')
+ ni->std_fa |= FILE_ATTRIBUTE_HIDDEN;
+ else
+ ni->std_fa &= ~FILE_ATTRIBUTE_HIDDEN;
+ }
+
mi_get_ref(&ni->mi, &de->ref);
mi_get_ref(&dir_ni->mi, &de_name->home);
@@ -3022,7 +3051,7 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de_name, de_key_size);
/* Insert new name into directory. */
- err = indx_insert_entry(&dir_ni->dir, dir_ni, de, ni->mi.sbi, NULL, 0);
+ err = indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 0);
if (err)
ni_remove_attr_le(ni, attr, mi, le);
@@ -3265,6 +3294,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
modified = true;
}
+ /* std attribute is always in primary MFT record. */
if (modified)
ni->mi.dirty = true;
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index 0d611a6c5511..c6eb371a3695 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -1132,7 +1132,7 @@ static int read_log_page(struct ntfs_log *log, u32 vbo,
return -EINVAL;
if (!*buffer) {
- to_free = kmalloc(bytes, GFP_NOFS);
+ to_free = kmalloc(log->page_size, GFP_NOFS);
if (!to_free)
return -ENOMEM;
*buffer = to_free;
@@ -1180,10 +1180,7 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
struct restart_info *info)
{
u32 skip, vbo;
- struct RESTART_HDR *r_page = kmalloc(DefaultLogPageSize, GFP_NOFS);
-
- if (!r_page)
- return -ENOMEM;
+ struct RESTART_HDR *r_page = NULL;
/* Determine which restart area we are looking for. */
if (first) {
@@ -1197,7 +1194,6 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
/* Loop continuously until we succeed. */
for (; vbo < l_size; vbo = 2 * vbo + skip, skip = 0) {
bool usa_error;
- u32 sys_page_size;
bool brst, bchk;
struct RESTART_AREA *ra;
@@ -1251,24 +1247,6 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
goto check_result;
}
- /* Read the entire restart area. */
- sys_page_size = le32_to_cpu(r_page->sys_page_size);
- if (DefaultLogPageSize != sys_page_size) {
- kfree(r_page);
- r_page = kzalloc(sys_page_size, GFP_NOFS);
- if (!r_page)
- return -ENOMEM;
-
- if (read_log_page(log, vbo,
- (struct RECORD_PAGE_HDR **)&r_page,
- &usa_error)) {
- /* Ignore any errors. */
- kfree(r_page);
- r_page = NULL;
- continue;
- }
- }
-
if (is_client_area_valid(r_page, usa_error)) {
info->valid_page = true;
ra = Add2Ptr(r_page, le16_to_cpu(r_page->ra_off));
@@ -2727,6 +2705,9 @@ static inline bool check_attr(const struct MFT_REC *rec,
return false;
}
+ if (run_off > asize)
+ return false;
+
if (run_unpack(NULL, sbi, 0, svcn, evcn, svcn,
Add2Ptr(attr, run_off), asize - run_off) < 0) {
return false;
@@ -3048,7 +3029,7 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe,
struct NEW_ATTRIBUTE_SIZES *new_sz;
struct ATTR_FILE_NAME *fname;
struct OpenAttr *oa, *oa2;
- u32 nsize, t32, asize, used, esize, bmp_off, bmp_bits;
+ u32 nsize, t32, asize, used, esize, off, bits;
u16 id, id2;
u32 record_size = sbi->record_size;
u64 t64;
@@ -3635,30 +3616,28 @@ move_data:
break;
case SetBitsInNonresidentBitMap:
- bmp_off =
- le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off);
- bmp_bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits);
+ off = le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off);
+ bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits);
- if (cbo + (bmp_off + 7) / 8 > lco ||
- cbo + ((bmp_off + bmp_bits + 7) / 8) > lco) {
+ if (cbo + (off + 7) / 8 > lco ||
+ cbo + ((off + bits + 7) / 8) > lco) {
goto dirty_vol;
}
- __bitmap_set(Add2Ptr(buffer_le, roff), bmp_off, bmp_bits);
+ ntfs_bitmap_set_le(Add2Ptr(buffer_le, roff), off, bits);
a_dirty = true;
break;
case ClearBitsInNonresidentBitMap:
- bmp_off =
- le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off);
- bmp_bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits);
+ off = le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off);
+ bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits);
- if (cbo + (bmp_off + 7) / 8 > lco ||
- cbo + ((bmp_off + bmp_bits + 7) / 8) > lco) {
+ if (cbo + (off + 7) / 8 > lco ||
+ cbo + ((off + bits + 7) / 8) > lco) {
goto dirty_vol;
}
- __bitmap_clear(Add2Ptr(buffer_le, roff), bmp_off, bmp_bits);
+ ntfs_bitmap_clear_le(Add2Ptr(buffer_le, roff), off, bits);
a_dirty = true;
break;
@@ -4771,6 +4750,12 @@ fake_attr:
u16 roff = le16_to_cpu(attr->nres.run_off);
CLST svcn = le64_to_cpu(attr->nres.svcn);
+ if (roff > t32) {
+ kfree(oa->attr);
+ oa->attr = NULL;
+ goto fake_attr;
+ }
+
err = run_unpack(&oa->run0, sbi, inode->i_ino, svcn,
le64_to_cpu(attr->nres.evcn), svcn,
Add2Ptr(attr, roff), t32 - roff);
@@ -4839,8 +4824,7 @@ next_dirty_page_vcn:
goto out;
}
attr = oa->attr;
- t64 = le64_to_cpu(attr->nres.alloc_size);
- if (size > t64) {
+ if (size > le64_to_cpu(attr->nres.alloc_size)) {
attr->nres.valid_size = attr->nres.data_size =
attr->nres.alloc_size = cpu_to_le64(size);
}
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 4ed15f64b17f..567563771bf8 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -98,6 +98,30 @@ const __le16 WOF_NAME[17] = {
};
#endif
+static const __le16 CON_NAME[3] = {
+ cpu_to_le16('C'), cpu_to_le16('O'), cpu_to_le16('N'),
+};
+
+static const __le16 NUL_NAME[3] = {
+ cpu_to_le16('N'), cpu_to_le16('U'), cpu_to_le16('L'),
+};
+
+static const __le16 AUX_NAME[3] = {
+ cpu_to_le16('A'), cpu_to_le16('U'), cpu_to_le16('X'),
+};
+
+static const __le16 PRN_NAME[3] = {
+ cpu_to_le16('P'), cpu_to_le16('R'), cpu_to_le16('N'),
+};
+
+static const __le16 COM_NAME[3] = {
+ cpu_to_le16('C'), cpu_to_le16('O'), cpu_to_le16('M'),
+};
+
+static const __le16 LPT_NAME[3] = {
+ cpu_to_le16('L'), cpu_to_le16('P'), cpu_to_le16('T'),
+};
+
// clang-format on
/*
@@ -322,35 +346,6 @@ out:
}
/*
- * ntfs_query_def
- *
- * Return: Current ATTR_DEF_ENTRY for given attribute type.
- */
-const struct ATTR_DEF_ENTRY *ntfs_query_def(struct ntfs_sb_info *sbi,
- enum ATTR_TYPE type)
-{
- int type_in = le32_to_cpu(type);
- size_t min_idx = 0;
- size_t max_idx = sbi->def_entries - 1;
-
- while (min_idx <= max_idx) {
- size_t i = min_idx + ((max_idx - min_idx) >> 1);
- const struct ATTR_DEF_ENTRY *entry = sbi->def_table + i;
- int diff = le32_to_cpu(entry->type) - type_in;
-
- if (!diff)
- return entry;
- if (diff < 0)
- min_idx = i + 1;
- else if (i)
- max_idx = i - 1;
- else
- return NULL;
- }
- return NULL;
-}
-
-/*
* ntfs_look_for_free_space - Look for a free space in bitmap.
*/
int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
@@ -449,6 +444,39 @@ up_write:
}
/*
+ * ntfs_check_for_free_space
+ *
+ * Check if it is possible to allocate 'clen' clusters and 'mlen' Mft records
+ */
+bool ntfs_check_for_free_space(struct ntfs_sb_info *sbi, CLST clen, CLST mlen)
+{
+ size_t free, zlen, avail;
+ struct wnd_bitmap *wnd;
+
+ wnd = &sbi->used.bitmap;
+ down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
+ free = wnd_zeroes(wnd);
+ zlen = min_t(size_t, NTFS_MIN_MFT_ZONE, wnd_zone_len(wnd));
+ up_read(&wnd->rw_lock);
+
+ if (free < zlen + clen)
+ return false;
+
+ avail = free - (zlen + clen);
+
+ wnd = &sbi->mft.bitmap;
+ down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT);
+ free = wnd_zeroes(wnd);
+ zlen = wnd_zone_len(wnd);
+ up_read(&wnd->rw_lock);
+
+ if (free >= zlen + mlen)
+ return true;
+
+ return avail >= bytes_to_cluster(sbi, mlen << sbi->record_bits);
+}
+
+/*
* ntfs_extend_mft - Allocate additional MFT records.
*
* sbi->mft.bitmap is locked for write.
@@ -475,7 +503,7 @@ static int ntfs_extend_mft(struct ntfs_sb_info *sbi)
struct ATTRIB *attr;
struct wnd_bitmap *wnd = &sbi->mft.bitmap;
- new_mft_total = (wnd->nbits + MFT_INCREASE_CHUNK + 127) & (CLST)~127;
+ new_mft_total = ALIGN(wnd->nbits + NTFS_MFT_INCREASE_STEP, 128);
new_mft_bytes = (u64)new_mft_total << sbi->record_bits;
/* Step 1: Resize $MFT::DATA. */
@@ -618,13 +646,13 @@ next:
NULL, 0, NULL, NULL))
goto next;
- __clear_bit(ir - MFT_REC_RESERVED,
+ __clear_bit_le(ir - MFT_REC_RESERVED,
&sbi->mft.reserved_bitmap);
}
}
/* Scan 5 bits for zero. Bit 0 == MFT_REC_RESERVED */
- zbit = find_next_zero_bit(&sbi->mft.reserved_bitmap,
+ zbit = find_next_zero_bit_le(&sbi->mft.reserved_bitmap,
MFT_REC_FREE, MFT_REC_RESERVED);
if (zbit >= MFT_REC_FREE) {
sbi->mft.next_reserved = MFT_REC_FREE;
@@ -692,7 +720,7 @@ found:
if (*rno >= MFT_REC_FREE)
wnd_set_used(wnd, *rno, 1);
else if (*rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited)
- __set_bit(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
+ __set_bit_le(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
out:
if (!mft)
@@ -720,7 +748,7 @@ void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft)
else
wnd_set_free(wnd, rno, 1);
} else if (rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) {
- __clear_bit(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
+ __clear_bit_le(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
}
if (rno < wnd_zone_bit(wnd))
@@ -830,7 +858,6 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
if (!(sbi->flags & NTFS_FLAGS_MFTMIRR))
return;
- err = 0;
bytes = sbi->mft.recs_mirr << sbi->record_bits;
block1 = sbi->mft.lbo >> sb->s_blocksize_bits;
block2 = sbi->mft.lbo2 >> sb->s_blocksize_bits;
@@ -860,8 +887,7 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
put_bh(bh1);
bh1 = NULL;
- if (wait)
- err = sync_dirty_buffer(bh2);
+ err = wait ? sync_dirty_buffer(bh2) : 0;
put_bh(bh2);
if (err)
@@ -1849,9 +1875,10 @@ int ntfs_security_init(struct ntfs_sb_info *sbi)
goto out;
}
- root_sdh = resident_data(attr);
+ root_sdh = resident_data_ex(attr, sizeof(struct INDEX_ROOT));
if (root_sdh->type != ATTR_ZERO ||
- root_sdh->rule != NTFS_COLLATION_TYPE_SECURITY_HASH) {
+ root_sdh->rule != NTFS_COLLATION_TYPE_SECURITY_HASH ||
+ offsetof(struct INDEX_ROOT, ihdr) + root_sdh->ihdr.used > attr->res.data_size) {
err = -EINVAL;
goto out;
}
@@ -1867,9 +1894,10 @@ int ntfs_security_init(struct ntfs_sb_info *sbi)
goto out;
}
- root_sii = resident_data(attr);
+ root_sii = resident_data_ex(attr, sizeof(struct INDEX_ROOT));
if (root_sii->type != ATTR_ZERO ||
- root_sii->rule != NTFS_COLLATION_TYPE_UINT) {
+ root_sii->rule != NTFS_COLLATION_TYPE_UINT ||
+ offsetof(struct INDEX_ROOT, ihdr) + root_sii->ihdr.used > attr->res.data_size) {
err = -EINVAL;
goto out;
}
@@ -2502,3 +2530,83 @@ int run_deallocate(struct ntfs_sb_info *sbi, struct runs_tree *run, bool trim)
return 0;
}
+
+static inline bool name_has_forbidden_chars(const struct le_str *fname)
+{
+ int i, ch;
+
+ /* check for forbidden chars */
+ for (i = 0; i < fname->len; ++i) {
+ ch = le16_to_cpu(fname->name[i]);
+
+ /* control chars */
+ if (ch < 0x20)
+ return true;
+
+ switch (ch) {
+ /* disallowed by Windows */
+ case '\\':
+ case '/':
+ case ':':
+ case '*':
+ case '?':
+ case '<':
+ case '>':
+ case '|':
+ case '\"':
+ return true;
+
+ default:
+ /* allowed char */
+ break;
+ }
+ }
+
+ /* file names cannot end with space or . */
+ if (fname->len > 0) {
+ ch = le16_to_cpu(fname->name[fname->len - 1]);
+ if (ch == ' ' || ch == '.')
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool is_reserved_name(struct ntfs_sb_info *sbi,
+ const struct le_str *fname)
+{
+ int port_digit;
+ const __le16 *name = fname->name;
+ int len = fname->len;
+ u16 *upcase = sbi->upcase;
+
+ /* check for 3 chars reserved names (device names) */
+ /* name by itself or with any extension is forbidden */
+ if (len == 3 || (len > 3 && le16_to_cpu(name[3]) == '.'))
+ if (!ntfs_cmp_names(name, 3, CON_NAME, 3, upcase, false) ||
+ !ntfs_cmp_names(name, 3, NUL_NAME, 3, upcase, false) ||
+ !ntfs_cmp_names(name, 3, AUX_NAME, 3, upcase, false) ||
+ !ntfs_cmp_names(name, 3, PRN_NAME, 3, upcase, false))
+ return true;
+
+ /* check for 4 chars reserved names (port name followed by 1..9) */
+ /* name by itself or with any extension is forbidden */
+ if (len == 4 || (len > 4 && le16_to_cpu(name[4]) == '.')) {
+ port_digit = le16_to_cpu(name[3]);
+ if (port_digit >= '1' && port_digit <= '9')
+ if (!ntfs_cmp_names(name, 3, COM_NAME, 3, upcase, false) ||
+ !ntfs_cmp_names(name, 3, LPT_NAME, 3, upcase, false))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * valid_windows_name - Check if a file name is valid in Windows.
+ */
+bool valid_windows_name(struct ntfs_sb_info *sbi, const struct le_str *fname)
+{
+ return !name_has_forbidden_chars(fname) &&
+ !is_reserved_name(sbi, fname);
+}
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 440328147e7e..51ab75954640 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -47,7 +47,7 @@ static int cmp_fnames(const void *key1, size_t l1, const void *key2, size_t l2,
if (l2 < fsize2)
return -1;
- both_case = f2->type != FILE_NAME_DOS /*&& !sbi->options.nocase*/;
+ both_case = f2->type != FILE_NAME_DOS && !sbi->options->nocase;
if (!l1) {
const struct le_str *s2 = (struct le_str *)&f2->name_len;
@@ -323,7 +323,7 @@ static int indx_mark_used(struct ntfs_index *indx, struct ntfs_inode *ni,
if (err)
return err;
- __set_bit(bit - bbuf.bit, bbuf.buf);
+ __set_bit_le(bit - bbuf.bit, bbuf.buf);
bmp_buf_put(&bbuf, true);
@@ -343,7 +343,7 @@ static int indx_mark_free(struct ntfs_index *indx, struct ntfs_inode *ni,
if (err)
return err;
- __clear_bit(bit - bbuf.bit, bbuf.buf);
+ __clear_bit_le(bit - bbuf.bit, bbuf.buf);
bmp_buf_put(&bbuf, true);
@@ -457,7 +457,7 @@ next_run:
static bool scan_for_free(const ulong *buf, u32 bit, u32 bits, size_t *ret)
{
- size_t pos = find_next_zero_bit(buf, bits, bit);
+ size_t pos = find_next_zero_bit_le(buf, bits, bit);
if (pos >= bits)
return false;
@@ -489,7 +489,7 @@ static int indx_find_free(struct ntfs_index *indx, struct ntfs_inode *ni,
if (!b->non_res) {
u32 nbits = 8 * le32_to_cpu(b->res.data_size);
- size_t pos = find_next_zero_bit(resident_data(b), nbits, 0);
+ size_t pos = find_next_zero_bit_le(resident_data(b), nbits, 0);
if (pos < nbits)
*bit = pos;
@@ -505,7 +505,7 @@ static int indx_find_free(struct ntfs_index *indx, struct ntfs_inode *ni,
static bool scan_for_used(const ulong *buf, u32 bit, u32 bits, size_t *ret)
{
- size_t pos = find_next_bit(buf, bits, bit);
+ size_t pos = find_next_bit_le(buf, bits, bit);
if (pos >= bits)
return false;
@@ -536,7 +536,7 @@ int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit)
if (!b->non_res) {
u32 nbits = le32_to_cpu(b->res.data_size) * 8;
- size_t pos = find_next_bit(resident_data(b), nbits, from);
+ size_t pos = find_next_bit_le(resident_data(b), nbits, from);
if (pos < nbits)
*bit = pos;
@@ -605,11 +605,58 @@ static const struct NTFS_DE *hdr_insert_head(struct INDEX_HDR *hdr,
return e;
}
+/*
+ * index_hdr_check
+ *
+ * return true if INDEX_HDR is valid
+ */
+static bool index_hdr_check(const struct INDEX_HDR *hdr, u32 bytes)
+{
+ u32 end = le32_to_cpu(hdr->used);
+ u32 tot = le32_to_cpu(hdr->total);
+ u32 off = le32_to_cpu(hdr->de_off);
+
+ if (!IS_ALIGNED(off, 8) || tot > bytes || end > tot ||
+ off + sizeof(struct NTFS_DE) > end) {
+ /* incorrect index buffer. */
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * index_buf_check
+ *
+ * return true if INDEX_BUFFER seems is valid
+ */
+static bool index_buf_check(const struct INDEX_BUFFER *ib, u32 bytes,
+ const CLST *vbn)
+{
+ const struct NTFS_RECORD_HEADER *rhdr = &ib->rhdr;
+ u16 fo = le16_to_cpu(rhdr->fix_off);
+ u16 fn = le16_to_cpu(rhdr->fix_num);
+
+ if (bytes <= offsetof(struct INDEX_BUFFER, ihdr) ||
+ rhdr->sign != NTFS_INDX_SIGNATURE ||
+ fo < sizeof(struct INDEX_BUFFER)
+ /* Check index buffer vbn. */
+ || (vbn && *vbn != le64_to_cpu(ib->vbn)) || (fo % sizeof(short)) ||
+ fo + fn * sizeof(short) >= bytes ||
+ fn != ((bytes >> SECTOR_SHIFT) + 1)) {
+ /* incorrect index buffer. */
+ return false;
+ }
+
+ return index_hdr_check(&ib->ihdr,
+ bytes - offsetof(struct INDEX_BUFFER, ihdr));
+}
+
void fnd_clear(struct ntfs_fnd *fnd)
{
int i;
- for (i = 0; i < fnd->level; i++) {
+ for (i = fnd->level - 1; i >= 0; i--) {
struct indx_node *n = fnd->nodes[i];
if (!n)
@@ -625,9 +672,8 @@ void fnd_clear(struct ntfs_fnd *fnd)
static int fnd_push(struct ntfs_fnd *fnd, struct indx_node *n,
struct NTFS_DE *e)
{
- int i;
+ int i = fnd->level;
- i = fnd->level;
if (i < 0 || i >= ARRAY_SIZE(fnd->nodes))
return -EINVAL;
fnd->nodes[i] = n;
@@ -820,9 +866,16 @@ int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi,
u32 t32;
const struct INDEX_ROOT *root = resident_data(attr);
+ t32 = le32_to_cpu(attr->res.data_size);
+ if (t32 <= offsetof(struct INDEX_ROOT, ihdr) ||
+ !index_hdr_check(&root->ihdr,
+ t32 - offsetof(struct INDEX_ROOT, ihdr))) {
+ goto out;
+ }
+
/* Check root fields. */
if (!root->index_block_clst)
- return -EINVAL;
+ goto out;
indx->type = type;
indx->idx2vbn_bits = __ffs(root->index_block_clst);
@@ -834,19 +887,19 @@ int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi,
if (t32 < sbi->cluster_size) {
/* Index record is smaller than a cluster, use 512 blocks. */
if (t32 != root->index_block_clst * SECTOR_SIZE)
- return -EINVAL;
+ goto out;
/* Check alignment to a cluster. */
if ((sbi->cluster_size >> SECTOR_SHIFT) &
(root->index_block_clst - 1)) {
- return -EINVAL;
+ goto out;
}
indx->vbn2vbo_bits = SECTOR_SHIFT;
} else {
/* Index record must be a multiple of cluster size. */
if (t32 != root->index_block_clst << sbi->cluster_bits)
- return -EINVAL;
+ goto out;
indx->vbn2vbo_bits = sbi->cluster_bits;
}
@@ -854,7 +907,14 @@ int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi,
init_rwsem(&indx->run_lock);
indx->cmp = get_cmp_func(root);
- return indx->cmp ? 0 : -EINVAL;
+ if (!indx->cmp)
+ goto out;
+
+ return 0;
+
+out:
+ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
+ return -EINVAL;
}
static struct indx_node *indx_new(struct ntfs_index *indx,
@@ -1012,11 +1072,24 @@ int indx_read(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn,
goto out;
ok:
+ if (!index_buf_check(ib, bytes, &vbn)) {
+ ntfs_inode_err(&ni->vfs_inode, "directory corrupted");
+ ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR);
+ err = -EINVAL;
+ goto out;
+ }
+
if (err == -E_NTFS_FIXUP) {
ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &in->nb, 0);
err = 0;
}
+ /* check for index header length */
+ if (offsetof(struct INDEX_BUFFER, ihdr) + ib->ihdr.used > bytes) {
+ err = -EINVAL;
+ goto out;
+ }
+
in->index = ib;
*node = in;
@@ -1341,8 +1414,8 @@ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
run_init(&run);
- err = attr_allocate_clusters(sbi, &run, 0, 0, len, NULL, 0, &alen, 0,
- NULL);
+ err = attr_allocate_clusters(sbi, &run, 0, 0, len, NULL, ALLOCATE_DEF,
+ &alen, 0, NULL, NULL);
if (err)
goto out;
@@ -1440,6 +1513,9 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
goto out1;
}
+ if (in->name == I30_NAME)
+ ni->vfs_inode.i_size = data_size;
+
*vbn = bit << indx->idx2vbn_bits;
return 0;
@@ -1593,9 +1669,9 @@ static int indx_insert_into_root(struct ntfs_index *indx, struct ntfs_inode *ni,
if (err) {
/* Restore root. */
- if (mi_resize_attr(mi, attr, -ds_root))
+ if (mi_resize_attr(mi, attr, -ds_root)) {
memcpy(attr, a_root, asize);
- else {
+ } else {
/* Bug? */
ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
}
@@ -1947,7 +2023,7 @@ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni,
if (bit >= nbits)
return 0;
- pos = find_next_bit(bm, nbits, bit);
+ pos = find_next_bit_le(bm, nbits, bit);
if (pos < nbits)
return 0;
} else {
@@ -1973,6 +2049,9 @@ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni,
if (err)
return err;
+ if (in->name == I30_NAME)
+ ni->vfs_inode.i_size = new_data;
+
bpb = bitmap_size(bit);
if (bpb * 8 == nbits)
return 0;
@@ -2115,9 +2194,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx,
fnd->de[level] = e;
indx_write(indx, ni, n, 0);
- /* Check to see if this action created an empty leaf. */
- if (ib_is_leaf(ib) && ib_is_empty(ib))
+ if (ib_is_leaf(ib) && ib_is_empty(ib)) {
+ /* An empty leaf. */
return 0;
+ }
out:
fnd_clear(fnd);
@@ -2455,6 +2535,9 @@ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len,
&indx->alloc_run, 0, NULL, false, NULL);
+ if (in->name == I30_NAME)
+ ni->vfs_inode.i_size = 0;
+
err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len,
false, NULL);
run_close(&indx->alloc_run);
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index d5a3afbbbfd8..20b953871574 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -81,7 +81,7 @@ static struct inode *ntfs_read_mft(struct inode *inode,
le16_to_cpu(ref->seq), le16_to_cpu(rec->seq));
goto out;
} else if (!is_rec_inuse(rec)) {
- err = -EINVAL;
+ err = -ESTALE;
ntfs_err(sb, "Inode r=%x is not in use!", (u32)ino);
goto out;
}
@@ -92,8 +92,10 @@ static struct inode *ntfs_read_mft(struct inode *inode,
goto out;
}
- if (!is_rec_base(rec))
- goto Ok;
+ if (!is_rec_base(rec)) {
+ err = -EINVAL;
+ goto out;
+ }
/* Record should contain $I30 root. */
is_dir = rec->flags & RECORD_FLAG_DIR;
@@ -129,6 +131,16 @@ next_attr:
rsize = attr->non_res ? 0 : le32_to_cpu(attr->res.data_size);
asize = le32_to_cpu(attr->size);
+ if (le16_to_cpu(attr->name_off) + attr->name_len > asize)
+ goto out;
+
+ if (attr->non_res) {
+ t64 = le64_to_cpu(attr->nres.alloc_size);
+ if (le64_to_cpu(attr->nres.data_size) > t64 ||
+ le64_to_cpu(attr->nres.valid_size) > t64)
+ goto out;
+ }
+
switch (attr->type) {
case ATTR_STD:
if (attr->non_res ||
@@ -364,7 +376,13 @@ next_attr:
attr_unpack_run:
roff = le16_to_cpu(attr->nres.run_off);
+ if (roff > asize) {
+ err = -EINVAL;
+ goto out;
+ }
+
t64 = le64_to_cpu(attr->nres.svcn);
+
err = run_unpack_ex(run, sbi, ino, t64, le64_to_cpu(attr->nres.evcn),
t64, Add2Ptr(attr, roff), asize - roff);
if (err < 0)
@@ -450,7 +468,6 @@ end_enum:
inode->i_flags |= S_NOSEC;
}
-Ok:
if (ino == MFT_REC_MFT && !sb->s_root)
sbi->mft.ni = NULL;
@@ -504,6 +521,9 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref,
_ntfs_bad_inode(inode);
}
+ if (IS_ERR(inode) && name)
+ ntfs_set_state(sb->s_fs_info, NTFS_DIRTY_ERROR);
+
return inode;
}
@@ -535,17 +555,6 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
clear_buffer_new(bh);
clear_buffer_uptodate(bh);
- /* Direct write uses 'create=0'. */
- if (!create && vbo >= ni->i_valid) {
- /* Out of valid. */
- return 0;
- }
-
- if (vbo >= inode->i_size) {
- /* Out of size. */
- return 0;
- }
-
if (is_resident(ni)) {
ni_lock(ni);
err = attr_data_read_resident(ni, page);
@@ -561,7 +570,8 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
off = vbo & sbi->cluster_mask;
new = false;
- err = attr_data_get_block(ni, vcn, 1, &lcn, &len, create ? &new : NULL);
+ err = attr_data_get_block(ni, vcn, 1, &lcn, &len, create ? &new : NULL,
+ create && sbi->cluster_size > PAGE_SIZE);
if (err)
goto out;
@@ -579,11 +589,8 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
WARN_ON(1);
}
- if (new) {
+ if (new)
set_buffer_new(bh);
- if ((len << cluster_bits) > block_size)
- ntfs_sparse_cluster(inode, page, vcn, len);
- }
lbo = ((u64)lcn << cluster_bits) + off;
@@ -611,7 +618,6 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
}
} else if (vbo >= valid) {
/* Read out of valid data. */
- /* Should never be here 'cause already checked. */
clear_buffer_mapped(bh);
} else if (vbo + bytes <= valid) {
/* Normal read. */
@@ -953,6 +959,11 @@ int ntfs_write_end(struct file *file, struct address_space *mapping,
dirty = true;
}
+ if (pos + err > inode->i_size) {
+ inode->i_size = pos + err;
+ dirty = true;
+ }
+
if (dirty)
mark_inode_dirty(inode);
}
@@ -1162,6 +1173,18 @@ out:
return ERR_PTR(err);
}
+/*
+ * ntfs_create_inode
+ *
+ * Helper function for:
+ * - ntfs_create
+ * - ntfs_mknod
+ * - ntfs_symlink
+ * - ntfs_mkdir
+ * - ntfs_atomic_open
+ *
+ * NOTE: if fnd != NULL (ntfs_atomic_open) then @dir is locked
+ */
struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
struct inode *dir, struct dentry *dentry,
const struct cpu_str *uni, umode_t mode,
@@ -1191,7 +1214,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
struct REPARSE_DATA_BUFFER *rp = NULL;
bool rp_inserted = false;
- ni_lock_dir(dir_ni);
+ if (!fnd)
+ ni_lock_dir(dir_ni);
dir_root = indx_get_root(&dir_ni->dir, dir_ni, NULL, NULL);
if (!dir_root) {
@@ -1254,6 +1278,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
fa = FILE_ATTRIBUTE_ARCHIVE;
}
+ /* If option "hide_dot_files" then set hidden attribute for dot files. */
+ if (sbi->options->hide_dot_files && name->name[0] == '.')
+ fa |= FILE_ATTRIBUTE_HIDDEN;
+
if (!(mode & 0222))
fa |= FILE_ATTRIBUTE_READONLY;
@@ -1339,6 +1367,13 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
mi_get_ref(&ni->mi, &new_de->ref);
fname = (struct ATTR_FILE_NAME *)(new_de + 1);
+
+ if (sbi->options->windows_names &&
+ !valid_windows_name(sbi, (struct le_str *)&fname->name_len)) {
+ err = -EINVAL;
+ goto out4;
+ }
+
mi_get_ref(&dir_ni->mi, &fname->home);
fname->dup.cr_time = fname->dup.m_time = fname->dup.c_time =
fname->dup.a_time = std5->cr_time;
@@ -1502,8 +1537,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
cpu_to_le64(ntfs_up_cluster(sbi, nsize));
err = attr_allocate_clusters(sbi, &ni->file.run, 0, 0,
- clst, NULL, 0, &alen, 0,
- NULL);
+ clst, NULL, ALLOCATE_DEF,
+ &alen, 0, NULL, NULL);
if (err)
goto out5;
@@ -1550,7 +1585,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
goto out6;
/* Unlock parent directory before ntfs_init_acl. */
- ni_unlock(dir_ni);
+ if (!fnd)
+ ni_unlock(dir_ni);
inode->i_generation = le16_to_cpu(rec->seq);
@@ -1610,7 +1646,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
out7:
/* Undo 'indx_insert_entry'. */
- ni_lock_dir(dir_ni);
+ if (!fnd)
+ ni_lock_dir(dir_ni);
indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1,
le16_to_cpu(new_de->key_size), sbi);
/* ni_unlock(dir_ni); will be called later. */
@@ -1619,10 +1656,8 @@ out6:
ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref);
out5:
- if (S_ISDIR(mode) || run_is_empty(&ni->file.run))
- goto out4;
-
- run_deallocate(sbi, &ni->file.run, false);
+ if (!S_ISDIR(mode))
+ run_deallocate(sbi, &ni->file.run, false);
out4:
clear_rec_inuse(rec);
@@ -1638,7 +1673,8 @@ out2:
out1:
if (err) {
- ni_unlock(dir_ni);
+ if (!fnd)
+ ni_unlock(dir_ni);
return ERR_PTR(err);
}
@@ -1746,7 +1782,103 @@ void ntfs_evict_inode(struct inode *inode)
ni_clear(ntfs_i(inode));
}
-static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer,
+/*
+ * ntfs_translate_junction
+ *
+ * Translate a Windows junction target to the Linux equivalent.
+ * On junctions, targets are always absolute (they include the drive
+ * letter). We have no way of knowing if the target is for the current
+ * mounted device or not so we just assume it is.
+ */
+static int ntfs_translate_junction(const struct super_block *sb,
+ const struct dentry *link_de, char *target,
+ int target_len, int target_max)
+{
+ int tl_len, err = target_len;
+ char *link_path_buffer = NULL, *link_path;
+ char *translated = NULL;
+ char *target_start;
+ int copy_len;
+
+ link_path_buffer = kmalloc(PATH_MAX, GFP_NOFS);
+ if (!link_path_buffer) {
+ err = -ENOMEM;
+ goto out;
+ }
+ /* Get link path, relative to mount point */
+ link_path = dentry_path_raw(link_de, link_path_buffer, PATH_MAX);
+ if (IS_ERR(link_path)) {
+ ntfs_err(sb, "Error getting link path");
+ err = -EINVAL;
+ goto out;
+ }
+
+ translated = kmalloc(PATH_MAX, GFP_NOFS);
+ if (!translated) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Make translated path a relative path to mount point */
+ strcpy(translated, "./");
+ ++link_path; /* Skip leading / */
+ for (tl_len = sizeof("./") - 1; *link_path; ++link_path) {
+ if (*link_path == '/') {
+ if (PATH_MAX - tl_len < sizeof("../")) {
+ ntfs_err(sb,
+ "Link path %s has too many components",
+ link_path);
+ err = -EINVAL;
+ goto out;
+ }
+ strcpy(translated + tl_len, "../");
+ tl_len += sizeof("../") - 1;
+ }
+ }
+
+ /* Skip drive letter */
+ target_start = target;
+ while (*target_start && *target_start != ':')
+ ++target_start;
+
+ if (!*target_start) {
+ ntfs_err(sb, "Link target (%s) missing drive separator",
+ target);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Skip drive separator and leading /, if exists */
+ target_start += 1 + (target_start[1] == '/');
+ copy_len = target_len - (target_start - target);
+
+ if (PATH_MAX - tl_len <= copy_len) {
+ ntfs_err(sb, "Link target %s too large for buffer (%d <= %d)",
+ target_start, PATH_MAX - tl_len, copy_len);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* translated path has a trailing / and target_start does not */
+ strcpy(translated + tl_len, target_start);
+ tl_len += copy_len;
+ if (target_max <= tl_len) {
+ ntfs_err(sb, "Target path %s too large for buffer (%d <= %d)",
+ translated, target_max, tl_len);
+ err = -EINVAL;
+ goto out;
+ }
+ strcpy(target, translated);
+ err = tl_len;
+
+out:
+ kfree(link_path_buffer);
+ kfree(translated);
+ return err;
+}
+
+static noinline int ntfs_readlink_hlp(const struct dentry *link_de,
+ struct inode *inode, char *buffer,
int buflen)
{
int i, err = -EINVAL;
@@ -1889,6 +2021,11 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer,
/* Always set last zero. */
buffer[err] = 0;
+
+ /* If this is a junction, translate the link target. */
+ if (rp->ReparseTag == IO_REPARSE_TAG_MOUNT_POINT)
+ err = ntfs_translate_junction(sb, link_de, buffer, err, buflen);
+
out:
kfree(to_free);
return err;
@@ -1907,7 +2044,7 @@ static const char *ntfs_get_link(struct dentry *de, struct inode *inode,
if (!ret)
return ERR_PTR(-ENOMEM);
- err = ntfs_readlink_hlp(inode, ret, PAGE_SIZE);
+ err = ntfs_readlink_hlp(de, inode, ret, PAGE_SIZE);
if (err < 0) {
kfree(ret);
return ERR_PTR(err);
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index 053cc0e0f8b5..c8db35e2ae17 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -7,6 +7,8 @@
#include <linux/fs.h>
#include <linux/nls.h>
+#include <linux/ctype.h>
+#include <linux/posix_acl.h>
#include "debug.h"
#include "ntfs.h"
@@ -303,6 +305,8 @@ static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir,
ni_lock_dir(dir_ni);
ni_lock(ni);
+ if (dir_ni != new_dir_ni)
+ ni_lock_dir2(new_dir_ni);
is_bad = false;
err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de, &is_bad);
@@ -326,6 +330,8 @@ static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir,
ntfs_sync_inode(inode);
}
+ if (dir_ni != new_dir_ni)
+ ni_unlock(new_dir_ni);
ni_unlock(ni);
ni_unlock(dir_ni);
out:
@@ -333,6 +339,104 @@ out:
return err;
}
+/*
+ * ntfs_atomic_open
+ *
+ * inode_operations::atomic_open
+ */
+static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry,
+ struct file *file, u32 flags, umode_t mode)
+{
+ int err;
+ struct inode *inode;
+ struct ntfs_fnd *fnd = NULL;
+ struct ntfs_inode *ni = ntfs_i(dir);
+ struct dentry *d = NULL;
+ struct cpu_str *uni = __getname();
+ bool locked = false;
+
+ if (!uni)
+ return -ENOMEM;
+
+ err = ntfs_nls_to_utf16(ni->mi.sbi, dentry->d_name.name,
+ dentry->d_name.len, uni, NTFS_NAME_LEN,
+ UTF16_HOST_ENDIAN);
+ if (err < 0)
+ goto out;
+
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+ if (IS_POSIXACL(dir)) {
+ /*
+ * Load in cache current acl to avoid ni_lock(dir):
+ * ntfs_create_inode -> ntfs_init_acl -> posix_acl_create ->
+ * ntfs_get_acl -> ntfs_get_acl_ex -> ni_lock
+ */
+ struct posix_acl *p = get_inode_acl(dir, ACL_TYPE_DEFAULT);
+
+ if (IS_ERR(p)) {
+ err = PTR_ERR(p);
+ goto out;
+ }
+ posix_acl_release(p);
+ }
+#endif
+
+ if (d_in_lookup(dentry)) {
+ ni_lock_dir(ni);
+ locked = true;
+ fnd = fnd_get();
+ if (!fnd) {
+ err = -ENOMEM;
+ goto out1;
+ }
+
+ d = d_splice_alias(dir_search_u(dir, uni, fnd), dentry);
+ if (IS_ERR(d)) {
+ err = PTR_ERR(d);
+ d = NULL;
+ goto out2;
+ }
+
+ if (d)
+ dentry = d;
+ }
+
+ if (!(flags & O_CREAT) || d_really_is_positive(dentry)) {
+ err = finish_no_open(file, d);
+ goto out2;
+ }
+
+ file->f_mode |= FMODE_CREATED;
+
+ /*
+ * fnd contains tree's path to insert to.
+ * If fnd is not NULL then dir is locked.
+ */
+
+ /*
+ * Unfortunately I don't know how to get here correct 'struct nameidata *nd'
+ * or 'struct user_namespace *mnt_userns'.
+ * See atomic_open in fs/namei.c.
+ * This is why xfstest/633 failed.
+ * Looks like ntfs_atomic_open must accept 'struct user_namespace *mnt_userns' as argument.
+ */
+
+ inode = ntfs_create_inode(&init_user_ns, dir, dentry, uni, mode, 0,
+ NULL, 0, fnd);
+ err = IS_ERR(inode) ? PTR_ERR(inode)
+ : finish_open(file, dentry, ntfs_file_open);
+ dput(d);
+
+out2:
+ fnd_put(fnd);
+out1:
+ if (locked)
+ ni_unlock(ni);
+out:
+ __putname(uni);
+ return err;
+}
+
struct dentry *ntfs3_get_parent(struct dentry *child)
{
struct inode *inode = d_inode(child);
@@ -355,6 +459,133 @@ struct dentry *ntfs3_get_parent(struct dentry *child)
return ERR_PTR(-ENOENT);
}
+/*
+ * dentry_operations::d_hash
+ */
+static int ntfs_d_hash(const struct dentry *dentry, struct qstr *name)
+{
+ struct ntfs_sb_info *sbi;
+ const char *n = name->name;
+ unsigned int len = name->len;
+ unsigned long hash;
+ struct cpu_str *uni;
+ unsigned int c;
+ int err;
+
+ /* First try fast implementation. */
+ hash = init_name_hash(dentry);
+
+ for (;;) {
+ if (!len--) {
+ name->hash = end_name_hash(hash);
+ return 0;
+ }
+
+ c = *n++;
+ if (c >= 0x80)
+ break;
+
+ hash = partial_name_hash(toupper(c), hash);
+ }
+
+ /*
+ * Try slow way with current upcase table
+ */
+ uni = __getname();
+ if (!uni)
+ return -ENOMEM;
+
+ sbi = dentry->d_sb->s_fs_info;
+
+ err = ntfs_nls_to_utf16(sbi, name->name, name->len, uni, NTFS_NAME_LEN,
+ UTF16_HOST_ENDIAN);
+ if (err < 0)
+ goto out;
+
+ if (!err) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ hash = ntfs_names_hash(uni->name, uni->len, sbi->upcase,
+ init_name_hash(dentry));
+ name->hash = end_name_hash(hash);
+ err = 0;
+
+out:
+ __putname(uni);
+ return err;
+}
+
+/*
+ * dentry_operations::d_compare
+ */
+static int ntfs_d_compare(const struct dentry *dentry, unsigned int len1,
+ const char *str, const struct qstr *name)
+{
+ struct ntfs_sb_info *sbi;
+ int ret;
+ const char *n1 = str;
+ const char *n2 = name->name;
+ unsigned int len2 = name->len;
+ unsigned int lm = min(len1, len2);
+ unsigned char c1, c2;
+ struct cpu_str *uni1;
+ struct le_str *uni2;
+
+ /* First try fast implementation. */
+ for (;;) {
+ if (!lm--)
+ return len1 != len2;
+
+ if ((c1 = *n1++) == (c2 = *n2++))
+ continue;
+
+ if (c1 >= 0x80 || c2 >= 0x80)
+ break;
+
+ if (toupper(c1) != toupper(c2))
+ return 1;
+ }
+
+ /*
+ * Try slow way with current upcase table
+ */
+ sbi = dentry->d_sb->s_fs_info;
+ uni1 = __getname();
+ if (!uni1)
+ return -ENOMEM;
+
+ ret = ntfs_nls_to_utf16(sbi, str, len1, uni1, NTFS_NAME_LEN,
+ UTF16_HOST_ENDIAN);
+ if (ret < 0)
+ goto out;
+
+ if (!ret) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ uni2 = Add2Ptr(uni1, 2048);
+
+ ret = ntfs_nls_to_utf16(sbi, name->name, name->len,
+ (struct cpu_str *)uni2, NTFS_NAME_LEN,
+ UTF16_LITTLE_ENDIAN);
+ if (ret < 0)
+ goto out;
+
+ if (!ret) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = !ntfs_cmp_names_cpu(uni1, uni2, sbi->upcase, false) ? 0 : 1;
+
+out:
+ __putname(uni1);
+ return ret;
+}
+
// clang-format off
const struct inode_operations ntfs_dir_inode_operations = {
.lookup = ntfs_lookup,
@@ -372,6 +603,7 @@ const struct inode_operations ntfs_dir_inode_operations = {
.setattr = ntfs3_setattr,
.getattr = ntfs_getattr,
.listxattr = ntfs_listxattr,
+ .atomic_open = ntfs_atomic_open,
.fiemap = ntfs_fiemap,
};
@@ -382,4 +614,10 @@ const struct inode_operations ntfs_special_inode_operations = {
.get_inode_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
};
+
+const struct dentry_operations ntfs_dentry_ops = {
+ .d_hash = ntfs_d_hash,
+ .d_compare = ntfs_d_compare,
+};
+
// clang-format on
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 9cc396b117bf..86ea1826d099 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -84,7 +84,6 @@ typedef u32 CLST;
#define COMPRESSION_UNIT 4
#define COMPRESS_MAX_CLUSTER 0x1000
-#define MFT_INCREASE_CHUNK 1024
enum RECORD_NUM {
MFT_REC_MFT = 0,
@@ -715,12 +714,13 @@ static inline struct NTFS_DE *hdr_first_de(const struct INDEX_HDR *hdr)
{
u32 de_off = le32_to_cpu(hdr->de_off);
u32 used = le32_to_cpu(hdr->used);
- struct NTFS_DE *e = Add2Ptr(hdr, de_off);
+ struct NTFS_DE *e;
u16 esize;
- if (de_off >= used || de_off >= le32_to_cpu(hdr->total))
+ if (de_off >= used || de_off + sizeof(struct NTFS_DE) > used )
return NULL;
+ e = Add2Ptr(hdr, de_off);
esize = le16_to_cpu(e->size);
if (esize < sizeof(struct NTFS_DE) || de_off + esize > used)
return NULL;
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index a4d292809a33..0e051c5595a2 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -97,9 +97,12 @@ struct ntfs_mount_options {
unsigned sparse : 1; /* Create sparse files. */
unsigned showmeta : 1; /* Show meta files. */
unsigned nohidden : 1; /* Do not show hidden files. */
+ unsigned hide_dot_files : 1; /* Set hidden flag on dot files. */
+ unsigned windows_names : 1; /* Disallow names forbidden by Windows. */
unsigned force : 1; /* RW mount dirty volume. */
unsigned noacsrules : 1; /* Exclude acs rules. */
unsigned prealloc : 1; /* Preallocate space when file is growing. */
+ unsigned nocase : 1; /* case insensitive. */
};
/* Special value to unpack and deallocate. */
@@ -124,6 +127,7 @@ struct ntfs_buffers {
enum ALLOCATE_OPT {
ALLOCATE_DEF = 0, // Allocate all clusters.
ALLOCATE_MFT = 1, // Allocate for MFT.
+ ALLOCATE_ZERO = 2, // Zeroout new allocated clusters
};
enum bitmap_mutex_classes {
@@ -195,6 +199,8 @@ struct ntfs_index {
/* Minimum MFT zone. */
#define NTFS_MIN_MFT_ZONE 100
+/* Step to increase the MFT. */
+#define NTFS_MFT_INCREASE_STEP 1024
/* Ntfs file system in-core superblock data. */
struct ntfs_sb_info {
@@ -330,6 +336,7 @@ enum ntfs_inode_mutex_lock_class {
NTFS_INODE_MUTEX_REPARSE,
NTFS_INODE_MUTEX_NORMAL,
NTFS_INODE_MUTEX_PARENT,
+ NTFS_INODE_MUTEX_PARENT2,
};
/*
@@ -412,7 +419,7 @@ enum REPARSE_SIGN {
int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
CLST vcn, CLST lcn, CLST len, CLST *pre_alloc,
enum ALLOCATE_OPT opt, CLST *alen, const size_t fr,
- CLST *new_lcn);
+ CLST *new_lcn, CLST *new_len);
int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr,
struct ATTR_LIST_ENTRY *le, struct mft_inode *mi,
u64 new_size, struct runs_tree *run,
@@ -422,7 +429,7 @@ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type,
u64 new_size, const u64 *new_valid, bool keep_prealloc,
struct ATTRIB **ret);
int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
- CLST *len, bool *new);
+ CLST *len, bool *new, bool zero);
int attr_data_read_resident(struct ntfs_inode *ni, struct page *page);
int attr_data_write_resident(struct ntfs_inode *ni, struct page *page);
int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type,
@@ -469,9 +476,9 @@ static inline size_t al_aligned(size_t size)
}
/* Globals from bitfunc.c */
-bool are_bits_clear(const ulong *map, size_t bit, size_t nbits);
-bool are_bits_set(const ulong *map, size_t bit, size_t nbits);
-size_t get_set_bits_ex(const ulong *map, size_t bit, size_t nbits);
+bool are_bits_clear(const void *map, size_t bit, size_t nbits);
+bool are_bits_set(const void *map, size_t bit, size_t nbits);
+size_t get_set_bits_ex(const void *map, size_t bit, size_t nbits);
/* Globals from dir.c */
int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len,
@@ -487,8 +494,6 @@ extern const struct file_operations ntfs_dir_operations;
/* Globals from file.c */
int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
struct kstat *stat, u32 request_mask, u32 flags);
-void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn,
- CLST len);
int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
struct iattr *attr);
int ntfs_file_open(struct inode *inode, struct file *file);
@@ -582,11 +587,10 @@ int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes,
bool simple);
int ntfs_extend_init(struct ntfs_sb_info *sbi);
int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi);
-const struct ATTR_DEF_ENTRY *ntfs_query_def(struct ntfs_sb_info *sbi,
- enum ATTR_TYPE Type);
int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
CLST *new_lcn, CLST *new_len,
enum ALLOCATE_OPT opt);
+bool ntfs_check_for_free_space(struct ntfs_sb_info *sbi, CLST clen, CLST mlen);
int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft,
struct ntfs_inode *ni, struct mft_inode **mi);
void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft);
@@ -643,6 +647,7 @@ int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag,
const struct MFT_REF *ref);
void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim);
int run_deallocate(struct ntfs_sb_info *sbi, struct runs_tree *run, bool trim);
+bool valid_windows_name(struct ntfs_sb_info *sbi, const struct le_str *name);
/* Globals from index.c */
int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit);
@@ -720,6 +725,7 @@ struct dentry *ntfs3_get_parent(struct dentry *child);
extern const struct inode_operations ntfs_dir_inode_operations;
extern const struct inode_operations ntfs_special_inode_operations;
+extern const struct dentry_operations ntfs_dentry_ops;
/* Globals from record.c */
int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi);
@@ -793,12 +799,12 @@ int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf,
u32 run_buf_size, CLST *packed_vcns);
int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
- u32 run_buf_size);
+ int run_buf_size);
#ifdef NTFS3_CHECK_FREE_CLST
int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
- u32 run_buf_size);
+ int run_buf_size);
#else
#define run_unpack_ex run_unpack
#endif
@@ -822,6 +828,8 @@ static inline size_t wnd_zeroes(const struct wnd_bitmap *wnd)
int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits);
int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits);
int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits);
+int wnd_set_used_safe(struct wnd_bitmap *wnd, size_t bit, size_t bits,
+ size_t *done);
bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits);
bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits);
@@ -834,11 +842,17 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits);
void wnd_zone_set(struct wnd_bitmap *wnd, size_t Lcn, size_t Len);
int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range);
+void ntfs_bitmap_set_le(void *map, unsigned int start, int len);
+void ntfs_bitmap_clear_le(void *map, unsigned int start, int len);
+unsigned int ntfs_bitmap_weight_le(const void *bitmap, int bits);
+
/* Globals from upcase.c */
int ntfs_cmp_names(const __le16 *s1, size_t l1, const __le16 *s2, size_t l2,
const u16 *upcase, bool bothcase);
int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2,
const u16 *upcase, bool bothcase);
+unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase,
+ unsigned long hash);
/* globals from xattr.c */
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
@@ -1113,6 +1127,11 @@ static inline void ni_lock_dir(struct ntfs_inode *ni)
mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_PARENT);
}
+static inline void ni_lock_dir2(struct ntfs_inode *ni)
+{
+ mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_PARENT2);
+}
+
static inline void ni_unlock(struct ntfs_inode *ni)
{
mutex_unlock(&ni->ni_lock);
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index 7d2fac5ee215..defce6a5c8e1 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -220,6 +220,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
return NULL;
}
+ if (off + asize < off) {
+ /* overflow check */
+ return NULL;
+ }
+
attr = Add2Ptr(attr, asize);
off += asize;
}
@@ -260,6 +265,10 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
if (t16 + t32 > asize)
return NULL;
+ t32 = sizeof(short) * attr->name_len;
+ if (t32 && le16_to_cpu(attr->name_off) + t32 > t16)
+ return NULL;
+
return attr;
}
@@ -537,6 +546,10 @@ bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes)
return true;
}
+/*
+ * Pack runs in MFT record.
+ * If failed record is not changed.
+ */
int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr,
struct runs_tree *run, CLST len)
{
diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c
index aaaa0d3d35a2..a5af71cd8d14 100644
--- a/fs/ntfs3/run.c
+++ b/fs/ntfs3/run.c
@@ -919,12 +919,15 @@ out:
*/
int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
- u32 run_buf_size)
+ int run_buf_size)
{
u64 prev_lcn, vcn64, lcn, next_vcn;
const u8 *run_last, *run_0;
bool is_mft = ino == MFT_REC_MFT;
+ if (run_buf_size < 0)
+ return -EINVAL;
+
/* Check for empty. */
if (evcn + 1 == svcn)
return 0;
@@ -1046,7 +1049,7 @@ int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
*/
int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
- u32 run_buf_size)
+ int run_buf_size)
{
int ret, err;
CLST next_vcn, lcn, len;
@@ -1093,25 +1096,8 @@ int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
if (down_write_trylock(&wnd->rw_lock)) {
/* Mark all zero bits as used in range [lcn, lcn+len). */
- CLST i, lcn_f = 0, len_f = 0;
-
- err = 0;
- for (i = 0; i < len; i++) {
- if (wnd_is_free(wnd, lcn + i, 1)) {
- if (!len_f)
- lcn_f = lcn + i;
- len_f += 1;
- } else if (len_f) {
- err = wnd_set_used(wnd, lcn_f, len_f);
- len_f = 0;
- if (err)
- break;
- }
- }
-
- if (len_f)
- err = wnd_set_used(wnd, lcn_f, len_f);
-
+ size_t done;
+ err = wnd_set_used_safe(wnd, lcn, len, &done);
up_write(&wnd->rw_lock);
if (err)
return err;
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 47012c9bf505..ef4ea3f21905 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -21,6 +21,30 @@
* https://docs.microsoft.com/en-us/windows/wsl/file-permissions
* It stores uid/gid/mode/dev in xattr
*
+ * ntfs allows up to 2^64 clusters per volume.
+ * It means you should use 64 bits lcn to operate with ntfs.
+ * Implementation of ntfs.sys uses only 32 bits lcn.
+ * Default ntfs3 uses 32 bits lcn too.
+ * ntfs3 built with CONFIG_NTFS3_64BIT_CLUSTER (ntfs3_64) uses 64 bits per lcn.
+ *
+ *
+ * ntfs limits, cluster size is 4K (2^12)
+ * -----------------------------------------------------------------------------
+ * | Volume size | Clusters | ntfs.sys | ntfs3 | ntfs3_64 | mkntfs | chkdsk |
+ * -----------------------------------------------------------------------------
+ * | < 16T, 2^44 | < 2^32 | yes | yes | yes | yes | yes |
+ * | > 16T, 2^44 | > 2^32 | no | no | yes | yes | yes |
+ * ----------------------------------------------------------|------------------
+ *
+ * To mount large volumes as ntfs one should use large cluster size (up to 2M)
+ * The maximum volume size in this case is 2^32 * 2^21 = 2^53 = 8P
+ *
+ * ntfs limits, cluster size is 2M (2^31)
+ * -----------------------------------------------------------------------------
+ * | < 8P, 2^54 | < 2^32 | yes | yes | yes | yes | yes |
+ * | > 8P, 2^54 | > 2^32 | no | no | yes | yes | yes |
+ * ----------------------------------------------------------|------------------
+ *
*/
#include <linux/blkdev.h>
@@ -223,11 +247,14 @@ enum Opt {
Opt_force,
Opt_sparse,
Opt_nohidden,
+ Opt_hide_dot_files,
+ Opt_windows_names,
Opt_showmeta,
Opt_acl,
Opt_iocharset,
Opt_prealloc,
Opt_noacsrules,
+ Opt_nocase,
Opt_err,
};
@@ -242,10 +269,13 @@ static const struct fs_parameter_spec ntfs_fs_parameters[] = {
fsparam_flag_no("force", Opt_force),
fsparam_flag_no("sparse", Opt_sparse),
fsparam_flag_no("hidden", Opt_nohidden),
+ fsparam_flag_no("hide_dot_files", Opt_hide_dot_files),
+ fsparam_flag_no("windows_names", Opt_windows_names),
fsparam_flag_no("acl", Opt_acl),
fsparam_flag_no("showmeta", Opt_showmeta),
fsparam_flag_no("prealloc", Opt_prealloc),
fsparam_flag_no("acsrules", Opt_noacsrules),
+ fsparam_flag_no("nocase", Opt_nocase),
fsparam_string("iocharset", Opt_iocharset),
{}
};
@@ -330,6 +360,12 @@ static int ntfs_fs_parse_param(struct fs_context *fc,
case Opt_nohidden:
opts->nohidden = result.negated ? 1 : 0;
break;
+ case Opt_hide_dot_files:
+ opts->hide_dot_files = result.negated ? 0 : 1;
+ break;
+ case Opt_windows_names:
+ opts->windows_names = result.negated ? 0 : 1;
+ break;
case Opt_acl:
if (!result.negated)
#ifdef CONFIG_NTFS3_FS_POSIX_ACL
@@ -354,6 +390,9 @@ static int ntfs_fs_parse_param(struct fs_context *fc,
case Opt_noacsrules:
opts->noacsrules = result.negated ? 1 : 0;
break;
+ case Opt_nocase:
+ opts->nocase = result.negated ? 1 : 0;
+ break;
default:
/* Should not be here unless we forget add case. */
return -EINVAL;
@@ -406,27 +445,18 @@ static struct inode *ntfs_alloc_inode(struct super_block *sb)
return NULL;
memset(ni, 0, offsetof(struct ntfs_inode, vfs_inode));
-
mutex_init(&ni->ni_lock);
-
return &ni->vfs_inode;
}
-static void ntfs_i_callback(struct rcu_head *head)
+static void ntfs_free_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
struct ntfs_inode *ni = ntfs_i(inode);
mutex_destroy(&ni->ni_lock);
-
kmem_cache_free(ntfs_inode_cachep, ni);
}
-static void ntfs_destroy_inode(struct inode *inode)
-{
- call_rcu(&inode->i_rcu, ntfs_i_callback);
-}
-
static void init_once(void *foo)
{
struct ntfs_inode *ni = foo;
@@ -519,9 +549,9 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root)
seq_printf(m, ",gid=%u",
from_kgid_munged(user_ns, opts->fs_gid));
if (opts->fmask)
- seq_printf(m, ",fmask=%04o", ~opts->fs_fmask_inv);
+ seq_printf(m, ",fmask=%04o", opts->fs_fmask_inv ^ 0xffff);
if (opts->dmask)
- seq_printf(m, ",dmask=%04o", ~opts->fs_dmask_inv);
+ seq_printf(m, ",dmask=%04o", opts->fs_dmask_inv ^ 0xffff);
if (opts->nls)
seq_printf(m, ",iocharset=%s", opts->nls->charset);
else
@@ -536,6 +566,10 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",showmeta");
if (opts->nohidden)
seq_puts(m, ",nohidden");
+ if (opts->windows_names)
+ seq_puts(m, ",windows_names");
+ if (opts->hide_dot_files)
+ seq_puts(m, ",hide_dot_files");
if (opts->force)
seq_puts(m, ",force");
if (opts->noacsrules)
@@ -592,7 +626,7 @@ static int ntfs_sync_fs(struct super_block *sb, int wait)
static const struct super_operations ntfs_sops = {
.alloc_inode = ntfs_alloc_inode,
- .destroy_inode = ntfs_destroy_inode,
+ .free_inode = ntfs_free_inode,
.evict_inode = ntfs_evict_inode,
.put_super = ntfs_put_super,
.statfs = ntfs_statfs,
@@ -672,7 +706,7 @@ static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot)
if (boot->sectors_per_clusters <= 0x80)
return boot->sectors_per_clusters;
if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */
- return 1U << (0 - boot->sectors_per_clusters);
+ return 1U << -(s8)boot->sectors_per_clusters;
return -EINVAL;
}
@@ -789,7 +823,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
: (u32)boot->record_size
<< sbi->cluster_bits;
- if (record_size > MAXIMUM_BYTES_PER_MFT)
+ if (record_size > MAXIMUM_BYTES_PER_MFT || record_size < SECTOR_SIZE)
goto out;
sbi->record_bits = blksize_bits(record_size);
@@ -896,7 +930,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
struct block_device *bdev = sb->s_bdev;
struct inode *inode;
struct ntfs_inode *ni;
- size_t i, tt;
+ size_t i, tt, bad_len, bad_frags;
CLST vcn, lcn, len;
struct ATTRIB *attr;
const struct VOLUME_INFO *info;
@@ -916,6 +950,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_export_op = &ntfs_export_ops;
sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec
sb->s_xattr = ntfs_xattr_handlers;
+ sb->s_d_op = sbi->options->nocase ? &ntfs_dentry_ops : NULL;
sbi->options->nls = ntfs_load_nls(sbi->options->nls_name);
if (IS_ERR(sbi->options->nls)) {
@@ -1065,30 +1100,6 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
sbi->mft.ni = ni;
- /* Load $BadClus. */
- ref.low = cpu_to_le32(MFT_REC_BADCLUST);
- ref.seq = cpu_to_le16(MFT_REC_BADCLUST);
- inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS);
- if (IS_ERR(inode)) {
- ntfs_err(sb, "Failed to load $BadClus.");
- err = PTR_ERR(inode);
- goto out;
- }
-
- ni = ntfs_i(inode);
-
- for (i = 0; run_get_entry(&ni->file.run, i, &vcn, &lcn, &len); i++) {
- if (lcn == SPARSE_LCN)
- continue;
-
- if (!sbi->bad_clusters)
- ntfs_notice(sb, "Volume contains bad blocks");
-
- sbi->bad_clusters += len;
- }
-
- iput(inode);
-
/* Load $Bitmap. */
ref.low = cpu_to_le32(MFT_REC_BITMAP);
ref.seq = cpu_to_le16(MFT_REC_BITMAP);
@@ -1126,6 +1137,44 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
goto out;
+ /* Load $BadClus. */
+ ref.low = cpu_to_le32(MFT_REC_BADCLUST);
+ ref.seq = cpu_to_le16(MFT_REC_BADCLUST);
+ inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ ntfs_err(sb, "Failed to load $BadClus (%d).", err);
+ goto out;
+ }
+
+ ni = ntfs_i(inode);
+ bad_len = bad_frags = 0;
+ for (i = 0; run_get_entry(&ni->file.run, i, &vcn, &lcn, &len); i++) {
+ if (lcn == SPARSE_LCN)
+ continue;
+
+ bad_len += len;
+ bad_frags += 1;
+ if (sb_rdonly(sb))
+ continue;
+
+ if (wnd_set_used_safe(&sbi->used.bitmap, lcn, len, &tt) || tt) {
+ /* Bad blocks marked as free in bitmap. */
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ }
+ }
+ if (bad_len) {
+ /*
+ * Notice about bad blocks.
+ * In normal cases these blocks are marked as used in bitmap.
+ * And we never allocate space in it.
+ */
+ ntfs_notice(sb,
+ "Volume contains %zu bad blocks in %zu fragments.",
+ bad_len, bad_frags);
+ }
+ iput(inode);
+
/* Load $AttrDef. */
ref.low = cpu_to_le32(MFT_REC_ATTR);
ref.seq = cpu_to_le16(MFT_REC_ATTR);
@@ -1141,7 +1190,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
goto put_inode_out;
}
bytes = inode->i_size;
- sbi->def_table = t = kmalloc(bytes, GFP_NOFS);
+ sbi->def_table = t = kmalloc(bytes, GFP_NOFS | __GFP_NOWARN);
if (!t) {
err = -ENOMEM;
goto put_inode_out;
@@ -1260,9 +1309,9 @@ load_root:
ref.low = cpu_to_le32(MFT_REC_ROOT);
ref.seq = cpu_to_le16(MFT_REC_ROOT);
inode = ntfs_iget5(sb, &ref, &NAME_ROOT);
- if (IS_ERR(inode)) {
+ if (IS_ERR(inode) || !inode->i_op) {
ntfs_err(sb, "Failed to load root.");
- err = PTR_ERR(inode);
+ err = IS_ERR(inode) ? PTR_ERR(inode) : -EINVAL;
goto out;
}
@@ -1281,6 +1330,7 @@ out:
* Free resources here.
* ntfs_fs_free will be called with fc->s_fs_info = NULL
*/
+ put_mount_options(sbi->options);
put_ntfs(sbi);
sb->s_fs_info = NULL;
@@ -1488,11 +1538,8 @@ out1:
static void __exit exit_ntfs_fs(void)
{
- if (ntfs_inode_cachep) {
- rcu_barrier();
- kmem_cache_destroy(ntfs_inode_cachep);
- }
-
+ rcu_barrier();
+ kmem_cache_destroy(ntfs_inode_cachep);
unregister_filesystem(&ntfs_fs_type);
ntfs3_exit_bitmap();
}
diff --git a/fs/ntfs3/upcase.c b/fs/ntfs3/upcase.c
index b5e8256fd710..7681eefacb4b 100644
--- a/fs/ntfs3/upcase.c
+++ b/fs/ntfs3/upcase.c
@@ -102,3 +102,15 @@ case_insentive:
diff2 = l1 - l2;
return diff2 ? diff2 : diff1;
}
+
+/* Helper function for ntfs_d_hash. */
+unsigned long ntfs_names_hash(const u16 *name, size_t len, const u16 *upcase,
+ unsigned long hash)
+{
+ while (len--) {
+ unsigned int c = upcase_unicode_char(upcase, *name++);
+ hash = partial_name_hash(c, hash);
+ }
+
+ return hash;
+}
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index aafe98ee0b21..616df209feea 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -15,9 +15,10 @@
#include "ntfs_fs.h"
// clang-format off
-#define SYSTEM_DOS_ATTRIB "system.dos_attrib"
-#define SYSTEM_NTFS_ATTRIB "system.ntfs_attrib"
-#define SYSTEM_NTFS_SECURITY "system.ntfs_security"
+#define SYSTEM_DOS_ATTRIB "system.dos_attrib"
+#define SYSTEM_NTFS_ATTRIB "system.ntfs_attrib"
+#define SYSTEM_NTFS_ATTRIB_BE "system.ntfs_attrib_be"
+#define SYSTEM_NTFS_SECURITY "system.ntfs_security"
// clang-format on
static inline size_t unpacked_ea_size(const struct EA_FULL *ea)
@@ -42,28 +43,26 @@ static inline size_t packed_ea_size(const struct EA_FULL *ea)
* Assume there is at least one xattr in the list.
*/
static inline bool find_ea(const struct EA_FULL *ea_all, u32 bytes,
- const char *name, u8 name_len, u32 *off)
+ const char *name, u8 name_len, u32 *off, u32 *ea_sz)
{
- *off = 0;
+ u32 ea_size;
- if (!ea_all || !bytes)
+ *off = 0;
+ if (!ea_all)
return false;
- for (;;) {
+ for (; *off < bytes; *off += ea_size) {
const struct EA_FULL *ea = Add2Ptr(ea_all, *off);
- u32 next_off = *off + unpacked_ea_size(ea);
-
- if (next_off > bytes)
- return false;
-
+ ea_size = unpacked_ea_size(ea);
if (ea->name_len == name_len &&
- !memcmp(ea->name, name, name_len))
+ !memcmp(ea->name, name, name_len)) {
+ if (ea_sz)
+ *ea_sz = ea_size;
return true;
-
- *off = next_off;
- if (next_off >= bytes)
- return false;
+ }
}
+
+ return false;
}
/*
@@ -74,12 +73,12 @@ static inline bool find_ea(const struct EA_FULL *ea_all, u32 bytes,
static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
size_t add_bytes, const struct EA_INFO **info)
{
- int err;
+ int err = -EINVAL;
struct ntfs_sb_info *sbi = ni->mi.sbi;
struct ATTR_LIST_ENTRY *le = NULL;
struct ATTRIB *attr_info, *attr_ea;
void *ea_p;
- u32 size;
+ u32 size, off, ea_size;
static_assert(le32_to_cpu(ATTR_EA_INFO) < le32_to_cpu(ATTR_EA));
@@ -96,24 +95,31 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
*info = resident_data_ex(attr_info, sizeof(struct EA_INFO));
if (!*info)
- return -EINVAL;
+ goto out;
/* Check Ea limit. */
size = le32_to_cpu((*info)->size);
- if (size > sbi->ea_max_size)
- return -EFBIG;
+ if (size > sbi->ea_max_size) {
+ err = -EFBIG;
+ goto out;
+ }
- if (attr_size(attr_ea) > sbi->ea_max_size)
- return -EFBIG;
+ if (attr_size(attr_ea) > sbi->ea_max_size) {
+ err = -EFBIG;
+ goto out;
+ }
+
+ if (!size) {
+ /* EA info persists, but xattr is empty. Looks like EA problem. */
+ goto out;
+ }
/* Allocate memory for packed Ea. */
- ea_p = kmalloc(size + add_bytes, GFP_NOFS);
+ ea_p = kmalloc(size_add(size, add_bytes), GFP_NOFS);
if (!ea_p)
return -ENOMEM;
- if (!size) {
- /* EA info persists, but xattr is empty. Looks like EA problem. */
- } else if (attr_ea->non_res) {
+ if (attr_ea->non_res) {
struct runs_tree run;
run_init(&run);
@@ -124,24 +130,52 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
run_close(&run);
if (err)
- goto out;
+ goto out1;
} else {
void *p = resident_data_ex(attr_ea, size);
- if (!p) {
- err = -EINVAL;
- goto out;
- }
+ if (!p)
+ goto out1;
memcpy(ea_p, p, size);
}
memset(Add2Ptr(ea_p, size), 0, add_bytes);
+
+ /* Check all attributes for consistency. */
+ for (off = 0; off < size; off += ea_size) {
+ const struct EA_FULL *ef = Add2Ptr(ea_p, off);
+ u32 bytes = size - off;
+
+ /* Check if we can use field ea->size. */
+ if (bytes < sizeof(ef->size))
+ goto out1;
+
+ if (ef->size) {
+ ea_size = le32_to_cpu(ef->size);
+ if (ea_size > bytes)
+ goto out1;
+ continue;
+ }
+
+ /* Check if we can use fields ef->name_len and ef->elength. */
+ if (bytes < offsetof(struct EA_FULL, name))
+ goto out1;
+
+ ea_size = ALIGN(struct_size(ef, name,
+ 1 + ef->name_len +
+ le16_to_cpu(ef->elength)),
+ 4);
+ if (ea_size > bytes)
+ goto out1;
+ }
+
*ea = ea_p;
return 0;
-out:
+out1:
kfree(ea_p);
- *ea = NULL;
+out:
+ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
return err;
}
@@ -163,6 +197,7 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer,
const struct EA_FULL *ea;
u32 off, size;
int err;
+ int ea_size;
size_t ret;
err = ntfs_read_ea(ni, &ea_all, 0, &info);
@@ -175,8 +210,9 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer,
size = le32_to_cpu(info->size);
/* Enumerate all xattrs. */
- for (ret = 0, off = 0; off < size; off += unpacked_ea_size(ea)) {
+ for (ret = 0, off = 0; off < size; off += ea_size) {
ea = Add2Ptr(ea_all, off);
+ ea_size = unpacked_ea_size(ea);
if (buffer) {
if (ret + ea->name_len + 1 > bytes_per_buffer) {
@@ -227,7 +263,8 @@ static int ntfs_get_ea(struct inode *inode, const char *name, size_t name_len,
goto out;
/* Enumerate all xattrs. */
- if (!find_ea(ea_all, le32_to_cpu(info->size), name, name_len, &off)) {
+ if (!find_ea(ea_all, le32_to_cpu(info->size), name, name_len, &off,
+ NULL)) {
err = -ENODATA;
goto out;
}
@@ -269,7 +306,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
struct EA_FULL *new_ea;
struct EA_FULL *ea_all = NULL;
size_t add, new_pack;
- u32 off, size;
+ u32 off, size, ea_sz;
__le16 size_pack;
struct ATTRIB *attr;
struct ATTR_LIST_ENTRY *le;
@@ -304,9 +341,8 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
size_pack = ea_info.size_pack;
}
- if (info && find_ea(ea_all, size, name, name_len, &off)) {
+ if (info && find_ea(ea_all, size, name, name_len, &off, &ea_sz)) {
struct EA_FULL *ea;
- size_t ea_sz;
if (flags & XATTR_CREATE) {
err = -EEXIST;
@@ -329,8 +365,6 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name,
if (ea->flags & FILE_NEED_EA)
le16_add_cpu(&ea_info.count, -1);
- ea_sz = unpacked_ea_size(ea);
-
le16_add_cpu(&ea_info.size_pack, 0 - packed_ea_size(ea));
memmove(ea, Add2Ptr(ea, ea_sz), size - off - ea_sz);
@@ -604,10 +638,9 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
err = 0; /* Removing non existed xattr. */
if (!err) {
set_cached_acl(inode, type, acl);
- if (inode->i_mode != mode) {
- inode->i_mode = mode;
- mark_inode_dirty(inode);
- }
+ inode->i_mode = mode;
+ inode->i_ctime = current_time(inode);
+ mark_inode_dirty(inode);
}
out:
@@ -721,11 +754,9 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
{
int err;
struct ntfs_inode *ni = ntfs_i(inode);
- size_t name_len = strlen(name);
/* Dispatch request. */
- if (name_len == sizeof(SYSTEM_DOS_ATTRIB) - 1 &&
- !memcmp(name, SYSTEM_DOS_ATTRIB, sizeof(SYSTEM_DOS_ATTRIB))) {
+ if (!strcmp(name, SYSTEM_DOS_ATTRIB)) {
/* system.dos_attrib */
if (!buffer) {
err = sizeof(u8);
@@ -738,8 +769,8 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
goto out;
}
- if (name_len == sizeof(SYSTEM_NTFS_ATTRIB) - 1 &&
- !memcmp(name, SYSTEM_NTFS_ATTRIB, sizeof(SYSTEM_NTFS_ATTRIB))) {
+ if (!strcmp(name, SYSTEM_NTFS_ATTRIB) ||
+ !strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) {
/* system.ntfs_attrib */
if (!buffer) {
err = sizeof(u32);
@@ -748,12 +779,13 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
} else {
err = sizeof(u32);
*(u32 *)buffer = le32_to_cpu(ni->std_fa);
+ if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE))
+ *(u32 *)buffer = cpu_to_be32(*(u32 *)buffer);
}
goto out;
}
- if (name_len == sizeof(SYSTEM_NTFS_SECURITY) - 1 &&
- !memcmp(name, SYSTEM_NTFS_SECURITY, sizeof(SYSTEM_NTFS_SECURITY))) {
+ if (!strcmp(name, SYSTEM_NTFS_SECURITY)) {
/* system.ntfs_security*/
struct SECURITY_DESCRIPTOR_RELATIVE *sd = NULL;
size_t sd_size = 0;
@@ -793,7 +825,7 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
}
/* Deal with NTFS extended attribute. */
- err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL);
+ err = ntfs_get_ea(inode, name, strlen(name), buffer, size, NULL);
out:
return err;
@@ -810,23 +842,24 @@ static noinline int ntfs_setxattr(const struct xattr_handler *handler,
{
int err = -EINVAL;
struct ntfs_inode *ni = ntfs_i(inode);
- size_t name_len = strlen(name);
enum FILE_ATTRIBUTE new_fa;
/* Dispatch request. */
- if (name_len == sizeof(SYSTEM_DOS_ATTRIB) - 1 &&
- !memcmp(name, SYSTEM_DOS_ATTRIB, sizeof(SYSTEM_DOS_ATTRIB))) {
+ if (!strcmp(name, SYSTEM_DOS_ATTRIB)) {
if (sizeof(u8) != size)
goto out;
new_fa = cpu_to_le32(*(u8 *)value);
goto set_new_fa;
}
- if (name_len == sizeof(SYSTEM_NTFS_ATTRIB) - 1 &&
- !memcmp(name, SYSTEM_NTFS_ATTRIB, sizeof(SYSTEM_NTFS_ATTRIB))) {
+ if (!strcmp(name, SYSTEM_NTFS_ATTRIB) ||
+ !strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) {
if (size != sizeof(u32))
goto out;
- new_fa = cpu_to_le32(*(u32 *)value);
+ if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE))
+ new_fa = cpu_to_le32(be32_to_cpu(*(u32 *)value));
+ else
+ new_fa = cpu_to_le32(*(u32 *)value);
if (S_ISREG(inode->i_mode)) {
/* Process compressed/sparsed in special way. */
@@ -861,8 +894,7 @@ set_new_fa:
goto out;
}
- if (name_len == sizeof(SYSTEM_NTFS_SECURITY) - 1 &&
- !memcmp(name, SYSTEM_NTFS_SECURITY, sizeof(SYSTEM_NTFS_SECURITY))) {
+ if (!strcmp(name, SYSTEM_NTFS_SECURITY)) {
/* system.ntfs_security*/
__le32 security_id;
bool inserted;
@@ -905,7 +937,7 @@ set_new_fa:
}
/* Deal with NTFS extended attribute. */
- err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0);
+ err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, 0);
out:
inode->i_ctime = current_time(inode);
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 37d222bdfc8c..a07b24d170f2 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1602,6 +1602,7 @@ static void o2net_start_connect(struct work_struct *work)
sc->sc_sock = sock; /* freed by sc_kref_release */
sock->sk->sk_allocation = GFP_ATOMIC;
+ sock->sk->sk_use_task_frag = false;
myaddr.sin_family = AF_INET;
myaddr.sin_addr.s_addr = mynode->nd_ipv4_address;
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 732661aa2680..167fa43b24f9 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -273,7 +273,6 @@ out:
gossip_debug(GOSSIP_FILE_DEBUG,
"%s(%pU): PUT buffer_index %d\n",
__func__, handle, buffer_index);
- buffer_index = -1;
}
op_release(new_op);
return ret;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 370bd3bbf5e4..4df560894386 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -530,7 +530,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
size_t count = iov_iter_count(iter);
ssize_t total_count = 0;
ssize_t ret = -EINVAL;
- int i = 0;
gossip_debug(GOSSIP_FILE_DEBUG,
"%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
@@ -556,7 +555,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
while (iov_iter_count(iter)) {
size_t each_count = iov_iter_count(iter);
size_t amt_complete;
- i++;
/* how much to transfer in this loop iteration */
if (each_count > orangefs_bufmap_size_query())
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 29eaa4544372..1b508f543384 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -194,15 +194,10 @@ void orangefs_debugfs_init(int debug_mask)
*/
static void orangefs_kernel_debug_init(void)
{
- int rc = -ENOMEM;
- char *k_buffer = NULL;
+ static char k_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { };
gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
- k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
- if (!k_buffer)
- goto out;
-
if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
strcpy(k_buffer, kernel_debug_string);
strcat(k_buffer, "\n");
@@ -213,15 +208,14 @@ static void orangefs_kernel_debug_init(void)
debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir, k_buffer,
&kernel_debug_fops);
-
-out:
- gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
}
void orangefs_debugfs_cleanup(void)
{
debugfs_remove_recursive(debug_dir);
+ kfree(debug_help_string);
+ debug_help_string = NULL;
}
/* open ORANGEFS_KMOD_DEBUG_HELP_FILE */
@@ -297,18 +291,13 @@ static int help_show(struct seq_file *m, void *v)
/*
* initialize the client-debug file.
*/
-static int orangefs_client_debug_init(void)
+static void orangefs_client_debug_init(void)
{
- int rc = -ENOMEM;
- char *c_buffer = NULL;
+ static char c_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { };
gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
- c_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
- if (!c_buffer)
- goto out;
-
if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
strcpy(c_buffer, client_debug_string);
strcat(c_buffer, "\n");
@@ -322,13 +311,6 @@ static int orangefs_client_debug_init(void)
debug_dir,
c_buffer,
&kernel_debug_fops);
-
- rc = 0;
-
-out:
-
- gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
- return rc;
}
/* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/
@@ -671,6 +653,7 @@ int orangefs_prepare_debugfs_help_string(int at_boot)
memset(debug_help_string, 0, DEBUG_HELP_STRING_SIZE);
strlcat(debug_help_string, new, string_size);
mutex_unlock(&orangefs_help_file_lock);
+ kfree(new);
}
rc = 0;
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index cd7297815f91..5ab741c60b7e 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -141,7 +141,7 @@ static int __init orangefs_init(void)
gossip_err("%s: could not initialize device subsystem %d!\n",
__func__,
ret);
- goto cleanup_device;
+ goto cleanup_sysfs;
}
ret = register_filesystem(&orangefs_fs_type);
@@ -152,11 +152,11 @@ static int __init orangefs_init(void)
goto out;
}
- orangefs_sysfs_exit();
-
-cleanup_device:
orangefs_dev_cleanup();
+cleanup_sysfs:
+ orangefs_sysfs_exit();
+
sysfs_init_failed:
orangefs_debugfs_cleanup();
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index de80b62553bb..be4ba03a01a0 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -896,9 +896,18 @@ static struct attribute *orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(orangefs_default);
+static struct kobject *orangefs_obj;
+
+static void orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(orangefs_obj);
+ orangefs_obj = NULL;
+}
+
static struct kobj_type orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = orangefs_default_groups,
+ .release = orangefs_obj_release,
};
static struct orangefs_attribute acache_hard_limit_attribute =
@@ -934,9 +943,18 @@ static struct attribute *acache_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(acache_orangefs_default);
+static struct kobject *acache_orangefs_obj;
+
+static void acache_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(acache_orangefs_obj);
+ acache_orangefs_obj = NULL;
+}
+
static struct kobj_type acache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = acache_orangefs_default_groups,
+ .release = acache_orangefs_obj_release,
};
static struct orangefs_attribute capcache_hard_limit_attribute =
@@ -972,9 +990,18 @@ static struct attribute *capcache_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(capcache_orangefs_default);
+static struct kobject *capcache_orangefs_obj;
+
+static void capcache_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(capcache_orangefs_obj);
+ capcache_orangefs_obj = NULL;
+}
+
static struct kobj_type capcache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = capcache_orangefs_default_groups,
+ .release = capcache_orangefs_obj_release,
};
static struct orangefs_attribute ccache_hard_limit_attribute =
@@ -1010,9 +1037,18 @@ static struct attribute *ccache_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(ccache_orangefs_default);
+static struct kobject *ccache_orangefs_obj;
+
+static void ccache_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(ccache_orangefs_obj);
+ ccache_orangefs_obj = NULL;
+}
+
static struct kobj_type ccache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = ccache_orangefs_default_groups,
+ .release = ccache_orangefs_obj_release,
};
static struct orangefs_attribute ncache_hard_limit_attribute =
@@ -1048,9 +1084,18 @@ static struct attribute *ncache_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(ncache_orangefs_default);
+static struct kobject *ncache_orangefs_obj;
+
+static void ncache_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(ncache_orangefs_obj);
+ ncache_orangefs_obj = NULL;
+}
+
static struct kobj_type ncache_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = ncache_orangefs_default_groups,
+ .release = ncache_orangefs_obj_release,
};
static struct orangefs_attribute pc_acache_attribute =
@@ -1079,9 +1124,18 @@ static struct attribute *pc_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(pc_orangefs_default);
+static struct kobject *pc_orangefs_obj;
+
+static void pc_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(pc_orangefs_obj);
+ pc_orangefs_obj = NULL;
+}
+
static struct kobj_type pc_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = pc_orangefs_default_groups,
+ .release = pc_orangefs_obj_release,
};
static struct orangefs_attribute stats_reads_attribute =
@@ -1103,19 +1157,20 @@ static struct attribute *stats_orangefs_default_attrs[] = {
};
ATTRIBUTE_GROUPS(stats_orangefs_default);
+static struct kobject *stats_orangefs_obj;
+
+static void stats_orangefs_obj_release(struct kobject *kobj)
+{
+ kfree(stats_orangefs_obj);
+ stats_orangefs_obj = NULL;
+}
+
static struct kobj_type stats_orangefs_ktype = {
.sysfs_ops = &orangefs_sysfs_ops,
.default_groups = stats_orangefs_default_groups,
+ .release = stats_orangefs_obj_release,
};
-static struct kobject *orangefs_obj;
-static struct kobject *acache_orangefs_obj;
-static struct kobject *capcache_orangefs_obj;
-static struct kobject *ccache_orangefs_obj;
-static struct kobject *ncache_orangefs_obj;
-static struct kobject *pc_orangefs_obj;
-static struct kobject *stats_orangefs_obj;
-
int orangefs_sysfs_init(void)
{
int rc = -EINVAL;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 6e4e65ee050d..c14e90764e35 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -792,7 +792,7 @@ static int ovl_copy_up_tmpfile(struct ovl_copy_up_ctx *c)
if (!c->metacopy && c->stat.size) {
err = ovl_copy_up_file(ofs, c->dentry, tmpfile, c->stat.size);
if (err)
- return err;
+ goto out_fput;
}
err = ovl_copy_up_metadata(c, temp);
@@ -1011,6 +1011,10 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
if (err)
return err;
+ if (!kuid_has_mapping(current_user_ns(), ctx.stat.uid) ||
+ !kgid_has_mapping(current_user_ns(), ctx.stat.gid))
+ return -EOVERFLOW;
+
ctx.metacopy = ovl_need_meta_copy_up(dentry, ctx.stat.mode, flags);
if (parent) {
diff --git a/fs/pnode.c b/fs/pnode.c
index 1106137c747a..468e4e65a615 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -244,7 +244,7 @@ static int propagate_one(struct mount *m)
}
do {
struct mount *parent = last_source->mnt_parent;
- if (last_source == first_source)
+ if (peers(last_source, first_source))
break;
done = parent->mnt_master == p;
if (done && peers(n, parent))
diff --git a/fs/proc/page.c b/fs/proc/page.c
index f2273b164535..6249c347809a 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -219,8 +219,9 @@ u64 stable_page_flags(struct page *page)
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
-#ifdef CONFIG_64BIT
+#ifdef CONFIG_ARCH_USES_PG_ARCH_X
u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+ u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3);
#endif
return u;
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 8adabde685f1..c49d554cc9ae 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -126,6 +126,7 @@ config PSTORE_CONSOLE
config PSTORE_PMSG
bool "Log user space messages"
depends on PSTORE
+ select RT_MUTEXES
help
When the option is enabled, pstore will export a character
interface /dev/pmsg0 to log user space messages. On reboot
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index d8542ec2f38c..ab82e5f05346 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -7,9 +7,10 @@
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
+#include <linux/rtmutex.h>
#include "internal.h"
-static DEFINE_MUTEX(pmsg_lock);
+static DEFINE_RT_MUTEX(pmsg_lock);
static ssize_t write_pmsg(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
@@ -28,9 +29,9 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf,
if (!access_ok(buf, count))
return -EFAULT;
- mutex_lock(&pmsg_lock);
+ rt_mutex_lock(&pmsg_lock);
ret = psinfo->write_user(&record, buf);
- mutex_unlock(&pmsg_lock);
+ rt_mutex_unlock(&pmsg_lock);
return ret ? ret : count;
}
@@ -46,7 +47,7 @@ static int pmsg_major;
#undef pr_fmt
#define pr_fmt(fmt) PMSG_NAME ": " fmt
-static char *pmsg_devnode(struct device *dev, umode_t *mode)
+static char *pmsg_devnode(const struct device *dev, umode_t *mode)
{
if (mode)
*mode = 0220;
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 9a5052431fd3..ade66dbe5f39 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -680,7 +680,7 @@ static int ramoops_parse_dt(struct platform_device *pdev,
field = value; \
}
- parse_u32("mem-type", pdata->record_size, pdata->mem_type);
+ parse_u32("mem-type", pdata->mem_type, pdata->mem_type);
parse_u32("record-size", pdata->record_size, 0);
parse_u32("console-size", pdata->console_size, 0);
parse_u32("ftrace-size", pdata->ftrace_size, 0);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 1d7c2a812fc1..34e416327dd4 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -595,7 +595,7 @@ static void udf_do_extend_final_block(struct inode *inode,
*/
if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK))
return;
- added_bytes = (last_ext->extLength & UDF_EXTENT_LENGTH_MASK) - new_elen;
+ added_bytes = new_elen - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
last_ext->extLength += added_bytes;
UDF_I(inode)->i_lenExtents += added_bytes;
@@ -684,7 +684,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
struct kernel_lb_addr eloc, tmpeloc;
int c = 1;
loff_t lbcount = 0, b_off = 0;
- udf_pblk_t newblocknum, newblock;
+ udf_pblk_t newblocknum, newblock = 0;
sector_t offset = 0;
int8_t etype;
struct udf_inode_info *iinfo = UDF_I(inode);
@@ -787,7 +787,6 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len);
if (ret < 0) {
*err = ret;
- newblock = 0;
goto out_free;
}
c = 0;
@@ -852,7 +851,6 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
goal, err);
if (!newblocknum) {
*err = -ENOSPC;
- newblock = 0;
goto out_free;
}
if (isBeyondEOF)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 98ac37e34e3d..cc694846617a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
return ctx->features & UFFD_FEATURE_INITIALIZED;
}
+static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
+
+ vma->vm_flags = flags;
+ /*
+ * For shared mappings, we want to enable writenotify while
+ * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
+ * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
+ */
+ if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
+ vma_set_page_prot(vma);
+}
+
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
@@ -618,7 +633,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
for_each_vma(vmi, vma) {
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- vma->vm_flags &= ~__VM_UFFD_FLAGS;
+ userfaultfd_set_vm_flags(vma,
+ vma->vm_flags & ~__VM_UFFD_FLAGS);
}
}
mmap_write_unlock(mm);
@@ -652,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
octx = vma->vm_userfaultfd_ctx.ctx;
if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- vma->vm_flags &= ~__VM_UFFD_FLAGS;
+ userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
return 0;
}
@@ -733,7 +749,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
} else {
/* Drop uffd context if remap feature not enabled */
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
- vma->vm_flags &= ~__VM_UFFD_FLAGS;
+ userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
}
}
@@ -895,7 +911,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
prev = vma;
}
- vma->vm_flags = new_flags;
+ userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
mmap_write_unlock(mm);
@@ -1463,7 +1479,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
- vma->vm_flags = new_flags;
+ userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx.ctx = ctx;
if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
@@ -1651,7 +1667,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* the next vma was merged into the current one and
* the current one has not been updated yet.
*/
- vma->vm_flags = new_flags;
+ userfaultfd_set_vm_flags(vma, new_flags);
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
skip:
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 49d0d4ea63fc..0d56a8d862e8 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4058,7 +4058,7 @@ xfs_bmap_alloc_userdata(
* the busy list.
*/
bma->datatype = XFS_ALLOC_NOBUSY;
- if (whichfork == XFS_DATA_FORK) {
+ if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) {
bma->datatype |= XFS_ALLOC_USERDATA;
if (bma->offset == 0)
bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
@@ -4551,7 +4551,8 @@ xfs_bmapi_convert_delalloc(
* the extent. Just return the real extent at this offset.
*/
if (!isnullstartblock(bma.got.br_startblock)) {
- xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
+ xfs_iomap_inode_sequence(ip, flags));
*seq = READ_ONCE(ifp->if_seq);
goto out_trans_cancel;
}
@@ -4599,7 +4600,8 @@ xfs_bmapi_convert_delalloc(
XFS_STATS_INC(mp, xs_xstrat_quick);
ASSERT(!isnullstartblock(bma.got.br_startblock));
- xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
+ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
+ xfs_iomap_inode_sequence(ip, flags));
*seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 4c16c8c31fcb..35f574421670 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4666,7 +4666,12 @@ xfs_btree_space_to_height(
const unsigned int *limits,
unsigned long long leaf_blocks)
{
- unsigned long long node_blocks = limits[1];
+ /*
+ * The root btree block can have fewer than minrecs pointers in it
+ * because the tree might not be big enough to require that amount of
+ * fanout. Hence it has a minimum size of 2 pointers, not limits[1].
+ */
+ unsigned long long node_blocks = 2;
unsigned long long blocks_left = leaf_blocks - 1;
unsigned int height = 1;
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index eef27858a013..29c4b4ccb909 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -556,7 +556,6 @@ xfs_btree_islastblock(
struct xfs_buf *bp;
block = xfs_btree_get_block(cur, level, &bp);
- ASSERT(block && xfs_btree_check_block(cur, block, level, bp) == 0);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h
index 5362908164b0..01a9e86b3037 100644
--- a/fs/xfs/libxfs/xfs_errortag.h
+++ b/fs/xfs/libxfs/xfs_errortag.h
@@ -40,13 +40,12 @@
#define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25
#define XFS_ERRTAG_BMAP_FINISH_ONE 26
#define XFS_ERRTAG_AG_RESV_CRITICAL 27
+
/*
- * DEBUG mode instrumentation to test and/or trigger delayed allocation
- * block killing in the event of failed writes. When enabled, all
- * buffered writes are silenty dropped and handled as if they failed.
- * All delalloc blocks in the range of the write (including pre-existing
- * delalloc blocks!) are tossed as part of the write failure error
- * handling sequence.
+ * Drop-writes support removed because write error handling cannot trash
+ * pre-existing delalloc extents in any useful way anymore. We retain the
+ * definition so that we can reject it as an invalid value in
+ * xfs_errortag_valid().
*/
#define XFS_ERRTAG_DROP_WRITES 28
#define XFS_ERRTAG_LOG_BAD_CRC 29
@@ -62,7 +61,9 @@
#define XFS_ERRTAG_LARP 39
#define XFS_ERRTAG_DA_LEAF_SPLIT 40
#define XFS_ERRTAG_ATTR_LEAF_TO_NODE 41
-#define XFS_ERRTAG_MAX 42
+#define XFS_ERRTAG_WB_DELAY_MS 42
+#define XFS_ERRTAG_WRITE_DELAY_MS 43
+#define XFS_ERRTAG_MAX 44
/*
* Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -95,7 +96,6 @@
#define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
#define XFS_RANDOM_BMAP_FINISH_ONE 1
#define XFS_RANDOM_AG_RESV_CRITICAL 4
-#define XFS_RANDOM_DROP_WRITES 1
#define XFS_RANDOM_LOG_BAD_CRC 1
#define XFS_RANDOM_LOG_ITEM_PIN 1
#define XFS_RANDOM_BUF_LRU_REF 2
@@ -109,5 +109,7 @@
#define XFS_RANDOM_LARP 1
#define XFS_RANDOM_DA_LEAF_SPLIT 1
#define XFS_RANDOM_ATTR_LEAF_TO_NODE 1
+#define XFS_RANDOM_WB_DELAY_MS 3000
+#define XFS_RANDOM_WRITE_DELAY_MS 3000
#endif /* __XFS_ERRORTAG_H_ */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 3f34bafe18dd..6f7ed9288fe4 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -815,11 +815,136 @@ out_error:
/* Is this extent valid? */
static inline bool
xfs_refc_valid(
- struct xfs_refcount_irec *rc)
+ const struct xfs_refcount_irec *rc)
{
return rc->rc_startblock != NULLAGBLOCK;
}
+static inline xfs_nlink_t
+xfs_refc_merge_refcount(
+ const struct xfs_refcount_irec *irec,
+ enum xfs_refc_adjust_op adjust)
+{
+ /* Once a record hits MAXREFCOUNT, it is pinned there forever */
+ if (irec->rc_refcount == MAXREFCOUNT)
+ return MAXREFCOUNT;
+ return irec->rc_refcount + adjust;
+}
+
+static inline bool
+xfs_refc_want_merge_center(
+ const struct xfs_refcount_irec *left,
+ const struct xfs_refcount_irec *cleft,
+ const struct xfs_refcount_irec *cright,
+ const struct xfs_refcount_irec *right,
+ bool cleft_is_cright,
+ enum xfs_refc_adjust_op adjust,
+ unsigned long long *ulenp)
+{
+ unsigned long long ulen = left->rc_blockcount;
+ xfs_nlink_t new_refcount;
+
+ /*
+ * To merge with a center record, both shoulder records must be
+ * adjacent to the record we want to adjust. This is only true if
+ * find_left and find_right made all four records valid.
+ */
+ if (!xfs_refc_valid(left) || !xfs_refc_valid(right) ||
+ !xfs_refc_valid(cleft) || !xfs_refc_valid(cright))
+ return false;
+
+ /* There must only be one record for the entire range. */
+ if (!cleft_is_cright)
+ return false;
+
+ /* The shoulder record refcounts must match the new refcount. */
+ new_refcount = xfs_refc_merge_refcount(cleft, adjust);
+ if (left->rc_refcount != new_refcount)
+ return false;
+ if (right->rc_refcount != new_refcount)
+ return false;
+
+ /*
+ * The new record cannot exceed the max length. ulen is a ULL as the
+ * individual record block counts can be up to (u32 - 1) in length
+ * hence we need to catch u32 addition overflows here.
+ */
+ ulen += cleft->rc_blockcount + right->rc_blockcount;
+ if (ulen >= MAXREFCEXTLEN)
+ return false;
+
+ *ulenp = ulen;
+ return true;
+}
+
+static inline bool
+xfs_refc_want_merge_left(
+ const struct xfs_refcount_irec *left,
+ const struct xfs_refcount_irec *cleft,
+ enum xfs_refc_adjust_op adjust)
+{
+ unsigned long long ulen = left->rc_blockcount;
+ xfs_nlink_t new_refcount;
+
+ /*
+ * For a left merge, the left shoulder record must be adjacent to the
+ * start of the range. If this is true, find_left made left and cleft
+ * contain valid contents.
+ */
+ if (!xfs_refc_valid(left) || !xfs_refc_valid(cleft))
+ return false;
+
+ /* Left shoulder record refcount must match the new refcount. */
+ new_refcount = xfs_refc_merge_refcount(cleft, adjust);
+ if (left->rc_refcount != new_refcount)
+ return false;
+
+ /*
+ * The new record cannot exceed the max length. ulen is a ULL as the
+ * individual record block counts can be up to (u32 - 1) in length
+ * hence we need to catch u32 addition overflows here.
+ */
+ ulen += cleft->rc_blockcount;
+ if (ulen >= MAXREFCEXTLEN)
+ return false;
+
+ return true;
+}
+
+static inline bool
+xfs_refc_want_merge_right(
+ const struct xfs_refcount_irec *cright,
+ const struct xfs_refcount_irec *right,
+ enum xfs_refc_adjust_op adjust)
+{
+ unsigned long long ulen = right->rc_blockcount;
+ xfs_nlink_t new_refcount;
+
+ /*
+ * For a right merge, the right shoulder record must be adjacent to the
+ * end of the range. If this is true, find_right made cright and right
+ * contain valid contents.
+ */
+ if (!xfs_refc_valid(right) || !xfs_refc_valid(cright))
+ return false;
+
+ /* Right shoulder record refcount must match the new refcount. */
+ new_refcount = xfs_refc_merge_refcount(cright, adjust);
+ if (right->rc_refcount != new_refcount)
+ return false;
+
+ /*
+ * The new record cannot exceed the max length. ulen is a ULL as the
+ * individual record block counts can be up to (u32 - 1) in length
+ * hence we need to catch u32 addition overflows here.
+ */
+ ulen += cright->rc_blockcount;
+ if (ulen >= MAXREFCEXTLEN)
+ return false;
+
+ return true;
+}
+
/*
* Try to merge with any extents on the boundaries of the adjustment range.
*/
@@ -861,23 +986,15 @@ xfs_refcount_merge_extents(
(cleft.rc_blockcount == cright.rc_blockcount);
/* Try to merge left, cleft, and right. cleft must == cright. */
- ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount +
- right.rc_blockcount;
- if (xfs_refc_valid(&left) && xfs_refc_valid(&right) &&
- xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal &&
- left.rc_refcount == cleft.rc_refcount + adjust &&
- right.rc_refcount == cleft.rc_refcount + adjust &&
- ulen < MAXREFCEXTLEN) {
+ if (xfs_refc_want_merge_center(&left, &cleft, &cright, &right, cequal,
+ adjust, &ulen)) {
*shape_changed = true;
return xfs_refcount_merge_center_extents(cur, &left, &cleft,
&right, ulen, aglen);
}
/* Try to merge left and cleft. */
- ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount;
- if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) &&
- left.rc_refcount == cleft.rc_refcount + adjust &&
- ulen < MAXREFCEXTLEN) {
+ if (xfs_refc_want_merge_left(&left, &cleft, adjust)) {
*shape_changed = true;
error = xfs_refcount_merge_left_extent(cur, &left, &cleft,
agbno, aglen);
@@ -893,10 +1010,7 @@ xfs_refcount_merge_extents(
}
/* Try to merge cright and right. */
- ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount;
- if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) &&
- right.rc_refcount == cright.rc_refcount + adjust &&
- ulen < MAXREFCEXTLEN) {
+ if (xfs_refc_want_merge_right(&cright, &right, adjust)) {
*shape_changed = true;
return xfs_refcount_merge_right_extent(cur, &right, &cright,
aglen);
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index a20cade590e9..1eeecf2eb2a7 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -972,7 +972,9 @@ xfs_log_sb(
*/
if (xfs_has_lazysbcount(mp)) {
mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
- mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
+ mp->m_sb.sb_ifree = min_t(uint64_t,
+ percpu_counter_sum(&mp->m_ifree),
+ mp->m_sb.sb_icount);
mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
}
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index b7b838bd4ba4..4dd52b15f09c 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -609,9 +609,16 @@ out:
/* AGFL */
struct xchk_agfl_info {
- unsigned int sz_entries;
+ /* Number of AGFL entries that the AGF claims are in use. */
+ unsigned int agflcount;
+
+ /* Number of AGFL entries that we found. */
unsigned int nr_entries;
+
+ /* Buffer to hold AGFL entries for extent checking. */
xfs_agblock_t *entries;
+
+ struct xfs_buf *agfl_bp;
struct xfs_scrub *sc;
};
@@ -641,10 +648,10 @@ xchk_agfl_block(
struct xfs_scrub *sc = sai->sc;
if (xfs_verify_agbno(sc->sa.pag, agbno) &&
- sai->nr_entries < sai->sz_entries)
+ sai->nr_entries < sai->agflcount)
sai->entries[sai->nr_entries++] = agbno;
else
- xchk_block_set_corrupt(sc, sc->sa.agfl_bp);
+ xchk_block_set_corrupt(sc, sai->agfl_bp);
xchk_agfl_block_xref(sc, agbno);
@@ -696,19 +703,26 @@ int
xchk_agfl(
struct xfs_scrub *sc)
{
- struct xchk_agfl_info sai;
+ struct xchk_agfl_info sai = {
+ .sc = sc,
+ };
struct xfs_agf *agf;
xfs_agnumber_t agno = sc->sm->sm_agno;
- unsigned int agflcount;
unsigned int i;
int error;
+ /* Lock the AGF and AGI so that nobody can touch this AG. */
error = xchk_ag_read_headers(sc, agno, &sc->sa);
if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error))
- goto out;
+ return error;
if (!sc->sa.agf_bp)
return -EFSCORRUPTED;
- xchk_buffer_recheck(sc, sc->sa.agfl_bp);
+
+ /* Try to read the AGFL, and verify its structure if we get it. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &sai.agfl_bp);
+ if (!xchk_process_error(sc, agno, XFS_AGFL_BLOCK(sc->mp), &error))
+ return error;
+ xchk_buffer_recheck(sc, sai.agfl_bp);
xchk_agfl_xref(sc);
@@ -717,24 +731,21 @@ xchk_agfl(
/* Allocate buffer to ensure uniqueness of AGFL entries. */
agf = sc->sa.agf_bp->b_addr;
- agflcount = be32_to_cpu(agf->agf_flcount);
- if (agflcount > xfs_agfl_size(sc->mp)) {
+ sai.agflcount = be32_to_cpu(agf->agf_flcount);
+ if (sai.agflcount > xfs_agfl_size(sc->mp)) {
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
goto out;
}
- memset(&sai, 0, sizeof(sai));
- sai.sc = sc;
- sai.sz_entries = agflcount;
- sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount,
- KM_MAYFAIL);
+ sai.entries = kvcalloc(sai.agflcount, sizeof(xfs_agblock_t),
+ XCHK_GFP_FLAGS);
if (!sai.entries) {
error = -ENOMEM;
goto out;
}
/* Check the blocks in the AGFL. */
- error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr,
- sc->sa.agfl_bp, xchk_agfl_block, &sai);
+ error = xfs_agfl_walk(sc->mp, sc->sa.agf_bp->b_addr, sai.agfl_bp,
+ xchk_agfl_block, &sai);
if (error == -ECANCELED) {
error = 0;
goto out_free;
@@ -742,7 +753,7 @@ xchk_agfl(
if (error)
goto out_free;
- if (agflcount != sai.nr_entries) {
+ if (sai.agflcount != sai.nr_entries) {
xchk_block_set_corrupt(sc, sc->sa.agf_bp);
goto out_free;
}
@@ -758,7 +769,7 @@ xchk_agfl(
}
out_free:
- kmem_free(sai.entries);
+ kvfree(sai.entries);
out:
return error;
}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 1b0b4e243f77..d75d82151eeb 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -442,12 +442,18 @@ out_revert:
/* AGFL */
struct xrep_agfl {
+ /* Bitmap of alleged AGFL blocks that we're not going to add. */
+ struct xbitmap crossed;
+
/* Bitmap of other OWN_AG metadata blocks. */
struct xbitmap agmetablocks;
/* Bitmap of free space. */
struct xbitmap *freesp;
+ /* rmapbt cursor for finding crosslinked blocks */
+ struct xfs_btree_cur *rmap_cur;
+
struct xfs_scrub *sc;
};
@@ -477,6 +483,41 @@ xrep_agfl_walk_rmap(
return xbitmap_set_btcur_path(&ra->agmetablocks, cur);
}
+/* Strike out the blocks that are cross-linked according to the rmapbt. */
+STATIC int
+xrep_agfl_check_extent(
+ struct xrep_agfl *ra,
+ uint64_t start,
+ uint64_t len)
+{
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(ra->sc->mp, start);
+ xfs_agblock_t last_agbno = agbno + len - 1;
+ int error;
+
+ ASSERT(XFS_FSB_TO_AGNO(ra->sc->mp, start) == ra->sc->sa.pag->pag_agno);
+
+ while (agbno <= last_agbno) {
+ bool other_owners;
+
+ error = xfs_rmap_has_other_keys(ra->rmap_cur, agbno, 1,
+ &XFS_RMAP_OINFO_AG, &other_owners);
+ if (error)
+ return error;
+
+ if (other_owners) {
+ error = xbitmap_set(&ra->crossed, agbno, 1);
+ if (error)
+ return error;
+ }
+
+ if (xchk_should_terminate(ra->sc, &error))
+ return error;
+ agbno++;
+ }
+
+ return 0;
+}
+
/*
* Map out all the non-AGFL OWN_AG space in this AG so that we can deduce
* which blocks belong to the AGFL.
@@ -496,44 +537,58 @@ xrep_agfl_collect_blocks(
struct xrep_agfl ra;
struct xfs_mount *mp = sc->mp;
struct xfs_btree_cur *cur;
+ struct xbitmap_range *br, *n;
int error;
ra.sc = sc;
ra.freesp = agfl_extents;
xbitmap_init(&ra.agmetablocks);
+ xbitmap_init(&ra.crossed);
/* Find all space used by the free space btrees & rmapbt. */
cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
error = xfs_rmap_query_all(cur, xrep_agfl_walk_rmap, &ra);
- if (error)
- goto err;
xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out_bmp;
/* Find all blocks currently being used by the bnobt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
sc->sa.pag, XFS_BTNUM_BNO);
error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
- if (error)
- goto err;
xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out_bmp;
/* Find all blocks currently being used by the cntbt. */
cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
sc->sa.pag, XFS_BTNUM_CNT);
error = xbitmap_set_btblocks(&ra.agmetablocks, cur);
- if (error)
- goto err;
-
xfs_btree_del_cursor(cur, error);
+ if (error)
+ goto out_bmp;
/*
* Drop the freesp meta blocks that are in use by btrees.
* The remaining blocks /should/ be AGFL blocks.
*/
error = xbitmap_disunion(agfl_extents, &ra.agmetablocks);
- xbitmap_destroy(&ra.agmetablocks);
if (error)
- return error;
+ goto out_bmp;
+
+ /* Strike out the blocks that are cross-linked. */
+ ra.rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
+ for_each_xbitmap_extent(br, n, agfl_extents) {
+ error = xrep_agfl_check_extent(&ra, br->start, br->len);
+ if (error)
+ break;
+ }
+ xfs_btree_del_cursor(ra.rmap_cur, error);
+ if (error)
+ goto out_bmp;
+ error = xbitmap_disunion(agfl_extents, &ra.crossed);
+ if (error)
+ goto out_bmp;
/*
* Calculate the new AGFL size. If we found more blocks than fit in
@@ -541,11 +596,10 @@ xrep_agfl_collect_blocks(
*/
*flcount = min_t(uint64_t, xbitmap_hweight(agfl_extents),
xfs_agfl_size(mp));
- return 0;
-err:
+out_bmp:
+ xbitmap_destroy(&ra.crossed);
xbitmap_destroy(&ra.agmetablocks);
- xfs_btree_del_cursor(cur, error);
return error;
}
@@ -631,7 +685,7 @@ xrep_agfl_init_header(
if (br->len)
break;
list_del(&br->list);
- kmem_free(br);
+ kfree(br);
}
/* Write new AGFL to disk. */
@@ -697,7 +751,6 @@ xrep_agfl(
* freespace overflow to the freespace btrees.
*/
sc->sa.agf_bp = agf_bp;
- sc->sa.agfl_bp = agfl_bp;
error = xrep_roll_ag_trans(sc);
if (error)
goto err;
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index b6f0c9f3f124..31529b9bf389 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -49,7 +49,7 @@ xchk_setup_xattr_buf(
if (ab) {
if (sz <= ab->sz)
return 0;
- kmem_free(ab);
+ kvfree(ab);
sc->buf = NULL;
}
@@ -79,7 +79,8 @@ xchk_setup_xattr(
* without the inode lock held, which means we can sleep.
*/
if (sc->flags & XCHK_TRY_HARDER) {
- error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL);
+ error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX,
+ XCHK_GFP_FLAGS);
if (error)
return error;
}
@@ -138,8 +139,7 @@ xchk_xattr_listent(
* doesn't work, we overload the seen_enough variable to convey
* the error message back to the main scrub function.
*/
- error = xchk_setup_xattr_buf(sx->sc, valuelen,
- GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ error = xchk_setup_xattr_buf(sx->sc, valuelen, XCHK_GFP_FLAGS);
if (error == -ENOMEM)
error = -EDEADLOCK;
if (error) {
@@ -324,8 +324,7 @@ xchk_xattr_block(
return 0;
/* Allocate memory for block usage checking. */
- error = xchk_setup_xattr_buf(ds->sc, 0,
- GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ error = xchk_setup_xattr_buf(ds->sc, 0, XCHK_GFP_FLAGS);
if (error == -ENOMEM)
return -EDEADLOCK;
if (error)
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index b89bf9de9b1c..a255f09e9f0a 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -10,6 +10,7 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
+#include "scrub/scrub.h"
#include "scrub/bitmap.h"
/*
@@ -25,7 +26,7 @@ xbitmap_set(
{
struct xbitmap_range *bmr;
- bmr = kmem_alloc(sizeof(struct xbitmap_range), KM_MAYFAIL);
+ bmr = kmalloc(sizeof(struct xbitmap_range), XCHK_GFP_FLAGS);
if (!bmr)
return -ENOMEM;
@@ -47,7 +48,7 @@ xbitmap_destroy(
for_each_xbitmap_extent(bmr, n, bitmap) {
list_del(&bmr->list);
- kmem_free(bmr);
+ kfree(bmr);
}
}
@@ -174,15 +175,15 @@ xbitmap_disunion(
/* Total overlap, just delete ex. */
lp = lp->next;
list_del(&br->list);
- kmem_free(br);
+ kfree(br);
break;
case 0:
/*
* Deleting from the middle: add the new right extent
* and then shrink the left extent.
*/
- new_br = kmem_alloc(sizeof(struct xbitmap_range),
- KM_MAYFAIL);
+ new_br = kmalloc(sizeof(struct xbitmap_range),
+ XCHK_GFP_FLAGS);
if (!new_br) {
error = -ENOMEM;
goto out;
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index f0b9cb6506fd..d50d0eab196a 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -90,6 +90,7 @@ out:
struct xchk_bmap_info {
struct xfs_scrub *sc;
+ struct xfs_iext_cursor icur;
xfs_fileoff_t lastoff;
bool is_rt;
bool is_shared;
@@ -146,6 +147,48 @@ xchk_bmap_get_rmap(
return has_rmap;
}
+static inline bool
+xchk_bmap_has_prev(
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
+
+ if (!xfs_iext_peek_prev_extent(ifp, &info->icur, &got))
+ return false;
+ if (got.br_startoff + got.br_blockcount != irec->br_startoff)
+ return false;
+ if (got.br_startblock + got.br_blockcount != irec->br_startblock)
+ return false;
+ if (got.br_state != irec->br_state)
+ return false;
+ return true;
+}
+
+static inline bool
+xchk_bmap_has_next(
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_bmbt_irec got;
+ struct xfs_ifork *ifp;
+
+ ifp = xfs_ifork_ptr(info->sc->ip, info->whichfork);
+
+ if (!xfs_iext_peek_next_extent(ifp, &info->icur, &got))
+ return false;
+ if (irec->br_startoff + irec->br_blockcount != got.br_startoff)
+ return false;
+ if (irec->br_startblock + irec->br_blockcount != got.br_startblock)
+ return false;
+ if (got.br_state != irec->br_state)
+ return false;
+ return true;
+}
+
/* Make sure that we have rmapbt records for this extent. */
STATIC void
xchk_bmap_xref_rmap(
@@ -214,6 +257,34 @@ xchk_bmap_xref_rmap(
if (rmap.rm_flags & XFS_RMAP_BMBT_BLOCK)
xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
+
+ /*
+ * If the rmap starts before this bmbt record, make sure there's a bmbt
+ * record for the previous offset that is contiguous with this mapping.
+ * Skip this for CoW fork extents because the refcount btree (and not
+ * the inode) is the ondisk owner for those extents.
+ */
+ if (info->whichfork != XFS_COW_FORK && rmap.rm_startblock < agbno &&
+ !xchk_bmap_has_prev(info, irec)) {
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ return;
+ }
+
+ /*
+ * If the rmap ends after this bmbt record, make sure there's a bmbt
+ * record for the next offset that is contiguous with this mapping.
+ * Skip this for CoW fork extents because the refcount btree (and not
+ * the inode) is the ondisk owner for those extents.
+ */
+ rmap_end = (unsigned long long)rmap.rm_startblock + rmap.rm_blockcount;
+ if (info->whichfork != XFS_COW_FORK &&
+ rmap_end > agbno + irec->br_blockcount &&
+ !xchk_bmap_has_next(info, irec)) {
+ xchk_fblock_xref_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+ return;
+ }
}
/* Cross-reference a single rtdev extent record. */
@@ -264,6 +335,8 @@ xchk_bmap_iextent_xref(
case XFS_COW_FORK:
xchk_xref_is_cow_staging(info->sc, agbno,
irec->br_blockcount);
+ xchk_xref_is_not_shared(info->sc, agbno,
+ irec->br_blockcount);
break;
}
@@ -297,14 +370,13 @@ xchk_bmap_dirattr_extent(
}
/* Scrub a single extent record. */
-STATIC int
+STATIC void
xchk_bmap_iextent(
struct xfs_inode *ip,
struct xchk_bmap_info *info,
struct xfs_bmbt_irec *irec)
{
struct xfs_mount *mp = info->sc->mp;
- int error = 0;
/*
* Check for out-of-order extents. This record could have come
@@ -325,14 +397,6 @@ xchk_bmap_iextent(
xchk_fblock_set_corrupt(info->sc, info->whichfork,
irec->br_startoff);
- /*
- * Check for delalloc extents. We never iterate the ones in the
- * in-core extent scan, and we should never see these in the bmbt.
- */
- if (isnullstartblock(irec->br_startblock))
- xchk_fblock_set_corrupt(info->sc, info->whichfork,
- irec->br_startoff);
-
/* Make sure the extent points to a valid place. */
if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
xchk_fblock_set_corrupt(info->sc, info->whichfork,
@@ -353,15 +417,12 @@ xchk_bmap_iextent(
irec->br_startoff);
if (info->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- return 0;
+ return;
if (info->is_rt)
xchk_bmap_rt_iextent_xref(ip, info, irec);
else
xchk_bmap_iextent_xref(ip, info, irec);
-
- info->lastoff = irec->br_startoff + irec->br_blockcount;
- return error;
}
/* Scrub a bmbt record. */
@@ -599,14 +660,41 @@ xchk_bmap_check_rmaps(
for_each_perag(sc->mp, agno, pag) {
error = xchk_bmap_check_ag_rmaps(sc, whichfork, pag);
- if (error)
- break;
- if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
- break;
+ if (error ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+ xfs_perag_put(pag);
+ return error;
+ }
}
- if (pag)
- xfs_perag_put(pag);
- return error;
+
+ return 0;
+}
+
+/* Scrub a delalloc reservation from the incore extent map tree. */
+STATIC void
+xchk_bmap_iextent_delalloc(
+ struct xfs_inode *ip,
+ struct xchk_bmap_info *info,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_mount *mp = info->sc->mp;
+
+ /*
+ * Check for out-of-order extents. This record could have come
+ * from the incore list, for which there is no ordering check.
+ */
+ if (irec->br_startoff < info->lastoff)
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount))
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
+
+ /* Make sure the extent points to a valid place. */
+ if (irec->br_blockcount > XFS_MAX_BMBT_EXTLEN)
+ xchk_fblock_set_corrupt(info->sc, info->whichfork,
+ irec->br_startoff);
}
/*
@@ -626,7 +714,6 @@ xchk_bmap(
struct xfs_inode *ip = sc->ip;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_fileoff_t endoff;
- struct xfs_iext_cursor icur;
int error = 0;
/* Non-existent forks can be ignored. */
@@ -661,6 +748,8 @@ xchk_bmap(
case XFS_DINODE_FMT_DEV:
case XFS_DINODE_FMT_LOCAL:
/* No mappings to check. */
+ if (whichfork == XFS_COW_FORK)
+ xchk_fblock_set_corrupt(sc, whichfork, 0);
goto out;
case XFS_DINODE_FMT_EXTENTS:
break;
@@ -690,20 +779,22 @@ xchk_bmap(
/* Scrub extent records. */
info.lastoff = 0;
ifp = xfs_ifork_ptr(ip, whichfork);
- for_each_xfs_iext(ifp, &icur, &irec) {
+ for_each_xfs_iext(ifp, &info.icur, &irec) {
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
goto out;
- if (isnullstartblock(irec.br_startblock))
- continue;
+
if (irec.br_startoff >= endoff) {
xchk_fblock_set_corrupt(sc, whichfork,
irec.br_startoff);
goto out;
}
- error = xchk_bmap_iextent(ip, &info, &irec);
- if (error)
- goto out;
+
+ if (isnullstartblock(irec.br_startblock))
+ xchk_bmap_iextent_delalloc(ip, &info, &irec);
+ else
+ xchk_bmap_iextent(ip, &info, &irec);
+ info.lastoff = irec.br_startoff + irec.br_blockcount;
}
error = xchk_bmap_check_rmaps(sc, whichfork);
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 2f4519590dc1..0fd36d5b4646 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -408,7 +408,6 @@ xchk_btree_check_owner(
struct xfs_buf *bp)
{
struct xfs_btree_cur *cur = bs->cur;
- struct check_owner *co;
/*
* In theory, xfs_btree_get_block should only give us a null buffer
@@ -431,10 +430,13 @@ xchk_btree_check_owner(
* later scanning.
*/
if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
- co = kmem_alloc(sizeof(struct check_owner),
- KM_MAYFAIL);
+ struct check_owner *co;
+
+ co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
if (!co)
return -ENOMEM;
+
+ INIT_LIST_HEAD(&co->list);
co->level = level;
co->daddr = xfs_buf_daddr(bp);
list_add_tail(&co->list, &bs->to_check);
@@ -649,7 +651,7 @@ xchk_btree(
xchk_btree_set_corrupt(sc, cur, 0);
return 0;
}
- bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL);
+ bs = kzalloc(cur_sz, XCHK_GFP_FLAGS);
if (!bs)
return -ENOMEM;
bs->cur = cur;
@@ -740,9 +742,9 @@ out:
error = xchk_btree_check_block_owner(bs, co->level,
co->daddr);
list_del(&co->list);
- kmem_free(co);
+ kfree(co);
}
- kmem_free(bs);
+ kfree(bs);
return error;
}
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 9bbbf20f401b..613260b04a3d 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -424,10 +424,6 @@ xchk_ag_read_headers(
if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF))
return error;
- error = xfs_alloc_read_agfl(sa->pag, sc->tp, &sa->agfl_bp);
- if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL))
- return error;
-
return 0;
}
@@ -515,10 +511,6 @@ xchk_ag_free(
struct xchk_ag *sa)
{
xchk_ag_btcur_free(sa);
- if (sa->agfl_bp) {
- xfs_trans_brelse(sc->tp, sa->agfl_bp);
- sa->agfl_bp = NULL;
- }
if (sa->agf_bp) {
xfs_trans_brelse(sc->tp, sa->agf_bp);
sa->agf_bp = NULL;
@@ -789,6 +781,33 @@ xchk_buffer_recheck(
trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa);
}
+static inline int
+xchk_metadata_inode_subtype(
+ struct xfs_scrub *sc,
+ unsigned int scrub_type)
+{
+ __u32 smtype = sc->sm->sm_type;
+ int error;
+
+ sc->sm->sm_type = scrub_type;
+
+ switch (scrub_type) {
+ case XFS_SCRUB_TYPE_INODE:
+ error = xchk_inode(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTD:
+ error = xchk_bmap_data(sc);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ break;
+ }
+
+ sc->sm->sm_type = smtype;
+ return error;
+}
+
/*
* Scrub the attr/data forks of a metadata inode. The metadata inode must be
* pointed to by sc->ip and the ILOCK must be held.
@@ -797,13 +816,17 @@ int
xchk_metadata_inode_forks(
struct xfs_scrub *sc)
{
- __u32 smtype;
bool shared;
int error;
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return 0;
+ /* Check the inode record. */
+ error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
+ if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
+ return error;
+
/* Metadata inodes don't live on the rt device. */
if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
@@ -823,10 +846,7 @@ xchk_metadata_inode_forks(
}
/* Invoke the data fork scrubber. */
- smtype = sc->sm->sm_type;
- sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD;
- error = xchk_bmap_data(sc);
- sc->sm->sm_type = smtype;
+ error = xchk_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
@@ -841,7 +861,7 @@ xchk_metadata_inode_forks(
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
}
- return error;
+ return 0;
}
/*
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 454145db10e7..b73648d81d23 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -25,7 +25,7 @@ xchk_should_terminate(
if (fatal_signal_pending(current)) {
if (*error == 0)
- *error = -EAGAIN;
+ *error = -EINTR;
return true;
}
return false;
diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c
index 84fe3d33d699..d17cee177085 100644
--- a/fs/xfs/scrub/dabtree.c
+++ b/fs/xfs/scrub/dabtree.c
@@ -486,7 +486,7 @@ xchk_da_btree(
return 0;
/* Set up initial da state. */
- ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL);
+ ds = kzalloc(sizeof(struct xchk_da_btree), XCHK_GFP_FLAGS);
if (!ds)
return -ENOMEM;
ds->dargs.dp = sc->ip;
@@ -591,6 +591,6 @@ out:
out_state:
xfs_da_state_free(ds->state);
- kmem_free(ds);
+ kfree(ds);
return error;
}
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 5c87800ab223..d1b0f23c2c59 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -666,7 +666,12 @@ xchk_directory_blocks(
struct xfs_scrub *sc)
{
struct xfs_bmbt_irec got;
- struct xfs_da_args args;
+ struct xfs_da_args args = {
+ .dp = sc ->ip,
+ .whichfork = XFS_DATA_FORK,
+ .geo = sc->mp->m_dir_geo,
+ .trans = sc->tp,
+ };
struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
struct xfs_mount *mp = sc->mp;
xfs_fileoff_t leaf_lblk;
@@ -689,9 +694,6 @@ xchk_directory_blocks(
free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET);
/* Is this a block dir? */
- args.dp = sc->ip;
- args.geo = mp->m_dir_geo;
- args.trans = sc->tp;
error = xfs_dir2_isblock(&args, &is_block);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
goto out;
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 6a6f8fe7f87c..4777e7b89fdc 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -14,6 +14,8 @@
#include "xfs_health.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
+#include "xfs_rtalloc.h"
+#include "xfs_inode.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -43,6 +45,16 @@
* our tolerance for mismatch between expected and actual counter values.
*/
+struct xchk_fscounters {
+ struct xfs_scrub *sc;
+ uint64_t icount;
+ uint64_t ifree;
+ uint64_t fdblocks;
+ uint64_t frextents;
+ unsigned long long icount_min;
+ unsigned long long icount_max;
+};
+
/*
* Since the expected value computation is lockless but only browses incore
* values, the percpu counters should be fairly close to each other. However,
@@ -116,10 +128,11 @@ xchk_setup_fscounters(
struct xchk_fscounters *fsc;
int error;
- sc->buf = kmem_zalloc(sizeof(struct xchk_fscounters), 0);
+ sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
if (!sc->buf)
return -ENOMEM;
fsc = sc->buf;
+ fsc->sc = sc;
xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max);
@@ -138,6 +151,18 @@ xchk_setup_fscounters(
return xchk_trans_alloc(sc, 0);
}
+/*
+ * Part 1: Collecting filesystem summary counts. For each AG, we add its
+ * summary counts (total inodes, free inodes, free data blocks) to an incore
+ * copy of the overall filesystem summary counts.
+ *
+ * To avoid false corruption reports in part 2, any failure in this part must
+ * set the INCOMPLETE flag even when a negative errno is returned. This care
+ * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
+ * ECANCELED) that are absorbed into a scrub state flag update by
+ * xchk_*_process_error.
+ */
+
/* Count free space btree blocks manually for pre-lazysbcount filesystems. */
static int
xchk_fscount_btreeblks(
@@ -225,8 +250,10 @@ retry:
}
if (pag)
xfs_perag_put(pag);
- if (error)
+ if (error) {
+ xchk_set_incomplete(sc);
return error;
+ }
/*
* The global incore space reservation is taken from the incore
@@ -267,6 +294,64 @@ retry:
return 0;
}
+#ifdef CONFIG_XFS_RT
+STATIC int
+xchk_fscount_add_frextent(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ const struct xfs_rtalloc_rec *rec,
+ void *priv)
+{
+ struct xchk_fscounters *fsc = priv;
+ int error = 0;
+
+ fsc->frextents += rec->ar_extcount;
+
+ xchk_should_terminate(fsc->sc, &error);
+ return error;
+}
+
+/* Calculate the number of free realtime extents from the realtime bitmap. */
+STATIC int
+xchk_fscount_count_frextents(
+ struct xfs_scrub *sc,
+ struct xchk_fscounters *fsc)
+{
+ struct xfs_mount *mp = sc->mp;
+ int error;
+
+ fsc->frextents = 0;
+ if (!xfs_has_realtime(mp))
+ return 0;
+
+ xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ error = xfs_rtalloc_query_all(sc->mp, sc->tp,
+ xchk_fscount_add_frextent, fsc);
+ if (error) {
+ xchk_set_incomplete(sc);
+ goto out_unlock;
+ }
+
+out_unlock:
+ xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
+ return error;
+}
+#else
+STATIC int
+xchk_fscount_count_frextents(
+ struct xfs_scrub *sc,
+ struct xchk_fscounters *fsc)
+{
+ fsc->frextents = 0;
+ return 0;
+}
+#endif /* CONFIG_XFS_RT */
+
+/*
+ * Part 2: Comparing filesystem summary counters. All we have to do here is
+ * sum the percpu counters and compare them to what we've observed.
+ */
+
/*
* Is the @counter reasonably close to the @expected value?
*
@@ -333,16 +418,17 @@ xchk_fscounters(
{
struct xfs_mount *mp = sc->mp;
struct xchk_fscounters *fsc = sc->buf;
- int64_t icount, ifree, fdblocks;
+ int64_t icount, ifree, fdblocks, frextents;
int error;
/* Snapshot the percpu counters. */
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ frextents = percpu_counter_sum(&mp->m_frextents);
/* No negative values, please! */
- if (icount < 0 || ifree < 0 || fdblocks < 0)
+ if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0)
xchk_set_corrupt(sc);
/* See if icount is obviously wrong. */
@@ -353,6 +439,10 @@ xchk_fscounters(
if (fdblocks > mp->m_sb.sb_dblocks)
xchk_set_corrupt(sc);
+ /* See if frextents is obviously wrong. */
+ if (frextents > mp->m_sb.sb_rextents)
+ xchk_set_corrupt(sc);
+
/*
* If ifree exceeds icount by more than the minimum variance then
* something's probably wrong with the counters.
@@ -367,6 +457,13 @@ xchk_fscounters(
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
return 0;
+ /* Count the free extents counter for rt volumes. */
+ error = xchk_fscount_count_frextents(sc, fsc);
+ if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
+ return error;
+ if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+ return 0;
+
/* Compare the in-core counters with whatever we counted. */
if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
xchk_set_corrupt(sc);
@@ -378,5 +475,9 @@ xchk_fscounters(
fsc->fdblocks))
xchk_set_corrupt(sc);
+ if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
+ fsc->frextents))
+ xchk_set_corrupt(sc);
+
return 0;
}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 51820b40ab1c..7a2f38e5202c 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -365,7 +365,7 @@ xchk_dinode(
* pagecache can't cache all the blocks in this file due to
* overly large offsets, flag the inode for admin review.
*/
- if (isize >= mp->m_super->s_maxbytes)
+ if (isize > mp->m_super->s_maxbytes)
xchk_ino_set_warning(sc, ino);
/* di_nblocks */
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 21b4c9006859..9eeac8565394 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -14,6 +14,7 @@
#include "xfs_inode.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
+#include "xfs_bmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -84,7 +85,7 @@ xchk_quota_item(
int error = 0;
if (xchk_should_terminate(sc, &error))
- return -ECANCELED;
+ return error;
/*
* Except for the root dquot, the actual dquot we got must either have
@@ -189,11 +190,12 @@ xchk_quota_data_fork(
for_each_xfs_iext(ifp, &icur, &irec) {
if (xchk_should_terminate(sc, &error))
break;
+
/*
- * delalloc extents or blocks mapped above the highest
+ * delalloc/unwritten extents or blocks mapped above the highest
* quota id shouldn't happen.
*/
- if (isnullstartblock(irec.br_startblock) ||
+ if (!xfs_bmap_is_written_extent(&irec) ||
irec.br_startoff > max_dqid_off ||
irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index a26ee0f24ef2..d9c1b3cea4a5 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -127,8 +127,8 @@ xchk_refcountbt_rmap_check(
* is healthy each rmap_irec we see will be in agbno order
* so we don't need insertion sort here.
*/
- frag = kmem_alloc(sizeof(struct xchk_refcnt_frag),
- KM_MAYFAIL);
+ frag = kmalloc(sizeof(struct xchk_refcnt_frag),
+ XCHK_GFP_FLAGS);
if (!frag)
return -ENOMEM;
memcpy(&frag->rm, rec, sizeof(frag->rm));
@@ -215,7 +215,7 @@ xchk_refcountbt_process_rmap_fragments(
continue;
}
list_del(&frag->list);
- kmem_free(frag);
+ kfree(frag);
nr++;
}
@@ -257,11 +257,11 @@ done:
/* Delete fragments and work list. */
list_for_each_entry_safe(frag, n, &worklist, list) {
list_del(&frag->list);
- kmem_free(frag);
+ kfree(frag);
}
list_for_each_entry_safe(frag, n, &refchk->fragments, list) {
list_del(&frag->list);
- kmem_free(frag);
+ kfree(frag);
}
}
@@ -306,7 +306,7 @@ xchk_refcountbt_xref_rmap(
out_free:
list_for_each_entry_safe(frag, n, &refchk.fragments, list) {
list_del(&frag->list);
- kmem_free(frag);
+ kfree(frag);
}
}
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index c18bd039fce9..4b92f9253ccd 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -61,7 +61,6 @@ xrep_attempt(
sc->flags |= XREP_ALREADY_FIXED;
return -EAGAIN;
case -EDEADLOCK:
- case -EAGAIN:
/* Tell the caller to try again having grabbed all the locks. */
if (!(sc->flags & XCHK_TRY_HARDER)) {
sc->flags |= XCHK_TRY_HARDER;
@@ -70,10 +69,15 @@ xrep_attempt(
/*
* We tried harder but still couldn't grab all the resources
* we needed to fix it. The corruption has not been fixed,
- * so report back to userspace.
+ * so exit to userspace with the scan's output flags unchanged.
*/
- return -EFSCORRUPTED;
+ return 0;
default:
+ /*
+ * EAGAIN tells the caller to re-scrub, so we cannot return
+ * that here.
+ */
+ ASSERT(error != -EAGAIN);
return error;
}
}
@@ -121,32 +125,40 @@ xrep_roll_ag_trans(
{
int error;
- /* Keep the AG header buffers locked so we can keep going. */
- if (sc->sa.agi_bp)
+ /*
+ * Keep the AG header buffers locked while we roll the transaction.
+ * Ensure that both AG buffers are dirty and held when we roll the
+ * transaction so that they move forward in the log without losing the
+ * bli (and hence the bli type) when the transaction commits.
+ *
+ * Normal code would never hold clean buffers across a roll, but repair
+ * needs both buffers to maintain a total lock on the AG.
+ */
+ if (sc->sa.agi_bp) {
+ xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
- if (sc->sa.agf_bp)
+ }
+
+ if (sc->sa.agf_bp) {
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
- if (sc->sa.agfl_bp)
- xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
+ }
/*
- * Roll the transaction. We still own the buffer and the buffer lock
- * regardless of whether or not the roll succeeds. If the roll fails,
- * the buffers will be released during teardown on our way out of the
- * kernel. If it succeeds, we join them to the new transaction and
- * move on.
+ * Roll the transaction. We still hold the AG header buffers locked
+ * regardless of whether or not that succeeds. On failure, the buffers
+ * will be released during teardown on our way out of the kernel. If
+ * successful, join the buffers to the new transaction and move on.
*/
error = xfs_trans_roll(&sc->tp);
if (error)
return error;
- /* Join AG headers to the new transaction. */
+ /* Join the AG headers to the new transaction. */
if (sc->sa.agi_bp)
xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
if (sc->sa.agf_bp)
xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
- if (sc->sa.agfl_bp)
- xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
return 0;
}
@@ -498,6 +510,7 @@ xrep_put_freelist(
struct xfs_scrub *sc,
xfs_agblock_t agbno)
{
+ struct xfs_buf *agfl_bp;
int error;
/* Make sure there's space on the freelist. */
@@ -516,8 +529,12 @@ xrep_put_freelist(
return error;
/* Put the block on the AGFL. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ return error;
+
error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
- sc->sa.agfl_bp, agbno, 0);
+ agfl_bp, agbno, 0);
if (error)
return error;
xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 2e8e400f10a9..07a7a75f987f 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -174,7 +174,7 @@ xchk_teardown(
if (sc->flags & XCHK_REAPING_DISABLED)
xchk_start_reaping(sc);
if (sc->buf) {
- kmem_free(sc->buf);
+ kvfree(sc->buf);
sc->buf = NULL;
}
return error;
@@ -467,7 +467,7 @@ xfs_scrub_metadata(
xfs_warn_mount(mp, XFS_OPSTATE_WARNED_SCRUB,
"EXPERIMENTAL online scrub feature in use. Use at your own risk!");
- sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL);
+ sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS);
if (!sc) {
error = -ENOMEM;
goto out;
@@ -557,7 +557,7 @@ out_nofix:
out_teardown:
error = xchk_teardown(sc, error);
out_sc:
- kmem_free(sc);
+ kfree(sc);
out:
trace_xchk_done(XFS_I(file_inode(file)), sm, error);
if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 3de5287e98d8..b4d391b4c938 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -8,6 +8,15 @@
struct xfs_scrub;
+/*
+ * Standard flags for allocating memory within scrub. NOFS context is
+ * configured by the process allocation scope. Scrub and repair must be able
+ * to back out gracefully if there isn't enough memory. Force-cast to avoid
+ * complaints from static checkers.
+ */
+#define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \
+ __GFP_RETRY_MAYFAIL))
+
/* Type info and names for the scrub types. */
enum xchk_type {
ST_NONE = 1, /* disabled */
@@ -39,7 +48,6 @@ struct xchk_ag {
/* AG btree roots */
struct xfs_buf *agf_bp;
- struct xfs_buf *agfl_bp;
struct xfs_buf *agi_bp;
/* AG btrees */
@@ -161,12 +169,4 @@ void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
# define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
#endif
-struct xchk_fscounters {
- uint64_t icount;
- uint64_t ifree;
- uint64_t fdblocks;
- unsigned long long icount_min;
- unsigned long long icount_max;
-};
-
#endif /* __XFS_SCRUB_SCRUB_H__ */
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 75311f8daeeb..c1c99ffe7408 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -21,7 +21,7 @@ xchk_setup_symlink(
struct xfs_scrub *sc)
{
/* Allocate the buffer without the inode lock held. */
- sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, GFP_KERNEL);
+ sc->buf = kvzalloc(XFS_SYMLINK_MAXLEN + 1, XCHK_GFP_FLAGS);
if (!sc->buf)
return -ENOMEM;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 5d1a995b15f8..41734202796f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -17,6 +17,8 @@
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
@@ -114,9 +116,8 @@ xfs_end_ioend(
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
- xfs_bmap_punch_delalloc_range(ip,
- XFS_B_TO_FSBT(mp, offset),
- XFS_B_TO_FSB(mp, size));
+ xfs_bmap_punch_delalloc_range(ip, offset,
+ offset + size);
}
goto done;
}
@@ -218,11 +219,17 @@ xfs_imap_valid(
* checked (and found nothing at this offset) could have added
* overlapping blocks.
*/
- if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
+ if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq)) {
+ trace_xfs_wb_data_iomap_invalid(ip, &wpc->iomap,
+ XFS_WPC(wpc)->data_seq, XFS_DATA_FORK);
return false;
+ }
if (xfs_inode_has_cow_data(ip) &&
- XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+ XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq)) {
+ trace_xfs_wb_cow_iomap_invalid(ip, &wpc->iomap,
+ XFS_WPC(wpc)->cow_seq, XFS_COW_FORK);
return false;
+ }
return true;
}
@@ -286,6 +293,8 @@ xfs_map_blocks(
if (xfs_is_shutdown(mp))
return -EIO;
+ XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
+
/*
* COW fork blocks can overlap data fork blocks even if the blocks
* aren't shared. COW I/O always takes precedent, so we must always
@@ -373,7 +382,7 @@ retry:
isnullstartblock(imap.br_startblock))
goto allocate_blocks;
- xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
@@ -455,12 +464,8 @@ xfs_discard_folio(
struct folio *folio,
loff_t pos)
{
- struct inode *inode = folio->mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(folio->mapping->host);
struct xfs_mount *mp = ip->i_mount;
- size_t offset = offset_in_folio(folio, pos);
- xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos);
- xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset);
int error;
if (xfs_is_shutdown(mp))
@@ -470,8 +475,9 @@ xfs_discard_folio(
"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
folio, ip->i_ino, pos);
- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- i_blocks_per_folio(inode, folio) - pageoff_fsb);
+ error = xfs_bmap_punch_delalloc_range(ip, pos,
+ round_up(pos, folio_size(folio)));
+
if (error && !xfs_is_shutdown(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 04d0c2bff67c..867645b74d88 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -590,11 +590,13 @@ out_unlock_iolock:
int
xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
- xfs_fileoff_t start_fsb,
- xfs_fileoff_t length)
+ xfs_off_t start_byte,
+ xfs_off_t end_byte)
{
+ struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = &ip->i_df;
- xfs_fileoff_t end_fsb = start_fsb + length;
+ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte);
struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
int error = 0;
@@ -607,7 +609,7 @@ xfs_bmap_punch_delalloc_range(
while (got.br_startoff + got.br_blockcount > start_fsb) {
del = got;
- xfs_trim_extent(&del, start_fsb, length);
+ xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb);
/*
* A delete can push the cursor forward. Step back to the
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 24b37d211f1d..6888078f5c31 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -31,7 +31,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
#endif /* CONFIG_XFS_RT */
int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
- xfs_fileoff_t start_fsb, xfs_fileoff_t length);
+ xfs_off_t start_byte, xfs_off_t end_byte);
struct kgetbmap {
__s64 bmv_offset; /* file offset of segment in blocks */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index dde346450952..54c774af6e1c 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1945,6 +1945,7 @@ xfs_free_buftarg(
list_lru_destroy(&btp->bt_lru);
blkdev_issue_flush(btp->bt_bdev);
+ invalidate_bdev(btp->bt_bdev);
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
kmem_free(btp);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 522d450a94b1..df7322ed73fa 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1018,6 +1018,8 @@ xfs_buf_item_relse(
trace_xfs_buf_item_relse(bp, _RET_IP_);
ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
+ if (atomic_read(&bip->bli_refcount))
+ return;
bp->b_log_item = NULL;
xfs_buf_rele(bp);
xfs_buf_item_free(bip);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 822e6a0e9d1a..ae082808cfed 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -46,7 +46,7 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_REFCOUNT_FINISH_ONE,
XFS_RANDOM_BMAP_FINISH_ONE,
XFS_RANDOM_AG_RESV_CRITICAL,
- XFS_RANDOM_DROP_WRITES,
+ 0, /* XFS_RANDOM_DROP_WRITES has been removed */
XFS_RANDOM_LOG_BAD_CRC,
XFS_RANDOM_LOG_ITEM_PIN,
XFS_RANDOM_BUF_LRU_REF,
@@ -60,6 +60,8 @@ static unsigned int xfs_errortag_random_default[] = {
XFS_RANDOM_LARP,
XFS_RANDOM_DA_LEAF_SPLIT,
XFS_RANDOM_ATTR_LEAF_TO_NODE,
+ XFS_RANDOM_WB_DELAY_MS,
+ XFS_RANDOM_WRITE_DELAY_MS,
};
struct xfs_errortag_attr {
@@ -162,7 +164,6 @@ XFS_ERRORTAG_ATTR_RW(refcount_continue_update, XFS_ERRTAG_REFCOUNT_CONTINUE_UPDA
XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE);
XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL);
-XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES);
XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
@@ -176,6 +177,8 @@ XFS_ERRORTAG_ATTR_RW(ag_resv_fail, XFS_ERRTAG_AG_RESV_FAIL);
XFS_ERRORTAG_ATTR_RW(larp, XFS_ERRTAG_LARP);
XFS_ERRORTAG_ATTR_RW(da_leaf_split, XFS_ERRTAG_DA_LEAF_SPLIT);
XFS_ERRORTAG_ATTR_RW(attr_leaf_to_node, XFS_ERRTAG_ATTR_LEAF_TO_NODE);
+XFS_ERRORTAG_ATTR_RW(wb_delay_ms, XFS_ERRTAG_WB_DELAY_MS);
+XFS_ERRORTAG_ATTR_RW(write_delay_ms, XFS_ERRTAG_WRITE_DELAY_MS);
static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(noerror),
@@ -206,7 +209,6 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(refcount_finish_one),
XFS_ERRORTAG_ATTR_LIST(bmap_finish_one),
XFS_ERRORTAG_ATTR_LIST(ag_resv_critical),
- XFS_ERRORTAG_ATTR_LIST(drop_writes),
XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
XFS_ERRORTAG_ATTR_LIST(log_item_pin),
XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
@@ -220,6 +222,8 @@ static struct attribute *xfs_errortag_attrs[] = {
XFS_ERRORTAG_ATTR_LIST(larp),
XFS_ERRORTAG_ATTR_LIST(da_leaf_split),
XFS_ERRORTAG_ATTR_LIST(attr_leaf_to_node),
+ XFS_ERRORTAG_ATTR_LIST(wb_delay_ms),
+ XFS_ERRORTAG_ATTR_LIST(write_delay_ms),
NULL,
};
ATTRIBUTE_GROUPS(xfs_errortag);
@@ -256,6 +260,32 @@ xfs_errortag_del(
kmem_free(mp->m_errortag);
}
+static bool
+xfs_errortag_valid(
+ unsigned int error_tag)
+{
+ if (error_tag >= XFS_ERRTAG_MAX)
+ return false;
+
+ /* Error out removed injection types */
+ if (error_tag == XFS_ERRTAG_DROP_WRITES)
+ return false;
+ return true;
+}
+
+bool
+xfs_errortag_enabled(
+ struct xfs_mount *mp,
+ unsigned int tag)
+{
+ if (!mp->m_errortag)
+ return false;
+ if (!xfs_errortag_valid(tag))
+ return false;
+
+ return mp->m_errortag[tag] != 0;
+}
+
bool
xfs_errortag_test(
struct xfs_mount *mp,
@@ -277,7 +307,9 @@ xfs_errortag_test(
if (!mp->m_errortag)
return false;
- ASSERT(error_tag < XFS_ERRTAG_MAX);
+ if (!xfs_errortag_valid(error_tag))
+ return false;
+
randfactor = mp->m_errortag[error_tag];
if (!randfactor || get_random_u32_below(randfactor))
return false;
@@ -293,7 +325,7 @@ xfs_errortag_get(
struct xfs_mount *mp,
unsigned int error_tag)
{
- if (error_tag >= XFS_ERRTAG_MAX)
+ if (!xfs_errortag_valid(error_tag))
return -EINVAL;
return mp->m_errortag[error_tag];
@@ -305,7 +337,7 @@ xfs_errortag_set(
unsigned int error_tag,
unsigned int tag_value)
{
- if (error_tag >= XFS_ERRTAG_MAX)
+ if (!xfs_errortag_valid(error_tag))
return -EINVAL;
mp->m_errortag[error_tag] = tag_value;
@@ -319,7 +351,7 @@ xfs_errortag_add(
{
BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX);
- if (error_tag >= XFS_ERRTAG_MAX)
+ if (!xfs_errortag_valid(error_tag))
return -EINVAL;
return xfs_errortag_set(mp, error_tag,
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 5191e9145e55..dbe6c37dc697 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -45,6 +45,18 @@ extern bool xfs_errortag_test(struct xfs_mount *mp, const char *expression,
const char *file, int line, unsigned int error_tag);
#define XFS_TEST_ERROR(expr, mp, tag) \
((expr) || xfs_errortag_test((mp), #expr, __FILE__, __LINE__, (tag)))
+bool xfs_errortag_enabled(struct xfs_mount *mp, unsigned int tag);
+#define XFS_ERRORTAG_DELAY(mp, tag) \
+ do { \
+ might_sleep(); \
+ if (!xfs_errortag_enabled((mp), (tag))) \
+ break; \
+ xfs_warn_ratelimited((mp), \
+"Injecting %ums delay at file %s, line %d, on filesystem \"%s\"", \
+ (mp)->m_errortag[(tag)], __FILE__, __LINE__, \
+ (mp)->m_super->s_id); \
+ mdelay((mp)->m_errortag[(tag)]); \
+ } while (0)
extern int xfs_errortag_get(struct xfs_mount *mp, unsigned int error_tag);
extern int xfs_errortag_set(struct xfs_mount *mp, unsigned int error_tag,
@@ -55,6 +67,7 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp);
#define xfs_errortag_init(mp) (0)
#define xfs_errortag_del(mp)
#define XFS_TEST_ERROR(expr, mp, tag) (expr)
+#define XFS_ERRORTAG_DELAY(mp, tag) ((void)0)
#define xfs_errortag_set(mp, tag, val) (ENOSYS)
#define xfs_errortag_add(mp, tag) (ENOSYS)
#define xfs_errortag_clearall(mp) (ENOSYS)
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index ad22a003f959..f3d328e4a440 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -236,6 +236,7 @@ xfs_extent_busy_update_extent(
*
*/
busyp->bno = fend;
+ busyp->length = bend - fend;
} else if (bbno < fbno) {
/*
* Case 8:
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e462d39c840e..595a5bcf46b9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1325,7 +1325,7 @@ __xfs_filemap_fault(
if (write_fault) {
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
ret = iomap_page_mkwrite(vmf,
- &xfs_buffered_write_iomap_ops);
+ &xfs_page_mkwrite_iomap_ops);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
} else {
ret = filemap_fault(vmf);
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index d8337274c74d..88a88506ffff 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -524,7 +524,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
struct xfs_mount *mp = tp->t_mountp;
int error;
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
/*
* Set up query parameters to return free rtextents covering the range
@@ -551,7 +551,7 @@ xfs_getfsmap_rtdev_rtbitmap_query(
if (error)
goto err;
err:
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
return error;
}
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index eae7427062cf..ddeaccc04aec 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -342,6 +342,9 @@ xfs_iget_recycle(
trace_xfs_iget_recycle(ip);
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+ return -EAGAIN;
+
/*
* We need to make it look like the inode is being reclaimed to prevent
* the actual reclaim workers from stomping over us while we recycle
@@ -355,6 +358,7 @@ xfs_iget_recycle(
ASSERT(!rwsem_is_locked(&inode->i_rwsem));
error = xfs_reinit_inode(mp, inode);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error) {
/*
* Re-initializing the inode failed, and we are in deep
@@ -518,6 +522,8 @@ xfs_iget_cache_hit(
if (ip->i_flags & XFS_IRECLAIMABLE) {
/* Drops i_flags_lock and RCU read lock. */
error = xfs_iget_recycle(pag, ip);
+ if (error == -EAGAIN)
+ goto out_skip;
if (error)
return error;
} else {
@@ -1847,12 +1853,20 @@ xfs_inodegc_worker(
struct xfs_inodegc, work);
struct llist_node *node = llist_del_all(&gc->list);
struct xfs_inode *ip, *n;
+ unsigned int nofs_flag;
WRITE_ONCE(gc->items, 0);
if (!node)
return;
+ /*
+ * We can allocate memory here while doing writeback on behalf of
+ * memory reclaim. To avoid memory allocation deadlocks set the
+ * task-wide nofs context for the following operations.
+ */
+ nofs_flag = memalloc_nofs_save();
+
ip = llist_entry(node, struct xfs_inode, i_gclist);
trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
@@ -1861,6 +1875,8 @@ xfs_inodegc_worker(
xfs_iflags_set(ip, XFS_INACTIVATING);
xfs_inodegc_inactivate(ip);
}
+
+ memalloc_nofs_restore(nofs_flag);
}
/*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index aa303be11576..d354ea2b74f9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2479,7 +2479,7 @@ xfs_remove(
error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
tp->t_mountp->m_sb.sb_rootino, 0);
if (error)
- return error;
+ goto out_trans_cancel;
}
} else {
/*
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 13f1b2add390..736510bc241b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -754,7 +754,7 @@ xfs_bulkstat_fmt(
static int
xfs_bulk_ireq_setup(
struct xfs_mount *mp,
- struct xfs_bulk_ireq *hdr,
+ const struct xfs_bulk_ireq *hdr,
struct xfs_ibulk *breq,
void __user *ubuffer)
{
@@ -780,7 +780,7 @@ xfs_bulk_ireq_setup(
switch (hdr->ino) {
case XFS_BULK_IREQ_SPECIAL_ROOT:
- hdr->ino = mp->m_sb.sb_rootino;
+ breq->startino = mp->m_sb.sb_rootino;
break;
default:
return -EINVAL;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index d9401d0300ad..fc1946f80a4a 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -48,13 +48,53 @@ xfs_alert_fsblock_zero(
return -EFSCORRUPTED;
}
+u64
+xfs_iomap_inode_sequence(
+ struct xfs_inode *ip,
+ u16 iomap_flags)
+{
+ u64 cookie = 0;
+
+ if (iomap_flags & IOMAP_F_XATTR)
+ return READ_ONCE(ip->i_af.if_seq);
+ if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp)
+ cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32;
+ return cookie | READ_ONCE(ip->i_df.if_seq);
+}
+
+/*
+ * Check that the iomap passed to us is still valid for the given offset and
+ * length.
+ */
+static bool
+xfs_iomap_valid(
+ struct inode *inode,
+ const struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ if (iomap->validity_cookie !=
+ xfs_iomap_inode_sequence(ip, iomap->flags)) {
+ trace_xfs_iomap_invalid(ip, iomap);
+ return false;
+ }
+
+ XFS_ERRORTAG_DELAY(ip->i_mount, XFS_ERRTAG_WRITE_DELAY_MS);
+ return true;
+}
+
+static const struct iomap_page_ops xfs_iomap_page_ops = {
+ .iomap_valid = xfs_iomap_valid,
+};
+
int
xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
struct xfs_bmbt_irec *imap,
unsigned int mapping_flags,
- u16 iomap_flags)
+ u16 iomap_flags,
+ u64 sequence_cookie)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_buftarg *target = xfs_inode_buftarg(ip);
@@ -91,6 +131,9 @@ xfs_bmbt_to_iomap(
if (xfs_ipincount(ip) &&
(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
iomap->flags |= IOMAP_F_DIRTY;
+
+ iomap->validity_cookie = sequence_cookie;
+ iomap->page_ops = &xfs_iomap_page_ops;
return 0;
}
@@ -195,7 +238,8 @@ xfs_iomap_write_direct(
xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb,
unsigned int flags,
- struct xfs_bmbt_irec *imap)
+ struct xfs_bmbt_irec *imap,
+ u64 *seq)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
@@ -285,6 +329,7 @@ xfs_iomap_write_direct(
error = xfs_alert_fsblock_zero(ip, imap);
out_unlock:
+ *seq = xfs_iomap_inode_sequence(ip, 0);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
@@ -743,6 +788,7 @@ xfs_direct_write_iomap_begin(
bool shared = false;
u16 iomap_flags = 0;
unsigned int lockmode = XFS_ILOCK_SHARED;
+ u64 seq;
ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
@@ -811,9 +857,10 @@ xfs_direct_write_iomap_begin(
goto out_unlock;
}
+ seq = xfs_iomap_inode_sequence(ip, iomap_flags);
xfs_iunlock(ip, lockmode);
trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq);
allocate_blocks:
error = -EAGAIN;
@@ -839,24 +886,26 @@ allocate_blocks:
xfs_iunlock(ip, lockmode);
error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
- flags, &imap);
+ flags, &imap, &seq);
if (error)
return error;
trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
- iomap_flags | IOMAP_F_NEW);
+ iomap_flags | IOMAP_F_NEW, seq);
out_found_cow:
- xfs_iunlock(ip, lockmode);
length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
if (imap.br_startblock != HOLESTARTBLOCK) {
- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
+ seq = xfs_iomap_inode_sequence(ip, 0);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
if (error)
- return error;
+ goto out_unlock;
}
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+ xfs_iunlock(ip, lockmode);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
out_unlock:
if (lockmode)
@@ -915,6 +964,7 @@ xfs_buffered_write_iomap_begin(
int allocfork = XFS_DATA_FORK;
int error = 0;
unsigned int lockmode = XFS_ILOCK_EXCL;
+ u64 seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -926,6 +976,10 @@ xfs_buffered_write_iomap_begin(
ASSERT(!XFS_IS_REALTIME_INODE(ip));
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
@@ -1029,10 +1083,6 @@ xfs_buffered_write_iomap_begin(
allocfork = XFS_COW_FORK;
}
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- goto out_unlock;
-
if (eof && offset + count > XFS_ISIZE(ip)) {
/*
* Determine the initial size of the preallocation.
@@ -1094,26 +1144,31 @@ retry:
* Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
* them out if the write happens to fail.
*/
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
found_imap:
+ seq = xfs_iomap_inode_sequence(ip, 0);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
found_cow:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ seq = xfs_iomap_inode_sequence(ip, 0);
if (imap.br_startoff <= offset_fsb) {
- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
if (error)
- return error;
+ goto out_unlock;
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
- IOMAP_F_SHARED);
+ IOMAP_F_SHARED, seq);
}
xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1121,6 +1176,16 @@ out_unlock:
}
static int
+xfs_buffered_write_delalloc_punch(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length)
+{
+ return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset,
+ offset + length);
+}
+
+static int
xfs_buffered_write_iomap_end(
struct inode *inode,
loff_t offset,
@@ -1129,56 +1194,17 @@ xfs_buffered_write_iomap_end(
unsigned flags,
struct iomap *iomap)
{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_fileoff_t start_fsb;
- xfs_fileoff_t end_fsb;
- int error = 0;
-
- if (iomap->type != IOMAP_DELALLOC)
- return 0;
-
- /*
- * Behave as if the write failed if drop writes is enabled. Set the NEW
- * flag to force delalloc cleanup.
- */
- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) {
- iomap->flags |= IOMAP_F_NEW;
- written = 0;
- }
- /*
- * start_fsb refers to the first unused block after a short write. If
- * nothing was written, round offset down to point at the first block in
- * the range.
- */
- if (unlikely(!written))
- start_fsb = XFS_B_TO_FSBT(mp, offset);
- else
- start_fsb = XFS_B_TO_FSB(mp, offset + written);
- end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ struct xfs_mount *mp = XFS_M(inode->i_sb);
+ int error;
- /*
- * Trim delalloc blocks if they were allocated by this write and we
- * didn't manage to write the whole range.
- *
- * We don't need to care about racing delalloc as we hold i_mutex
- * across the reserve/allocate/unreserve calls. If there are delalloc
- * blocks in the range, they are ours.
- */
- if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
- truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
- XFS_FSB_TO_B(mp, end_fsb) - 1);
-
- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
- end_fsb - start_fsb);
- if (error && !xfs_is_shutdown(mp)) {
- xfs_alert(mp, "%s: unable to clean up ino %lld",
- __func__, ip->i_ino);
- return error;
- }
+ error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset,
+ length, written, &xfs_buffered_write_delalloc_punch);
+ if (error && !xfs_is_shutdown(mp)) {
+ xfs_alert(mp, "%s: unable to clean up ino 0x%llx",
+ __func__, XFS_I(inode)->i_ino);
+ return error;
}
-
return 0;
}
@@ -1187,6 +1213,15 @@ const struct iomap_ops xfs_buffered_write_iomap_ops = {
.iomap_end = xfs_buffered_write_iomap_end,
};
+/*
+ * iomap_page_mkwrite() will never fail in a way that requires delalloc extents
+ * that it allocated to be revoked. Hence we do not need an .iomap_end method
+ * for this operation.
+ */
+const struct iomap_ops xfs_page_mkwrite_iomap_ops = {
+ .iomap_begin = xfs_buffered_write_iomap_begin,
+};
+
static int
xfs_read_iomap_begin(
struct inode *inode,
@@ -1204,6 +1239,7 @@ xfs_read_iomap_begin(
int nimaps = 1, error = 0;
bool shared = false;
unsigned int lockmode = XFS_ILOCK_SHARED;
+ u64 seq;
ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
@@ -1217,13 +1253,14 @@ xfs_read_iomap_begin(
&nimaps, 0);
if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode)))
error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
+ seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0);
xfs_iunlock(ip, lockmode);
if (error)
return error;
trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
- shared ? IOMAP_F_SHARED : 0);
+ shared ? IOMAP_F_SHARED : 0, seq);
}
const struct iomap_ops xfs_read_iomap_ops = {
@@ -1248,6 +1285,7 @@ xfs_seek_iomap_begin(
struct xfs_bmbt_irec imap, cmap;
int error = 0;
unsigned lockmode;
+ u64 seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -1282,8 +1320,9 @@ xfs_seek_iomap_begin(
if (data_fsb < cow_fsb + cmap.br_blockcount)
end_fsb = min(end_fsb, data_fsb);
xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
- IOMAP_F_SHARED);
+ IOMAP_F_SHARED, seq);
/*
* This is a COW extent, so we must probe the page cache
* because there could be dirty page cache being backed
@@ -1304,8 +1343,9 @@ xfs_seek_iomap_begin(
imap.br_startblock = HOLESTARTBLOCK;
imap.br_state = XFS_EXT_NORM;
done:
+ seq = xfs_iomap_inode_sequence(ip, 0);
xfs_trim_extent(&imap, offset_fsb, end_fsb);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
@@ -1331,6 +1371,7 @@ xfs_xattr_iomap_begin(
struct xfs_bmbt_irec imap;
int nimaps = 1, error = 0;
unsigned lockmode;
+ int seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -1347,12 +1388,14 @@ xfs_xattr_iomap_begin(
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
&nimaps, XFS_BMAPI_ATTRFORK);
out_unlock:
+
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR);
xfs_iunlock(ip, lockmode);
if (error)
return error;
ASSERT(nimaps);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index c782e8c0479c..4da13440bae9 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -13,14 +13,15 @@ struct xfs_bmbt_irec;
int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb, unsigned int flags,
- struct xfs_bmbt_irec *imap);
+ struct xfs_bmbt_irec *imap, u64 *sequence);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
xfs_fileoff_t end_fsb);
+u64 xfs_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags);
int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
struct xfs_bmbt_irec *imap, unsigned int mapping_flags,
- u16 iomap_flags);
+ u16 iomap_flags, u64 sequence_cookie);
int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
bool *did_zero);
@@ -47,6 +48,7 @@ xfs_aligned_fsb_count(
}
extern const struct iomap_ops xfs_buffered_write_iomap_ops;
+extern const struct iomap_ops xfs_page_mkwrite_iomap_ops;
extern const struct iomap_ops xfs_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f02a0dd522b3..fc61cc024023 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -644,12 +644,14 @@ xfs_log_mount(
int min_logfsbs;
if (!xfs_has_norecovery(mp)) {
- xfs_notice(mp, "Mounting V%d Filesystem",
- XFS_SB_VERSION_NUM(&mp->m_sb));
+ xfs_notice(mp, "Mounting V%d Filesystem %pU",
+ XFS_SB_VERSION_NUM(&mp->m_sb),
+ &mp->m_sb.sb_uuid);
} else {
xfs_notice(mp,
-"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
- XFS_SB_VERSION_NUM(&mp->m_sb));
+"Mounting V%d filesystem %pU in no-recovery mode. Filesystem will be inconsistent.",
+ XFS_SB_VERSION_NUM(&mp->m_sb),
+ &mp->m_sb.sb_uuid);
ASSERT(xfs_is_readonly(mp));
}
@@ -887,6 +889,23 @@ xlog_force_iclog(
}
/*
+ * Cycle all the iclogbuf locks to make sure all log IO completion
+ * is done before we tear down these buffers.
+ */
+static void
+xlog_wait_iclog_completion(struct xlog *log)
+{
+ int i;
+ struct xlog_in_core *iclog = log->l_iclog;
+
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ down(&iclog->ic_sema);
+ up(&iclog->ic_sema);
+ iclog = iclog->ic_next;
+ }
+}
+
+/*
* Wait for the iclog and all prior iclogs to be written disk as required by the
* log force state machine. Waiting on ic_force_wait ensures iclog completions
* have been ordered and callbacks run before we are woken here, hence
@@ -1111,6 +1130,14 @@ xfs_log_unmount(
{
xfs_log_clean(mp);
+ /*
+ * If shutdown has come from iclog IO context, the log
+ * cleaning will have been skipped and so we need to wait
+ * for the iclog to complete shutdown processing before we
+ * tear anything down.
+ */
+ xlog_wait_iclog_completion(mp->m_log);
+
xfs_buftarg_drain(mp->m_ddev_targp);
xfs_trans_ail_destroy(mp);
@@ -2114,17 +2141,6 @@ xlog_dealloc_log(
int i;
/*
- * Cycle all the iclogbuf locks to make sure all log IO completion
- * is done before we tear down these buffers.
- */
- iclog = log->l_iclog;
- for (i = 0; i < log->l_iclog_bufs; i++) {
- down(&iclog->ic_sema);
- up(&iclog->ic_sema);
- iclog = iclog->ic_next;
- }
-
- /*
* Destroy the CIL after waiting for iclog IO completion because an
* iclog EIO error will try to shut down the log, which accesses the
* CIL to wake up the waiters.
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e8bb3c2e847e..fb87ffb48f7f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -538,6 +538,20 @@ xfs_check_summary_counts(
return 0;
}
+static void
+xfs_unmount_check(
+ struct xfs_mount *mp)
+{
+ if (xfs_is_shutdown(mp))
+ return;
+
+ if (percpu_counter_sum(&mp->m_ifree) >
+ percpu_counter_sum(&mp->m_icount)) {
+ xfs_alert(mp, "ifree/icount mismatch at unmount");
+ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
+ }
+}
+
/*
* Flush and reclaim dirty inodes in preparation for unmount. Inodes and
* internal inode structures can be sitting in the CIL and AIL at this point,
@@ -1077,6 +1091,7 @@ xfs_unmountfs(
if (error)
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
+ xfs_unmount_check(mp);
xfs_log_unmount(mp);
xfs_da_unmount(mp);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 37a24f0f7cd4..38d23f0e703a 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -125,6 +125,7 @@ xfs_fs_map_blocks(
int nimaps = 1;
uint lock_flags;
int error = 0;
+ u64 seq;
if (xfs_is_shutdown(mp))
return -EIO;
@@ -176,6 +177,7 @@ xfs_fs_map_blocks(
lock_flags = xfs_ilock_data_map_shared(ip);
error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
&imap, &nimaps, bmapi_flags);
+ seq = xfs_iomap_inode_sequence(ip, 0);
ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK);
@@ -189,7 +191,7 @@ xfs_fs_map_blocks(
xfs_iunlock(ip, lock_flags);
error = xfs_iomap_write_direct(ip, offset_fsb,
- end_fsb - offset_fsb, 0, &imap);
+ end_fsb - offset_fsb, 0, &imap, &seq);
if (error)
goto out_unlock;
@@ -209,7 +211,7 @@ xfs_fs_map_blocks(
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
+ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
*device_generation = mp->m_generation;
return error;
out_unlock:
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 18bb4ec4d7c9..e2c542f6dcd4 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -68,7 +68,7 @@ restart:
while (1) {
struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH];
- int error = 0;
+ int error;
int i;
mutex_lock(&qi->qi_tree_lock);
@@ -423,6 +423,14 @@ xfs_qm_dquot_isolate(
goto out_miss_busy;
/*
+ * If something else is freeing this dquot and hasn't yet removed it
+ * from the LRU, leave it for the freeing task to complete the freeing
+ * process rather than risk it being free from under us here.
+ */
+ if (dqp->q_flags & XFS_DQFLAG_FREEING)
+ goto out_miss_unlock;
+
+ /*
* This dquot has acquired a reference in the meantime remove it from
* the freelist and try again.
*/
@@ -441,10 +449,8 @@ xfs_qm_dquot_isolate(
* skip it so there is time for the IO to complete before we try to
* reclaim it again on the next LRU pass.
*/
- if (!xfs_dqflock_nowait(dqp)) {
- xfs_dqunlock(dqp);
- goto out_miss_busy;
- }
+ if (!xfs_dqflock_nowait(dqp))
+ goto out_miss_unlock;
if (XFS_DQ_IS_DIRTY(dqp)) {
struct xfs_buf *bp = NULL;
@@ -478,6 +484,8 @@ xfs_qm_dquot_isolate(
XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
return LRU_REMOVED;
+out_miss_unlock:
+ xfs_dqunlock(dqp);
out_miss_busy:
trace_xfs_dqreclaim_busy(dqp);
XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index fe46bce8cae6..5535778a98f9 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -416,8 +416,6 @@ xfs_reflink_fill_cow_hole(
goto convert;
}
- ASSERT(cmap->br_startoff > imap->br_startoff);
-
/* Allocate the entire reservation as unwritten blocks. */
nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 292d5e54a92c..16534e9873f6 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1311,10 +1311,10 @@ xfs_rtalloc_reinit_frextents(
uint64_t val = 0;
int error;
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
error = xfs_rtalloc_query_all(mp, NULL, xfs_rtalloc_count_frextent,
&val);
- xfs_iunlock(mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
if (error)
return error;
@@ -1326,6 +1326,41 @@ xfs_rtalloc_reinit_frextents(
}
/*
+ * Read in the bmbt of an rt metadata inode so that we never have to load them
+ * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use
+ * an empty transaction to avoid deadlocking on loops in the bmbt.
+ */
+static inline int
+xfs_rtmount_iread_extents(
+ struct xfs_inode *ip,
+ unsigned int lock_class)
+{
+ struct xfs_trans *tp;
+ int error;
+
+ error = xfs_trans_alloc_empty(ip->i_mount, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL | lock_class);
+
+ error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ if (xfs_inode_has_attr_fork(ip)) {
+ error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK);
+ if (error)
+ goto out_unlock;
+ }
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL | lock_class);
+ xfs_trans_cancel(tp);
+ return error;
+}
+
+/*
* Get the bitmap and summary inodes and the summary cache into the mount
* structure at mount time.
*/
@@ -1342,14 +1377,27 @@ xfs_rtmount_inodes(
return error;
ASSERT(mp->m_rbmip != NULL);
+ error = xfs_rtmount_iread_extents(mp->m_rbmip, XFS_ILOCK_RTBITMAP);
+ if (error)
+ goto out_rele_bitmap;
+
error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
- if (error) {
- xfs_irele(mp->m_rbmip);
- return error;
- }
+ if (error)
+ goto out_rele_bitmap;
ASSERT(mp->m_rsumip != NULL);
+
+ error = xfs_rtmount_iread_extents(mp->m_rsumip, XFS_ILOCK_RTSUM);
+ if (error)
+ goto out_rele_summary;
+
xfs_alloc_rsum_cache(mp, sbp->sb_rbmblocks);
return 0;
+
+out_rele_summary:
+ xfs_irele(mp->m_rsumip);
+out_rele_bitmap:
+ xfs_irele(mp->m_rbmip);
+ return error;
}
void
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ee4b429a2f2c..0c4b73e9b29d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1110,7 +1110,7 @@ xfs_fs_put_super(
if (!sb->s_fs_info)
return;
- xfs_notice(mp, "Unmounting Filesystem");
+ xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid);
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index d269ef57ff01..8a5dc1538aa8 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -34,6 +34,8 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_error.h"
+#include <linux/iomap.h>
+#include "xfs_iomap.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 372d871bccc5..421d1e504ac4 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3352,6 +3352,92 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \
TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \
TP_ARGS(ip, irec))
+/* inode iomap invalidation events */
+DECLARE_EVENT_CLASS(xfs_wb_invalid_class,
+ TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork),
+ TP_ARGS(ip, iomap, wpcseq, whichfork),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(u64, addr)
+ __field(loff_t, pos)
+ __field(u64, len)
+ __field(u16, type)
+ __field(u16, flags)
+ __field(u32, wpcseq)
+ __field(u32, forkseq)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->addr = iomap->addr;
+ __entry->pos = iomap->offset;
+ __entry->len = iomap->length;
+ __entry->type = iomap->type;
+ __entry->flags = iomap->flags;
+ __entry->wpcseq = wpcseq;
+ __entry->forkseq = READ_ONCE(xfs_ifork_ptr(ip, whichfork)->if_seq);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x wpcseq 0x%x forkseq 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pos,
+ __entry->addr,
+ __entry->len,
+ __entry->type,
+ __entry->flags,
+ __entry->wpcseq,
+ __entry->forkseq)
+);
+#define DEFINE_WB_INVALID_EVENT(name) \
+DEFINE_EVENT(xfs_wb_invalid_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap, unsigned int wpcseq, int whichfork), \
+ TP_ARGS(ip, iomap, wpcseq, whichfork))
+DEFINE_WB_INVALID_EVENT(xfs_wb_cow_iomap_invalid);
+DEFINE_WB_INVALID_EVENT(xfs_wb_data_iomap_invalid);
+
+DECLARE_EVENT_CLASS(xfs_iomap_invalid_class,
+ TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap),
+ TP_ARGS(ip, iomap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(u64, addr)
+ __field(loff_t, pos)
+ __field(u64, len)
+ __field(u64, validity_cookie)
+ __field(u64, inodeseq)
+ __field(u16, type)
+ __field(u16, flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->addr = iomap->addr;
+ __entry->pos = iomap->offset;
+ __entry->len = iomap->length;
+ __entry->validity_cookie = iomap->validity_cookie;
+ __entry->type = iomap->type;
+ __entry->flags = iomap->flags;
+ __entry->inodeseq = xfs_iomap_inode_sequence(ip, iomap->flags);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx addr 0x%llx bytecount 0x%llx type 0x%x flags 0x%x validity_cookie 0x%llx inodeseq 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->pos,
+ __entry->addr,
+ __entry->len,
+ __entry->type,
+ __entry->flags,
+ __entry->validity_cookie,
+ __entry->inodeseq)
+);
+#define DEFINE_IOMAP_INVALID_EVENT(name) \
+DEFINE_EVENT(xfs_iomap_invalid_class, name, \
+ TP_PROTO(struct xfs_inode *ip, const struct iomap *iomap), \
+ TP_ARGS(ip, iomap))
+DEFINE_IOMAP_INVALID_EVENT(xfs_iomap_invalid);
+
/* refcount/reflink tracepoint definitions */
/* reflink tracepoints */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index f51df7d94ef7..7d4109af193e 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -422,7 +422,7 @@ xfsaild_push(
struct xfs_ail_cursor cur;
struct xfs_log_item *lip;
xfs_lsn_t lsn;
- xfs_lsn_t target;
+ xfs_lsn_t target = NULLCOMMITLSN;
long tout;
int stuck = 0;
int flushing = 0;
@@ -472,6 +472,8 @@ xfsaild_push(
XFS_STATS_INC(mp, xs_push_ail);
+ ASSERT(target != NULLCOMMITLSN);
+
lsn = lip->li_lsn;
while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
int lock_result;
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index c325a28b89a8..10aa1fd39d2b 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -210,7 +210,7 @@ __xfs_xattr_put_listent(
return;
}
offset = context->buffer + context->count;
- strncpy(offset, prefix, prefix_len);
+ memcpy(offset, prefix, prefix_len);
offset += prefix_len;
strncpy(offset, (char *)name, namelen); /* real name */
offset += namelen;
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 2c53fbb8d918..a9c5c3f720ad 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -442,6 +442,10 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
data_size = zonefs_check_zone_condition(inode, zone,
false, false);
}
+ } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO &&
+ data_size > isize) {
+ /* Do not expose garbage data */
+ data_size = isize;
}
/*
@@ -805,6 +809,24 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
ret = submit_bio_wait(bio);
+ /*
+ * If the file zone was written underneath the file system, the zone
+ * write pointer may not be where we expect it to be, but the zone
+ * append write can still succeed. So check manually that we wrote where
+ * we intended to, that is, at zi->i_wpoffset.
+ */
+ if (!ret) {
+ sector_t wpsector =
+ zi->i_zsector + (zi->i_wpoffset >> SECTOR_SHIFT);
+
+ if (bio->bi_iter.bi_sector != wpsector) {
+ zonefs_warn(inode->i_sb,
+ "Corrupted write pointer %llu for zone at %llu\n",
+ wpsector, zi->i_zsector);
+ ret = -EIO;
+ }
+ }
+
zonefs_file_write_dio_end_io(iocb, size, ret, 0);
trace_zonefs_file_dio_append(inode, size, ret);